# Obesity in the US 2015 Exploration
- This is a dataset that looks at adults 18 years and older across the United States who are obese. 
- This data differentiates people by age, income, education, gender, and race/ethnicity. 
- In this notebook, I will explore and clean the dataset and create new data frames for deeper analysis
    - I will clean up the columns and filter out the year column to only look at 2015 for age, income, education, 
    gender, and race/ethnicity. 
    - I will make data frames for gender obesity rates in 2015, age obesity rates in 2015, income obesity rates in     2015, and education obesity rates in 2015


# Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as np
import sys
import matplotlib 
import seaborn as sns
import numpy as np

# Load & Inspect Data

In [2]:
obesity_in_US_df = pd.read_csv('../data/Raw Data/Nutrition_Physical_Activity_and_Obesity_Behavioral_Risk_Factor_Surveillance_System.csv')
obesity_in_US_df.shape

(53392, 33)

In [3]:
obesity_in_US_df.sample(20)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
17324,2013,2013,MO,Missouri,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming vegetab...,,Value,...,"(38.635790776000476, -92.56630005299968)",FV,FV1,Q019,VALUE,29,Race/Ethnicity,Non-Hispanic White,RACE,RACEWHT
32862,2013,2013,WA,Washington,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(47.52227862900048, -120.47001078999972)",OWS,OWS1,Q037,VALUE,53,Income,"Less than $15,000",INC,INCLESS15
23152,2011,2011,NC,North Carolina,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in muscle-strengt...,,Value,...,"(35.466220975000454, -79.15925046299964)",PA,PA1,Q046,VALUE,37,Race/Ethnicity,Non-Hispanic Black,RACE,RACEBLK
1334,2014,2014,AK,Alaska,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(64.84507995700051, -147.72205903599973)",PA,PA1,Q047,VALUE,2,Age (years),35 - 44,AGEYR,AGEYR3544
25704,2011,2011,OR,Oregon,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,,Value,...,"(44.56744942400047, -120.15503132599969)",FV,FV1,Q018,VALUE,41,Race/Ethnicity,Non-Hispanic Black,RACE,RACEBLK
8461,2013,2013,ID,Idaho,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(43.682630005000476, -114.3637300419997)",OWS,OWS1,Q036,VALUE,16,Race/Ethnicity,Hispanic,RACE,RACEHIS
20914,2014,2014,NH,New Hampshire,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(43.65595011300047, -71.50036091999965)",PA,PA1,Q047,VALUE,33,Race/Ethnicity,Asian,RACE,RACEASN
13844,2012,2012,MD,Maryland,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(39.29058096400047, -76.60926011099963)",PA,PA1,Q047,VALUE,24,Gender,Male,GEN,MALE
4800,2011,2011,DE,Delaware,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,,Value,...,"(39.008830667000495, -75.57774116799965)",FV,FV1,Q018,VALUE,10,Race/Ethnicity,Other,RACE,RACEOTH
17857,2012,2012,MT,Montana,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(47.06652897200047, -109.42442064499971)",OWS,OWS1,Q037,VALUE,30,Age (years),65 or older,AGEYR,AGEYR65PLUS


# Make a 2015 Data frame 

In [4]:
# dropping unnecessary columns
# rename Stratication1 and StratificationCategory1 and grouping age, education, income, gender, race/ethnicity into columns

obesity_in_US_cols = obesity_in_US_df.drop(['Low_Confidence_Limit','High_Confidence_Limit ','YearEnd','Topic','Class','Datasource','Data_Value_Unit','QuestionID','ClassID','TopicID','DataValueTypeID','Data_Value_Type','Data_Value_Footnote_Symbol','Data_Value_Footnote','StratificationCategoryId1','StratificationID1'],1);

#Focusing only on 2015 
#create filter 
obesity_in_US_year2015=obesity_in_US_cols[obesity_in_US_cols['YearStart']==2015]
obesity_in_US_year2015

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1
387,2015,US,National,Percent of adults aged 18 years and older who ...,28.9,28.9,398316.0,Total,,,,,,,59,Total,Total
423,2015,US,National,Percent of adults aged 18 years and older who ...,10.2,10.2,8324.0,,,,,,Asian,,59,Race/Ethnicity,Asian
613,2015,US,National,Percent of adults aged 18 years and older who ...,24.3,24.3,1691.0,,,,,,Other,,59,Race/Ethnicity,Other
33939,2015,AK,Alaska,Percent of adults who report consuming fruit l...,46.4,46.4,1545.0,,,,Male,,,"(64.84507995700051, -147.72205903599973)",2,Gender,Male
35130,2015,MO,Missouri,Percent of adults who engage in no leisure-tim...,27.2,27.2,3995.0,,,,Female,,,"(38.635790776000476, -92.56630005299968)",29,Gender,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48767,2015,PR,Puerto Rico,Percent of adults aged 18 years and older who ...,,,,,,,,,Hawaiian/Pacific Islander,"(18.220833, -66.590149)",72,Race/Ethnicity,Hawaiian/Pacific Islander
48768,2015,PR,Puerto Rico,Percent of adults aged 18 years and older who ...,,,,,,,,,Non-Hispanic Black,"(18.220833, -66.590149)",72,Race/Ethnicity,Non-Hispanic Black
48769,2015,PR,Puerto Rico,Percent of adults aged 18 years and older who ...,,,,,,,,,Non-Hispanic White,"(18.220833, -66.590149)",72,Race/Ethnicity,Non-Hispanic White
48770,2015,PR,Puerto Rico,Percent of adults aged 18 years and older who ...,,,,,,,,,Other,"(18.220833, -66.590149)",72,Race/Ethnicity,Other


In [5]:
# create filter that chooses the following questions: 
# filter uses obesity_in_US_year2014 because we are only looking at 2015
       #q1: 'Percent of adults aged 18 years and older who have obesity',
       #q2: 'Percent of adults aged 18 years and older who have an overweight classification'
questions_filter=obesity_in_US_year2015[(obesity_in_US_year2015['Question']=='Percent of adults aged 18 years and older who have obesity') | (obesity_in_US_year2015['Question']=='Percent of adults aged 18 years and older who have an overweight classification')]
questions_filter['Question'].unique()

array(['Percent of adults aged 18 years and older who have obesity',
       'Percent of adults aged 18 years and older who have an overweight classification'],
      dtype=object)

In [6]:
# Rename "questions_filter" to "obesity_in_US_df3"
obesity_in_US_df3 = questions_filter
obesity_in_US_df3

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1
387,2015,US,National,Percent of adults aged 18 years and older who ...,28.9,28.9,398316.0,Total,,,,,,,59,Total,Total
423,2015,US,National,Percent of adults aged 18 years and older who ...,10.2,10.2,8324.0,,,,,,Asian,,59,Race/Ethnicity,Asian
613,2015,US,National,Percent of adults aged 18 years and older who ...,24.3,24.3,1691.0,,,,,,Other,,59,Race/Ethnicity,Other
35289,2015,AK,Alaska,Percent of adults aged 18 years and older who ...,24.1,24.1,91.0,,,,,,Hispanic,"(64.84507995700051, -147.72205903599973)",2,Race/Ethnicity,Hispanic
37966,2015,AL,Alabama,Percent of adults aged 18 years and older who ...,41.1,41.1,836.0,,35 - 44,,,,,"(32.84057112200048, -86.63186076199969)",1,Age (years),35 - 44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48767,2015,PR,Puerto Rico,Percent of adults aged 18 years and older who ...,,,,,,,,,Hawaiian/Pacific Islander,"(18.220833, -66.590149)",72,Race/Ethnicity,Hawaiian/Pacific Islander
48768,2015,PR,Puerto Rico,Percent of adults aged 18 years and older who ...,,,,,,,,,Non-Hispanic Black,"(18.220833, -66.590149)",72,Race/Ethnicity,Non-Hispanic Black
48769,2015,PR,Puerto Rico,Percent of adults aged 18 years and older who ...,,,,,,,,,Non-Hispanic White,"(18.220833, -66.590149)",72,Race/Ethnicity,Non-Hispanic White
48770,2015,PR,Puerto Rico,Percent of adults aged 18 years and older who ...,,,,,,,,,Other,"(18.220833, -66.590149)",72,Race/Ethnicity,Other


In [7]:
# Saving dataset "obesity_in_US_df2" with year2015 and obesity/overweight questions
obesity_in_US_df3.to_csv('../data/Clean Data/Obesity_in_US_2015_cleaned.csv')

# (2015) Create New Gender, Income, Age Data Frames  

### Create Gender Data Frame (2015)

In [8]:
# Gender Dataset 2015
# create filter that only looks at gender rows 
obesity_in_US_gender_2015=obesity_in_US_df3[(obesity_in_US_df3['Stratification1']=='Male')|(obesity_in_US_df3['Stratification1']=='Female')]
obesity_in_US_gender_2015 = obesity_in_US_gender_2015.drop(['Age(years)','Education','Income','Race/Ethnicity'],1);

In [9]:
obesity_in_US_gender_2015.columns
cols_to_use = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Data_Value', 'Data_Value_Alt', 'Question', 'Sample_Size', 'Total', 'Gender', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1']
obesity_in_US_gender_2015=obesity_in_US_gender_2015[cols_to_use]

obesity_in_US_gender_2015_labels = {
    'YearStart': 'year',
    'LocationAbbr': 'state',
    'LocationDesc': 'state_name',
    'Data_Value': 'gender_pc',
    'Data_Value_Alt': 'Data_Value_Alt',
    'Question': 'Question',
    'Sample_Size': 'Sample_Size', 
    'Total' : 'Total',
    'Gender': 'Gender', 
    'GeoLocation': 'GeoLocation', 
    'LocationID': 'LocationID', 
    'StratificationCategory1': 'StratificationCategory1', 
    'Stratification1': 'Stratification1'
}
obesity_in_US_gender_2015 = obesity_in_US_gender_2015.rename(columns = obesity_in_US_gender_2015_labels)


In [10]:
obesity_in_US_gender_2015 = obesity_in_US_gender_2015.reset_index(drop = True)
obesity_in_US_gender_2015

Unnamed: 0,year,state,state_name,gender_pc,Data_Value_Alt,Question,Sample_Size,Total,Gender,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2015,ID,Idaho,29.8,29.8,Percent of adults aged 18 years and older who ...,2420.0,,Male,"(43.682630005000476, -114.3637300419997)",16,Gender,Male
1,2015,FL,Florida,42.5,42.5,Percent of adults aged 18 years and older who ...,3895.0,,Male,"(28.932040377000476, -81.92896053899966)",12,Gender,Male
2,2015,AL,Alabama,34.9,34.9,Percent of adults aged 18 years and older who ...,2958.0,,Male,"(32.84057112200048, -86.63186076199969)",1,Gender,Male
3,2015,AL,Alabama,36.4,36.4,Percent of adults aged 18 years and older who ...,4425.0,,Female,"(32.84057112200048, -86.63186076199969)",1,Gender,Female
4,2015,AL,Alabama,28.4,28.4,Percent of adults aged 18 years and older who ...,4425.0,,Female,"(32.84057112200048, -86.63186076199969)",1,Gender,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,2015,GU,Guam,40.1,40.1,Percent of adults aged 18 years and older who ...,761.0,,Male,"(13.444304, 144.793731)",66,Gender,Male
212,2015,PR,Puerto Rico,27.8,27.8,Percent of adults aged 18 years and older who ...,1914.0,,Male,"(18.220833, -66.590149)",72,Gender,Male
213,2015,PR,Puerto Rico,31.1,31.1,Percent of adults aged 18 years and older who ...,3240.0,,Female,"(18.220833, -66.590149)",72,Gender,Female
214,2015,PR,Puerto Rico,34.3,34.3,Percent of adults aged 18 years and older who ...,3240.0,,Female,"(18.220833, -66.590149)",72,Gender,Female


In [11]:
# Saving gender dataframe "obesity_in_US_gender_2015.csv"
obesity_in_US_gender_2015.to_csv('../data/Clean Data/obesity_in_US_gender_2015.csv')

### Create Income Data Frame (2015) 

In [12]:
# Income Dataset (2015)
# create filter that only looks at income rows 
obesity_in_US_income_2015=obesity_in_US_df3[obesity_in_US_df3['StratificationCategory1']=='Income']
obesity_in_US_income_2015 = obesity_in_US_income_2015.drop(['Age(years)','Gender','Education','Race/Ethnicity'],1);

In [13]:
obesity_in_US_income_2015.columns
cols_to_use = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Data_Value', 'Data_Value_Alt', 'Question', 'Sample_Size', 'Total', 'Income', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1']
obesity_in_US_income_2015=obesity_in_US_income_2015[cols_to_use]

obesity_in_US_income_2015_labels = {
    'YearStart': 'year',
    'LocationAbbr': 'state',
    'LocationDesc': 'state_name',
    'Data_Value': 'income_pc',
    'Data_Value_Alt': 'Data_Value_Alt',
    'Question': 'Question',
    'Sample_Size': 'Sample_Size', 
    'Total' : 'Total',
    'Income': 'Income', 
    'GeoLocation': 'GeoLocation', 
    'LocationID': 'LocationID', 
    'StratificationCategory1': 'StratificationCategory1', 
    'Stratification1': 'Stratification1'
}
obesity_in_US_income_2015 = obesity_in_US_income_2015.rename(columns = obesity_in_US_income_2015_labels)


In [14]:
obesity_in_US_income_2015 = obesity_in_US_income_2015.reset_index(drop = True)
obesity_in_US_income_2015

Unnamed: 0,year,state,state_name,income_pc,Data_Value_Alt,Question,Sample_Size,Total,Income,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2015,AL,Alabama,31.2,31.2,Percent of adults aged 18 years and older who ...,1298.0,,Data not reported,"(32.84057112200048, -86.63186076199969)",1,Income,Data not reported
1,2015,AL,Alabama,32.7,32.7,Percent of adults aged 18 years and older who ...,1427.0,,"$75,000 or greater","(32.84057112200048, -86.63186076199969)",1,Income,"$75,000 or greater"
2,2015,AL,Alabama,34.6,34.6,Percent of adults aged 18 years and older who ...,908.0,,"$50,000 - $74,999","(32.84057112200048, -86.63186076199969)",1,Income,"$50,000 - $74,999"
3,2015,AL,Alabama,36.5,36.5,Percent of adults aged 18 years and older who ...,843.0,,"$35,000 - $49,999","(32.84057112200048, -86.63186076199969)",1,Income,"$35,000 - $49,999"
4,2015,AL,Alabama,38.8,38.8,Percent of adults aged 18 years and older who ...,683.0,,"$25,000 - $34,999","(32.84057112200048, -86.63186076199969)",1,Income,"$25,000 - $34,999"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,2015,PR,Puerto Rico,37.7,37.7,Percent of adults aged 18 years and older who ...,1982.0,,"Less than $15,000","(18.220833, -66.590149)",72,Income,"Less than $15,000"
752,2015,PR,Puerto Rico,38.1,38.1,Percent of adults aged 18 years and older who ...,1217.0,,"$15,000 - $24,999","(18.220833, -66.590149)",72,Income,"$15,000 - $24,999"
753,2015,PR,Puerto Rico,39.5,39.5,Percent of adults aged 18 years and older who ...,186.0,,"$50,000 - $74,999","(18.220833, -66.590149)",72,Income,"$50,000 - $74,999"
754,2015,PR,Puerto Rico,40.5,40.5,Percent of adults aged 18 years and older who ...,422.0,,"$25,000 - $34,999","(18.220833, -66.590149)",72,Income,"$25,000 - $34,999"


In [15]:
# Saving income dataset 'obesity_in_US_income_2015.csv'
obesity_in_US_income_2015.to_csv('../data/Clean Data/obesity_in_US_income_2015.csv')

### Create Age Data Frame (2015)

In [16]:
# Age Dataset (2015)
# create filter that only looks at income rows 
obesity_in_US_age_2015=obesity_in_US_df3[(obesity_in_US_df3['Stratification1']=='18 - 24')|(obesity_in_US_df3['Stratification1']=='35 - 44')|(obesity_in_US_df3['Stratification1']=='45 - 54')|(obesity_in_US_df3['Stratification1']=='55 - 64')|(obesity_in_US_df3['Stratification1']=='65 or older')]
obesity_in_US_age_2015 = obesity_in_US_age_2015.drop(['Income','Gender','Education','Race/Ethnicity'],1);

In [17]:
obesity_in_US_age_2015.columns
cols_to_use = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Data_Value', 'Data_Value_Alt', 'Question', 'Sample_Size', 'Total', 'Age(years)', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1']
obesity_in_US_age_2015=obesity_in_US_age_2015[cols_to_use]

obesity_in_US_age_2015_labels = {
    'YearStart': 'year',
    'LocationAbbr': 'state',
    'LocationDesc': 'state_name',
    'Data_Value': 'age_pc',
    'Data_Value_Alt': 'Data_Value_Alt',
    'Question': 'Question',
    'Sample_Size': 'Sample_Size', 
    'Total' : 'Total',
    'Age(years)': 'Age', 
    'GeoLocation': 'GeoLocation', 
    'LocationID': 'LocationID', 
    'StratificationCategory1': 'StratificationCategory1', 
    'Stratification1': 'Stratification1'
}
obesity_in_US_age_2015 = obesity_in_US_age_2015.rename(columns = obesity_in_US_age_2015_labels)


In [18]:
obesity_in_US_age_2015 = obesity_in_US_age_2015.reset_index(drop = True)
obesity_in_US_age_2015

Unnamed: 0,year,state,state_name,age_pc,Data_Value_Alt,Question,Sample_Size,Total,Age,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2015,AL,Alabama,41.1,41.1,Percent of adults aged 18 years and older who ...,836.0,,35 - 44,"(32.84057112200048, -86.63186076199969)",1,Age (years),35 - 44
1,2015,AK,Alaska,33.1,33.1,Percent of adults aged 18 years and older who ...,459.0,,35 - 44,"(64.84507995700051, -147.72205903599973)",2,Age (years),35 - 44
2,2015,AZ,Arizona,33.4,33.4,Percent of adults aged 18 years and older who ...,773.0,,35 - 44,"(34.865970280000454, -111.76381127699972)",4,Age (years),35 - 44
3,2015,AZ,Arizona,33.4,33.4,Percent of adults aged 18 years and older who ...,1419.0,,55 - 64,"(34.865970280000454, -111.76381127699972)",4,Age (years),55 - 64
4,2015,AR,Arkansas,42.2,42.2,Percent of adults aged 18 years and older who ...,661.0,,45 - 54,"(34.74865012400045, -92.27449074299966)",5,Age (years),45 - 54
...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,2015,PR,Puerto Rico,19.9,19.9,Percent of adults aged 18 years and older who ...,397.0,,18 - 24,"(18.220833, -66.590149)",72,Age (years),18 - 24
536,2015,PR,Puerto Rico,34.1,34.1,Percent of adults aged 18 years and older who ...,656.0,,35 - 44,"(18.220833, -66.590149)",72,Age (years),35 - 44
537,2015,PR,Puerto Rico,42.0,42.0,Percent of adults aged 18 years and older who ...,997.0,,55 - 64,"(18.220833, -66.590149)",72,Age (years),55 - 64
538,2015,PR,Puerto Rico,43.0,43.0,Percent of adults aged 18 years and older who ...,1647.0,,65 or older,"(18.220833, -66.590149)",72,Age (years),65 or older


In [19]:
# Saving age dataset 'obesity_in_US_age_2015.csv'
obesity_in_US_age_2015.to_csv('../data/Clean Data/obesity_in_US_age_2015.csv')