# Obesity in the US in 2014 Exploration
- This is a dataset that looks at adults 18 years and older across the United States who are obese. 
- This data differentiates people by age, income, education, gender, and race/ethnicity. 
- In this notebook, I will explore and clean the dataset and create new data frames for deeper analysis
    - I will clean up the columns and filter out the year column to only look at 2014 for age, income, and gender.
    - I will make data frames for gender obesity rates in 2014, age obesity rates in 2014, and income obesity rates in 2014



# Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as np
import sys
import matplotlib 
import seaborn as sns
import numpy as np


%matplotlib inline

# Load & Inspect Data

In [2]:
obesity_in_US_df = pd.read_csv('../data/Raw Data/Nutrition_Physical_Activity_and_Obesity_Behavioral_Risk_Factor_Surveillance_System.csv')
obesity_in_US_df.shape

(53392, 33)

In [3]:
obesity_in_US_df.sample(20)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
24157,2013,2013,ND,North Dakota,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in muscle-strengt...,,Value,...,"(47.47531977900047, -100.11842104899966)",PA,PA1,Q046,VALUE,38,Age (years),25 - 34,AGEYR,AGEYR2534
27845,2011,2011,SC,South Carolina,Behavioral Risk Factor Surveillance System,Fruits and Vegetables,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,,Value,...,"(33.998821303000454, -81.04537120699968)",FV,FV1,Q018,VALUE,45,Race/Ethnicity,2 or more races,RACE,RACE2PLUS
49669,2016,2016,NH,New Hampshire,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(43.65595011300047, -71.50036091999965)",OWS,OWS1,Q037,VALUE,33,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA
42398,2015,2015,NH,New Hampshire,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(43.65595011300047, -71.50036091999965)",PA,PA1,Q047,VALUE,33,Income,"$25,000 - $34,999",INC,INC2535
6357,2012,2012,FL,Florida,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(28.932040377000476, -81.92896053899966)",OWS,OWS1,Q037,VALUE,12,Income,"$15,000 - $24,999",INC,INC1525
32155,2013,2013,VA,Virginia,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(37.54268067400045, -78.45789046299967)",OWS,OWS1,Q036,VALUE,51,Age (years),18 - 24,AGEYR,AGEYR1824
17284,2013,2013,MO,Missouri,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(38.635790776000476, -92.56630005299968)",OWS,OWS1,Q037,VALUE,29,Race/Ethnicity,2 or more races,RACE,RACE2PLUS
39723,2015,2015,GA,Georgia,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(32.83968109300048, -83.62758034599966)",PA,PA1,Q047,VALUE,13,Age (years),55 - 64,AGEYR,AGEYR5564
2523,2013,2013,AR,Arkansas,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who achieve at least 150 min...,,Value,...,"(34.74865012400045, -92.27449074299966)",PA,PA1,Q044,VALUE,5,Income,Data not reported,INC,INCNR
43890,2015,2015,SC,South Carolina,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who achieve at least 300 min...,,Value,...,"(33.998821303000454, -81.04537120699968)",PA,PA1,Q045,VALUE,45,Race/Ethnicity,Hawaiian/Pacific Islander,RACE,RACEHPI


# Make a 2014 data frame

In [4]:
# dropping unnecessary columns
# rename Stratication1 and StratificationCategory1 and grouping age, education, income, gender, race/ethnicity into columns

obesity_in_US_cols = obesity_in_US_df.drop(['Low_Confidence_Limit','High_Confidence_Limit ','YearEnd','Topic','Class','Datasource','Data_Value_Unit','QuestionID','ClassID','TopicID','DataValueTypeID','Data_Value_Type','Data_Value_Footnote_Symbol','Data_Value_Footnote','StratificationCategoryId1','StratificationID1'],1);

#Focusing only on 2014
#create filter 'obesity_in_US_cols' in order to not use certain columns 
obesity_in_US_year2014=obesity_in_US_cols[obesity_in_US_cols['YearStart']==2014]
obesity_in_US_year2014



Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1
299,2014,AL,Alabama,Percent of adults aged 18 years and older who ...,32.9,32.9,5156.0,,,,Female,,,"(32.84057112200048, -86.63186076199969)",1,Gender,Female
354,2014,AL,Alabama,Percent of adults aged 18 years and older who ...,35.1,35.1,1886.0,,55 - 64,,,,,"(32.84057112200048, -86.63186076199969)",1,Age (years),55 - 64
428,2014,AL,Alabama,Percent of adults who engage in no leisure-tim...,24.2,24.2,3109.0,,,,Male,,,"(32.84057112200048, -86.63186076199969)",1,Gender,Male
469,2014,US,National,Percent of adults who engage in no leisure-tim...,21.7,21.7,188835.0,,,,Male,,,,59,Gender,Male
599,2014,AL,Alabama,Percent of adults aged 18 years and older who ...,33.5,33.5,8190.0,Total,,,,,,"(32.84057112200048, -86.63186076199969)",1,Total,Total
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35168,2014,WY,Wyoming,Percent of adults who engage in no leisure-tim...,,,,,,,,,Asian,"(43.23554134300048, -108.10983035299967)",56,Race/Ethnicity,Asian
35169,2014,WY,Wyoming,Percent of adults who engage in no leisure-tim...,,,,,,,,,Hawaiian/Pacific Islander,"(43.23554134300048, -108.10983035299967)",56,Race/Ethnicity,Hawaiian/Pacific Islander
35170,2014,WY,Wyoming,Percent of adults who engage in no leisure-tim...,39.8,39.8,76.0,,,,,,American Indian/Alaska Native,"(43.23554134300048, -108.10983035299967)",56,Race/Ethnicity,American Indian/Alaska Native
35171,2014,WY,Wyoming,Percent of adults who engage in no leisure-tim...,17.1,17.1,72.0,,,,,,2 or more races,"(43.23554134300048, -108.10983035299967)",56,Race/Ethnicity,2 or more races


In [5]:
# Explore the questions in order to create a filter that only uses 
       #q1: 'Percent of adults aged 18 years and older who have obesity',
       #q2: 'Percent of adults aged 18 years and older who have an overweight classification'
obesity_in_US_year2014['Question'].unique()

array(['Percent of adults aged 18 years and older who have obesity',
       'Percent of adults aged 18 years and older who have an overweight classification',
       'Percent of adults who engage in no leisure-time physical activity'],
      dtype=object)

In [6]:
# create filter that chooses the following questions: 
# filter uses obesity_in_US_year2014 because we are only looking at 2014
       #q1: 'Percent of adults aged 18 years and older who have obesity',
       #q2: 'Percent of adults aged 18 years and older who have an overweight classification'
questions_filter=obesity_in_US_year2014[(obesity_in_US_year2014['Question']=='Percent of adults aged 18 years and older who have obesity') | (obesity_in_US_year2014['Question']=='Percent of adults aged 18 years and older who have an overweight classification')]
questions_filter['Question'].unique()

array(['Percent of adults aged 18 years and older who have obesity',
       'Percent of adults aged 18 years and older who have an overweight classification'],
      dtype=object)

In [7]:
# Rename "questions_filter" to "obesity_in_US_df2"
obesity_in_US_df2 = questions_filter
obesity_in_US_df2

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1
299,2014,AL,Alabama,Percent of adults aged 18 years and older who ...,32.9,32.9,5156.0,,,,Female,,,"(32.84057112200048, -86.63186076199969)",1,Gender,Female
354,2014,AL,Alabama,Percent of adults aged 18 years and older who ...,35.1,35.1,1886.0,,55 - 64,,,,,"(32.84057112200048, -86.63186076199969)",1,Age (years),55 - 64
599,2014,AL,Alabama,Percent of adults aged 18 years and older who ...,33.5,33.5,8190.0,Total,,,,,,"(32.84057112200048, -86.63186076199969)",1,Total,Total
600,2014,AL,Alabama,Percent of adults aged 18 years and older who ...,34.1,34.1,3034.0,,,,Male,,,"(32.84057112200048, -86.63186076199969)",1,Gender,Male
601,2014,AL,Alabama,Percent of adults aged 18 years and older who ...,34.8,34.8,995.0,,,Less than high school,,,,"(32.84057112200048, -86.63186076199969)",1,Education,Less than high school
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35141,2014,WY,Wyoming,Percent of adults aged 18 years and older who ...,,,,,,,,,Asian,"(43.23554134300048, -108.10983035299967)",56,Race/Ethnicity,Asian
35142,2014,WY,Wyoming,Percent of adults aged 18 years and older who ...,,,,,,,,,Hawaiian/Pacific Islander,"(43.23554134300048, -108.10983035299967)",56,Race/Ethnicity,Hawaiian/Pacific Islander
35143,2014,WY,Wyoming,Percent of adults aged 18 years and older who ...,30.0,30.0,72.0,,,,,,American Indian/Alaska Native,"(43.23554134300048, -108.10983035299967)",56,Race/Ethnicity,American Indian/Alaska Native
35144,2014,WY,Wyoming,Percent of adults aged 18 years and older who ...,33.2,33.2,70.0,,,,,,2 or more races,"(43.23554134300048, -108.10983035299967)",56,Race/Ethnicity,2 or more races


In [8]:
# Saving dataset "obesity_in_US_df2" with year2014 and obesity/overweight questions
obesity_in_US_df2.to_csv('../data/Clean Data/Obesity_in_US_2014cleaned.csv')

# (2014) New Gender, Income, Age Data Frames  

### Create Gender Data Frame (2014)

In [9]:
# Gender Dataset 2014
# create filter that only looks at gender rows 
#rename columns
obesity_in_US_gender_2014=obesity_in_US_df2[(obesity_in_US_df2['Stratification1']=='Male')|(obesity_in_US_df2['Stratification1']=='Female')]
obesity_in_US_gender_2014 = obesity_in_US_gender_2014.drop(['Age(years)','Education','Income','Race/Ethnicity'],1);




In [10]:
obesity_in_US_gender_2014.columns
cols_to_use = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Data_Value', 'Data_Value_Alt', 'Question', 'Sample_Size', 'Total', 'Gender', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1']
obesity_in_US_gender_2014=obesity_in_US_gender_2014[cols_to_use]

obesity_in_US_gender_2014_labels = {
    'YearStart': 'year',
    'LocationAbbr': 'state',
    'LocationDesc': 'state_name',
    'Data_Value': 'gender_pc',
    'Data_Value_Alt': 'Data_Value_Alt',
    'Question': 'Question',
    'Sample_Size': 'Sample_Size', 
    'Total' : 'Total',
    'Gender': 'Gender', 
    'GeoLocation': 'GeoLocation', 
    'LocationID': 'LocationID', 
    'StratificationCategory1': 'StratificationCategory1', 
    'Stratification1': 'Stratification1'
}
obesity_in_US_gender_2014 = obesity_in_US_gender_2014.rename(columns = obesity_in_US_gender_2014_labels)

In [11]:
obesity_in_US_gender_2014 = obesity_in_US_gender_2014.reset_index(drop = True)
obesity_in_US_gender_2014

Unnamed: 0,year,state,state_name,gender_pc,Data_Value_Alt,Question,Sample_Size,Total,Gender,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2014,AL,Alabama,32.9,32.9,Percent of adults aged 18 years and older who ...,5156.0,,Female,"(32.84057112200048, -86.63186076199969)",1,Gender,Female
1,2014,AL,Alabama,34.1,34.1,Percent of adults aged 18 years and older who ...,3034.0,,Male,"(32.84057112200048, -86.63186076199969)",1,Gender,Male
2,2014,AL,Alabama,37.2,37.2,Percent of adults aged 18 years and older who ...,3034.0,,Male,"(32.84057112200048, -86.63186076199969)",1,Gender,Male
3,2014,AL,Alabama,29.8,29.8,Percent of adults aged 18 years and older who ...,5156.0,,Female,"(32.84057112200048, -86.63186076199969)",1,Gender,Female
4,2014,AK,Alaska,29.3,29.3,Percent of adults aged 18 years and older who ...,1921.0,,Male,"(64.84507995700051, -147.72205903599973)",2,Gender,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,2014,WI,Wisconsin,29.6,29.6,Percent of adults aged 18 years and older who ...,3573.0,,Female,"(44.39319117400049, -89.81637074199966)",55,Gender,Female
212,2014,WY,Wyoming,29.7,29.7,Percent of adults aged 18 years and older who ...,2490.0,,Male,"(43.23554134300048, -108.10983035299967)",56,Gender,Male
213,2014,WY,Wyoming,29.2,29.2,Percent of adults aged 18 years and older who ...,3483.0,,Female,"(43.23554134300048, -108.10983035299967)",56,Gender,Female
214,2014,WY,Wyoming,41.7,41.7,Percent of adults aged 18 years and older who ...,2490.0,,Male,"(43.23554134300048, -108.10983035299967)",56,Gender,Male


In [12]:
# Saving gender dataframe "obesity_in_US_gender_2014.csv"
obesity_in_US_gender_2014.to_csv('../data/Clean Data/obesity_in_US_gender_2014.csv')


### Create Income Data Frame (2014) 

In [13]:
# Income Dataset
# create filter that only looks at income rows 
obesity_in_US_income_2014=obesity_in_US_df2[obesity_in_US_df2['StratificationCategory1']=='Income']
obesity_in_US_income_2014 = obesity_in_US_income_2014.drop(['Age(years)','Gender','Education','Race/Ethnicity'],1);


In [14]:
obesity_in_US_income_2014.columns
cols_to_use = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Data_Value', 'Data_Value_Alt', 'Question', 'Sample_Size', 'Total', 'Income', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1']
obesity_in_US_income_2014=obesity_in_US_income_2014[cols_to_use]

obesity_in_US_income_2014_labels = {
    'YearStart': 'year',
    'LocationAbbr': 'state',
    'LocationDesc': 'state_name',
    'Data_Value': 'income_pc',
    'Data_Value_Alt': 'Data_Value_Alt',
    'Question': 'Question',
    'Sample_Size': 'Sample_Size', 
    'Total' : 'Total',
    'Income': 'Income', 
    'GeoLocation': 'GeoLocation', 
    'LocationID': 'LocationID', 
    'StratificationCategory1': 'StratificationCategory1', 
    'Stratification1': 'Stratification1'
}
obesity_in_US_income_2014 = obesity_in_US_income_2014.rename(columns = obesity_in_US_income_2014_labels)

In [15]:
obesity_in_US_income_2014 = obesity_in_US_income_2014.reset_index(drop = True)
obesity_in_US_income_2014

Unnamed: 0,year,state,state_name,income_pc,Data_Value_Alt,Question,Sample_Size,Total,Income,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2014,AL,Alabama,34.5,34.5,Percent of adults aged 18 years and older who ...,1108.0,,"Less than $15,000","(32.84057112200048, -86.63186076199969)",1,Income,"Less than $15,000"
1,2014,AL,Alabama,40.6,40.6,Percent of adults aged 18 years and older who ...,1406.0,,"$15,000 - $24,999","(32.84057112200048, -86.63186076199969)",1,Income,"$15,000 - $24,999"
2,2014,AL,Alabama,35.4,35.4,Percent of adults aged 18 years and older who ...,776.0,,"$25,000 - $34,999","(32.84057112200048, -86.63186076199969)",1,Income,"$25,000 - $34,999"
3,2014,AL,Alabama,35.5,35.5,Percent of adults aged 18 years and older who ...,986.0,,"$35,000 - $49,999","(32.84057112200048, -86.63186076199969)",1,Income,"$35,000 - $49,999"
4,2014,AL,Alabama,32.4,32.4,Percent of adults aged 18 years and older who ...,959.0,,"$50,000 - $74,999","(32.84057112200048, -86.63186076199969)",1,Income,"$50,000 - $74,999"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,2014,WY,Wyoming,31.7,31.7,Percent of adults aged 18 years and older who ...,599.0,,"$25,000 - $34,999","(43.23554134300048, -108.10983035299967)",56,Income,"$25,000 - $34,999"
752,2014,WY,Wyoming,32.8,32.8,Percent of adults aged 18 years and older who ...,768.0,,"$35,000 - $49,999","(43.23554134300048, -108.10983035299967)",56,Income,"$35,000 - $49,999"
753,2014,WY,Wyoming,39.7,39.7,Percent of adults aged 18 years and older who ...,898.0,,"$50,000 - $74,999","(43.23554134300048, -108.10983035299967)",56,Income,"$50,000 - $74,999"
754,2014,WY,Wyoming,38.8,38.8,Percent of adults aged 18 years and older who ...,1498.0,,"$75,000 or greater","(43.23554134300048, -108.10983035299967)",56,Income,"$75,000 or greater"


In [16]:
# Saving income dataset 'obesity_in_US_income_2014.csv'
obesity_in_US_income_2014.to_csv('../data/Clean Data/obesity_in_US_income_2014.csv')

### Create Age Data Frame (2014)

In [17]:
# Age Dataset (2014)
# create filter that only looks at income rows 
obesity_in_US_age_2014=obesity_in_US_df2[(obesity_in_US_df2['Stratification1']=='18 - 24')|(obesity_in_US_df2['Stratification1']=='35 - 44')|(obesity_in_US_df2['Stratification1']=='45 - 54')|(obesity_in_US_df2['Stratification1']=='55 - 64')|(obesity_in_US_df2['Stratification1']=='65 or older')]
obesity_in_US_age_2014 = obesity_in_US_age_2014.drop(['Income','Gender','Education','Race/Ethnicity'],1);

In [18]:
obesity_in_US_age_2014.columns
cols_to_use = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Data_Value', 'Data_Value_Alt', 'Question', 'Sample_Size', 'Total', 'Age(years)', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1']
obesity_in_US_age_2014=obesity_in_US_age_2014[cols_to_use]

obesity_in_US_age_2014_labels = {
    'YearStart': 'year',
    'LocationAbbr': 'state',
    'LocationDesc': 'state_name',
    'Data_Value': 'age_pc',
    'Data_Value_Alt': 'Data_Value_Alt',
    'Question': 'Question',
    'Sample_Size': 'Sample_Size', 
    'Total' : 'Total',
    'Age(years)': 'Age', 
    'GeoLocation': 'GeoLocation', 
    'LocationID': 'LocationID', 
    'StratificationCategory1': 'StratificationCategory1', 
    'Stratification1': 'Stratification1'
}
obesity_in_US_age_2014 = obesity_in_US_age_2014.rename(columns = obesity_in_US_age_2014_labels)




In [19]:
obesity_in_US_age_2014 = obesity_in_US_age_2014.reset_index(drop = True)
obesity_in_US_age_2014

Unnamed: 0,year,state,state_name,age_pc,Data_Value_Alt,Question,Sample_Size,Total,Age,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2014,AL,Alabama,35.1,35.1,Percent of adults aged 18 years and older who ...,1886.0,,55 - 64,"(32.84057112200048, -86.63186076199969)",1,Age (years),55 - 64
1,2014,AL,Alabama,20.4,20.4,Percent of adults aged 18 years and older who ...,409.0,,18 - 24,"(32.84057112200048, -86.63186076199969)",1,Age (years),18 - 24
2,2014,AL,Alabama,40.9,40.9,Percent of adults aged 18 years and older who ...,839.0,,35 - 44,"(32.84057112200048, -86.63186076199969)",1,Age (years),35 - 44
3,2014,AL,Alabama,39.9,39.9,Percent of adults aged 18 years and older who ...,1364.0,,45 - 54,"(32.84057112200048, -86.63186076199969)",1,Age (years),45 - 54
4,2014,AL,Alabama,37.4,37.4,Percent of adults aged 18 years and older who ...,1886.0,,55 - 64,"(32.84057112200048, -86.63186076199969)",1,Age (years),55 - 64
...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,2014,WY,Wyoming,20.3,20.3,Percent of adults aged 18 years and older who ...,131.0,,18 - 24,"(43.23554134300048, -108.10983035299967)",56,Age (years),18 - 24
536,2014,WY,Wyoming,39.0,39.0,Percent of adults aged 18 years and older who ...,546.0,,35 - 44,"(43.23554134300048, -108.10983035299967)",56,Age (years),35 - 44
537,2014,WY,Wyoming,37.1,37.1,Percent of adults aged 18 years and older who ...,773.0,,45 - 54,"(43.23554134300048, -108.10983035299967)",56,Age (years),45 - 54
538,2014,WY,Wyoming,36.8,36.8,Percent of adults aged 18 years and older who ...,1549.0,,55 - 64,"(43.23554134300048, -108.10983035299967)",56,Age (years),55 - 64


In [20]:
# Saving age dataset 'obesity_in_US_age_2014.csv'
obesity_in_US_age_2014.to_csv('../data/Clean Data/obesity_in_US_age_2014.csv')