# Obesity in the US 2016 Exploration
- This is a dataset that looks at adults 18 years and older across the United States who are obese. 
- This data differentiates people by age, income, education, gender, and race/ethnicity. 
- In this notebook, I will explore and clean the dataset and create new data frames for deeper analysis
    - I will clean up the columns and filter out the year column to only look at 2016 for age, income, education, 
    gender, and race/ethnicity. 
    - I will make data frames for gender obesity rates in 2016, age obesity rates in 2016, income obesity rates in     2016, and education obesity rates in 2016

# Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as np
import sys
import matplotlib 
import seaborn as sns
import numpy as np

# Load & Inspect Data

In [2]:
obesity_in_US_df = pd.read_csv('../data/Raw Data/Nutrition_Physical_Activity_and_Obesity_Behavioral_Risk_Factor_Surveillance_System.csv')
obesity_in_US_df.shape

(53392, 33)

# Make a 2016 Data frame 

In [3]:
# dropping unnecessary columns
# rename Stratication1 and StratificationCategory1 and grouping age, education, income, gender, race/ethnicity into columns

obesity_in_US_cols = obesity_in_US_df.drop(['Low_Confidence_Limit','High_Confidence_Limit ','YearEnd','Topic','Class','Datasource','Data_Value_Unit','QuestionID','ClassID','TopicID','DataValueTypeID','Data_Value_Type','Data_Value_Footnote_Symbol','Data_Value_Footnote','StratificationCategoryId1','StratificationID1'],1);

#Focusing only on 2016
#create filter 
obesity_in_US_year2016=obesity_in_US_cols[obesity_in_US_cols['YearStart']==2016]
obesity_in_US_year2016

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1
48772,2016,US,National,Percent of adults aged 18 years and older who ...,35.2,35.2,438479.0,Total,,,,,,,59,Total,Total
48773,2016,US,National,Percent of adults aged 18 years and older who ...,41.0,41.0,198440.0,,,,Male,,,,59,Gender,Male
48774,2016,US,National,Percent of adults aged 18 years and older who ...,29.3,29.3,240000.0,,,,Female,,,,59,Gender,Female
48775,2016,US,National,Percent of adults aged 18 years and older who ...,34.7,34.7,32325.0,,,Less than high school,,,,,59,Education,Less than high school
48776,2016,US,National,Percent of adults aged 18 years and older who ...,34.2,34.2,123241.0,,,High school graduate,,,,,59,Education,High school graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53387,2016,VI,Virgin Islands,Percent of adults who engage in no leisure-tim...,,,,,,,,,Asian,"(18.335765, -64.896335)",78,Race/Ethnicity,Asian
53388,2016,VI,Virgin Islands,Percent of adults who engage in no leisure-tim...,,,,,,,,,Hawaiian/Pacific Islander,"(18.335765, -64.896335)",78,Race/Ethnicity,Hawaiian/Pacific Islander
53389,2016,VI,Virgin Islands,Percent of adults who engage in no leisure-tim...,,,,,,,,,American Indian/Alaska Native,"(18.335765, -64.896335)",78,Race/Ethnicity,American Indian/Alaska Native
53390,2016,VI,Virgin Islands,Percent of adults who engage in no leisure-tim...,,,,,,,,,2 or more races,"(18.335765, -64.896335)",78,Race/Ethnicity,2 or more races


In [4]:
# create filter that chooses the following questions: 
# filter uses obesity_in_US_year2014 because we are only looking at 2016
       #q1: 'Percent of adults aged 18 years and older who have obesity',
       #q2: 'Percent of adults aged 18 years and older who have an overweight classification'
questions_filter=obesity_in_US_year2016[(obesity_in_US_year2016['Question']=='Percent of adults aged 18 years and older who have obesity') | (obesity_in_US_year2016['Question']=='Percent of adults aged 18 years and older who have an overweight classification')]
questions_filter['Question'].unique()

array(['Percent of adults aged 18 years and older who have an overweight classification',
       'Percent of adults aged 18 years and older who have obesity'],
      dtype=object)

In [5]:
# Rename "questions_filter" to "obesity_in_US_df3"
obesity_in_US_df4 = questions_filter
obesity_in_US_df4

Unnamed: 0,YearStart,LocationAbbr,LocationDesc,Question,Data_Value,Data_Value_Alt,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1
48772,2016,US,National,Percent of adults aged 18 years and older who ...,35.2,35.2,438479.0,Total,,,,,,,59,Total,Total
48773,2016,US,National,Percent of adults aged 18 years and older who ...,41.0,41.0,198440.0,,,,Male,,,,59,Gender,Male
48774,2016,US,National,Percent of adults aged 18 years and older who ...,29.3,29.3,240000.0,,,,Female,,,,59,Gender,Female
48775,2016,US,National,Percent of adults aged 18 years and older who ...,34.7,34.7,32325.0,,,Less than high school,,,,,59,Education,Less than high school
48776,2016,US,National,Percent of adults aged 18 years and older who ...,34.2,34.2,123241.0,,,High school graduate,,,,,59,Education,High school graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51847,2016,VI,Virgin Islands,Percent of adults aged 18 years and older who ...,,,,,,,,,Asian,"(18.335765, -64.896335)",78,Race/Ethnicity,Asian
51848,2016,VI,Virgin Islands,Percent of adults aged 18 years and older who ...,,,,,,,,,Hawaiian/Pacific Islander,"(18.335765, -64.896335)",78,Race/Ethnicity,Hawaiian/Pacific Islander
51849,2016,VI,Virgin Islands,Percent of adults aged 18 years and older who ...,,,,,,,,,American Indian/Alaska Native,"(18.335765, -64.896335)",78,Race/Ethnicity,American Indian/Alaska Native
51850,2016,VI,Virgin Islands,Percent of adults aged 18 years and older who ...,,,,,,,,,2 or more races,"(18.335765, -64.896335)",78,Race/Ethnicity,2 or more races


# (2016) New Gender, Income, Age Data Frames  

### Create Gender Data Frame (2016)

In [6]:
# Gender Dataset 2016
# create filter that only looks at gender rows 
obesity_in_US_gender_2016=obesity_in_US_df4[(obesity_in_US_df4['Stratification1']=='Male')|(obesity_in_US_df4['Stratification1']=='Female')]
obesity_in_US_gender_2016 = obesity_in_US_gender_2016.drop(['Age(years)','Education','Income','Race/Ethnicity'],1);



In [7]:
obesity_in_US_gender_2016.columns
cols_to_use = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Data_Value', 'Data_Value_Alt', 'Question', 'Sample_Size', 'Total', 'Gender', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1']
obesity_in_US_gender_2016=obesity_in_US_gender_2016[cols_to_use]

obesity_in_US_gender_2016_labels = {
    'YearStart': 'year',
    'LocationAbbr': 'state',
    'LocationDesc': 'state_name',
    'Data_Value': 'gender_pc',
    'Data_Value_Alt': 'Data_Value_Alt',
    'Question': 'Question',
    'Sample_Size': 'Sample_Size', 
    'Total' : 'Total',
    'Gender': 'Gender', 
    'GeoLocation': 'GeoLocation', 
    'LocationID': 'LocationID', 
    'StratificationCategory1': 'StratificationCategory1', 
    'Stratification1': 'Stratification1'
}
obesity_in_US_gender_2016 = obesity_in_US_gender_2016.rename(columns = obesity_in_US_gender_2016_labels)


In [8]:
obesity_in_US_gender_2016 = obesity_in_US_gender_2016.reset_index(drop = True)
obesity_in_US_gender_2016

Unnamed: 0,year,state,state_name,gender_pc,Data_Value_Alt,Question,Sample_Size,Total,Gender,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2016,US,National,41.0,41.0,Percent of adults aged 18 years and older who ...,198440.0,,Male,,59,Gender,Male
1,2016,US,National,29.3,29.3,Percent of adults aged 18 years and older who ...,240000.0,,Female,,59,Gender,Female
2,2016,AR,Arkansas,38.0,38.0,Percent of adults aged 18 years and older who ...,1918.0,,Male,"(34.74865012400045, -92.27449074299966)",5,Gender,Male
3,2016,AR,Arkansas,27.0,27.0,Percent of adults aged 18 years and older who ...,2941.0,,Female,"(34.74865012400045, -92.27449074299966)",5,Gender,Female
4,2016,AL,Alabama,38.2,38.2,Percent of adults aged 18 years and older who ...,2711.0,,Male,"(32.84057112200048, -86.63186076199969)",1,Gender,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,2016,GU,Guam,27.3,27.3,Percent of adults aged 18 years and older who ...,805.0,,Female,"(13.444304, 144.793731)",66,Gender,Female
216,2016,PR,Puerto Rico,30.0,30.0,Percent of adults aged 18 years and older who ...,2105.0,,Male,"(18.220833, -66.590149)",72,Gender,Male
217,2016,PR,Puerto Rico,31.4,31.4,Percent of adults aged 18 years and older who ...,3423.0,,Female,"(18.220833, -66.590149)",72,Gender,Female
218,2016,VI,Virgin Islands,22.2,22.2,Percent of adults aged 18 years and older who ...,468.0,,Male,"(18.335765, -64.896335)",78,Gender,Male


In [9]:
# Saving gender dataframe "obesity_in_US_gender_2016.csv"
obesity_in_US_gender_2016.to_csv('../data/Clean Data/obesity_in_US_gender_2016.csv')

### Create Income Data Frame (2016)

In [10]:
# Income Dataset (2016)
# create filter that only looks at income rows 
obesity_in_US_income_2016=obesity_in_US_df4[obesity_in_US_df4['StratificationCategory1']=='Income']
obesity_in_US_income_2016 = obesity_in_US_income_2016.drop(['Age(years)','Gender','Education','Race/Ethnicity'],1);

In [11]:
obesity_in_US_income_2016.columns
cols_to_use = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Data_Value', 'Data_Value_Alt', 'Question', 'Sample_Size', 'Total', 'Income', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1']
obesity_in_US_income_2016=obesity_in_US_income_2016[cols_to_use]

obesity_in_US_income_2016_labels = {
    'YearStart': 'year',
    'LocationAbbr': 'state',
    'LocationDesc': 'state_name',
    'Data_Value': 'income_pc',
    'Data_Value_Alt': 'Data_Value_Alt',
    'Question': 'Question',
    'Sample_Size': 'Sample_Size', 
    'Total' : 'Total',
    'Income': 'Income', 
    'GeoLocation': 'GeoLocation', 
    'LocationID': 'LocationID', 
    'StratificationCategory1': 'StratificationCategory1', 
    'Stratification1': 'Stratification1'
}
obesity_in_US_income_2016 = obesity_in_US_income_2016.rename(columns = obesity_in_US_income_2016_labels)



In [12]:
obesity_in_US_income_2016 = obesity_in_US_income_2016.reset_index(drop = True)
obesity_in_US_income_2016

Unnamed: 0,year,state,state_name,income_pc,Data_Value_Alt,Question,Sample_Size,Total,Income,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2016,AZ,Arizona,34.7,34.7,Percent of adults aged 18 years and older who ...,842.0,,"Less than $15,000","(34.865970280000454, -111.76381127699972)",4,Income,"Less than $15,000"
1,2016,AZ,Arizona,30.9,30.9,Percent of adults aged 18 years and older who ...,1379.0,,"$15,000 - $24,999","(34.865970280000454, -111.76381127699972)",4,Income,"$15,000 - $24,999"
2,2016,AZ,Arizona,34.1,34.1,Percent of adults aged 18 years and older who ...,918.0,,"$25,000 - $34,999","(34.865970280000454, -111.76381127699972)",4,Income,"$25,000 - $34,999"
3,2016,AZ,Arizona,35.3,35.3,Percent of adults aged 18 years and older who ...,1355.0,,"$35,000 - $49,999","(34.865970280000454, -111.76381127699972)",4,Income,"$35,000 - $49,999"
4,2016,AZ,Arizona,36.5,36.5,Percent of adults aged 18 years and older who ...,1421.0,,"$50,000 - $74,999","(34.865970280000454, -111.76381127699972)",4,Income,"$50,000 - $74,999"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,2016,VI,Virgin Islands,30.6,30.6,Percent of adults aged 18 years and older who ...,150.0,,"$25,000 - $34,999","(18.335765, -64.896335)",78,Income,"$25,000 - $34,999"
766,2016,VI,Virgin Islands,36.3,36.3,Percent of adults aged 18 years and older who ...,153.0,,"$35,000 - $49,999","(18.335765, -64.896335)",78,Income,"$35,000 - $49,999"
767,2016,VI,Virgin Islands,43.3,43.3,Percent of adults aged 18 years and older who ...,158.0,,"$50,000 - $74,999","(18.335765, -64.896335)",78,Income,"$50,000 - $74,999"
768,2016,VI,Virgin Islands,32.6,32.6,Percent of adults aged 18 years and older who ...,206.0,,"$75,000 or greater","(18.335765, -64.896335)",78,Income,"$75,000 or greater"


In [13]:
obesity_in_US_income_2016.to_csv('../data/Clean Data/obesity_in_US_income_2016.csv')

### Create Age Data Frame (2016)

In [14]:
# Age Dataset (2016)
# create filter that only looks at income rows 
obesity_in_US_age_2016=obesity_in_US_df4[(obesity_in_US_df4['Stratification1']=='18 - 24')|(obesity_in_US_df4['Stratification1']=='35 - 44')|(obesity_in_US_df4['Stratification1']=='45 - 54')|(obesity_in_US_df4['Stratification1']=='55 - 64')|(obesity_in_US_df4['Stratification1']=='65 or older')]
obesity_in_US_age_2016 = obesity_in_US_age_2016.drop(['Income','Gender','Education','Race/Ethnicity'],1);




In [15]:
obesity_in_US_age_2016.columns
cols_to_use = ['YearStart', 'LocationAbbr', 'LocationDesc', 'Data_Value', 'Data_Value_Alt', 'Question', 'Sample_Size', 'Total', 'Age(years)', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1']
obesity_in_US_age_2016=obesity_in_US_age_2016[cols_to_use]

obesity_in_US_age_2016_labels = {
    'YearStart': 'year',
    'LocationAbbr': 'state',
    'LocationDesc': 'state_name',
    'Data_Value': 'age_pc',
    'Data_Value_Alt': 'Data_Value_Alt',
    'Question': 'Question',
    'Sample_Size': 'Sample_Size', 
    'Total' : 'Total',
    'Age(years)': 'Age', 
    'GeoLocation': 'GeoLocation', 
    'LocationID': 'LocationID', 
    'StratificationCategory1': 'StratificationCategory1', 
    'Stratification1': 'Stratification1'
}
obesity_in_US_age_2016 = obesity_in_US_age_2016.rename(columns = obesity_in_US_age_2016_labels)


In [16]:
obesity_in_US_age_2016 = obesity_in_US_age_2016.reset_index(drop = True)
obesity_in_US_age_2016

Unnamed: 0,year,state,state_name,age_pc,Data_Value_Alt,Question,Sample_Size,Total,Age,GeoLocation,LocationID,StratificationCategory1,Stratification1
0,2016,US,National,25.5,25.5,Percent of adults aged 18 years and older who ...,23734.0,,18 - 24,,59,Age (years),18 - 24
1,2016,US,National,35.7,35.7,Percent of adults aged 18 years and older who ...,48951.0,,35 - 44,,59,Age (years),35 - 44
2,2016,AZ,Arizona,37.6,37.6,Percent of adults aged 18 years and older who ...,1281.0,,45 - 54,"(34.865970280000454, -111.76381127699972)",4,Age (years),45 - 54
3,2016,AZ,Arizona,39.1,39.1,Percent of adults aged 18 years and older who ...,2127.0,,55 - 64,"(34.865970280000454, -111.76381127699972)",4,Age (years),55 - 64
4,2016,AZ,Arizona,38.6,38.6,Percent of adults aged 18 years and older who ...,4790.0,,65 or older,"(34.865970280000454, -111.76381127699972)",4,Age (years),65 or older
...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,2016,VI,Virgin Islands,15.5,15.5,Percent of adults aged 18 years and older who ...,50.0,,18 - 24,"(18.335765, -64.896335)",78,Age (years),18 - 24
546,2016,VI,Virgin Islands,39.8,39.8,Percent of adults aged 18 years and older who ...,91.0,,35 - 44,"(18.335765, -64.896335)",78,Age (years),35 - 44
547,2016,VI,Virgin Islands,41.4,41.4,Percent of adults aged 18 years and older who ...,217.0,,45 - 54,"(18.335765, -64.896335)",78,Age (years),45 - 54
548,2016,VI,Virgin Islands,33.2,33.2,Percent of adults aged 18 years and older who ...,306.0,,55 - 64,"(18.335765, -64.896335)",78,Age (years),55 - 64


In [17]:
# Saving age dataset 'obesity_in_US_age_2016.csv'
obesity_in_US_age_2016.to_csv('../data/Clean Data/obesity_in_US_age_2016.csv')