## Behavioral Risk Factor Surveillance System

#### Following commands were executed to download the CSV file from Kaggle API.

#### https://github.com/Kaggle/kaggle-api




In [None]:
!pip install kaggle

!kaggle datasets download -d cdc/behavioral-risk-factor-surveillance-system


#### Installing Dependencies

In [109]:
# Installing Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')


#### Cleaning up the Data Set.




In [2]:
%%time
# The behavioral-risk-factor-surveillance-system.zip had data for 5 years. 
# Using the 2015 data for analysis now.


file_path = "./Resources/2015.csv"
behavioral_data_original = pd.read_csv(file_path, encoding="utf-8")

# Reviewing the original dataset.
behavioral_data_original.head()

Wall time: 16.3 s


#### Creating a new CSV with the selected columns list

In [110]:
%%time
# New dataframe with selected columns from the original dataframe.
behavioral_data_reduced = behavioral_data_original [["_STATE", "SEX", "MARITAL","INTERNET", "_RACEGR3", "EDUCA", "_AGEG5YR", "WEIGHT2",  "HTIN4","_BMI5CAT","BPHIGH4" ,"TOLDHI2", "INCOME2", "SMOKDAY2" , "AVEDRNK2", "_VEGESUM", "_FRUTSUM", "CVDSTRK3", "PA1MIN_", "CVDCRHD4", "CHCOCNCR", "CHCKIDNY", "DIABETE3"]]

# Writing this new dataframe to a file
behavioral_data_reduced.to_csv("./Resources/behavioral_revised_data.csv")

# Viewing the metadate info.
behavioral_data_reduced.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441456 entries, 0 to 441455
Data columns (total 23 columns):
_STATE      441456 non-null float64
SEX         441456 non-null float64
MARITAL     441456 non-null float64
INTERNET    437146 non-null float64
_RACEGR3    441456 non-null float64
EDUCA       441456 non-null float64
_AGEG5YR    441456 non-null float64
WEIGHT2     436141 non-null float64
HTIN4       424196 non-null float64
_BMI5CAT    405058 non-null float64
BPHIGH4     441455 non-null float64
TOLDHI2     382302 non-null float64
INCOME2     438155 non-null float64
SMOKDAY2    184193 non-null float64
AVEDRNK2    210838 non-null float64
_VEGESUM    390339 non-null float64
_FRUTSUM    397745 non-null float64
CVDSTRK3    441456 non-null float64
PA1MIN_     289037 non-null float64
CVDCRHD4    441455 non-null float64
CHCOCNCR    441456 non-null float64
CHCKIDNY    441456 non-null float64
DIABETE3    441449 non-null float64
dtypes: float64(23)
memory usage: 77.5 MB
Wall time: 17.2 s


In [111]:
%%time
# Viewing the Descriptive stats info on the revised data.
behavioral_data_reduced.describe()

Wall time: 1.01 s


Unnamed: 0,_STATE,SEX,MARITAL,INTERNET,_RACEGR3,EDUCA,_AGEG5YR,WEIGHT2,HTIN4,_BMI5CAT,...,SMOKDAY2,AVEDRNK2,_VEGESUM,_FRUTSUM,CVDSTRK3,PA1MIN_,CVDCRHD4,CHCOCNCR,CHCKIDNY,DIABETE3
count,441456.0,441456.0,441456.0,437146.0,441456.0,441456.0,441456.0,436141.0,424196.0,405058.0,...,184193.0,210838.0,390339.0,397745.0,441456.0,289037.0,441455.0,441456.0,441456.0,441449.0
mean,29.968715,1.576542,2.263653,1.233819,1.681386,4.920094,7.803623,733.204388,66.725177,2.938461,...,2.439034,3.493758,194.73,136.242,1.97388,483.804,1.986952,1.914746,1.98331,2.757888
std,16.03471,0.494107,1.687844,0.575152,1.527553,1.076198,3.495609,2197.377381,4.129768,0.826482,...,0.88636,10.559959,155.65,137.9642,0.348689,744.4642,0.534279,0.403817,0.368072,0.723319
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0,36.0,1.0,...,1.0,1.0,5.397605e-79,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,1.0
25%,19.0,1.0,1.0,1.0,1.0,4.0,5.0,149.0,64.0,2.0,...,2.0,1.0,110.0,57.0,2.0,120.0,2.0,2.0,2.0,3.0
50%,29.0,2.0,1.0,1.0,1.0,5.0,8.0,175.0,66.0,3.0,...,3.0,2.0,169.0,100.0,2.0,275.0,2.0,2.0,2.0,3.0
75%,44.0,2.0,3.0,1.0,1.0,6.0,10.0,210.0,70.0,4.0,...,3.0,2.0,243.0,200.0,2.0,546.0,2.0,2.0,2.0,3.0
max,72.0,2.0,9.0,9.0,9.0,9.0,14.0,9999.0,95.0,4.0,...,9.0,99.0,19929.0,15000.0,9.0,54000.0,9.0,9.0,9.0,9.0


#### Reading from the new CSV with selected columns

In [112]:
%%time
# Reading the new revised behavioral revised csv for processing.
file_path = "./Resources/behavioral_revised_data.csv"
behavioral_data =  pd.read_csv(file_path,skip_blank_lines=True, na_values=[9999,"", 7777],index_col=0)

# Previewing the data
behavioral_data.head()


Wall time: 2.42 s


#### Cleaning up the data. Removing outliers and Nan values.

In [113]:
# Replacing all numeric values with state names
states_data = {1 : "Alabama", 2: "Alaska" , 4: "Arizona",
               5 : "Arkansas", 6: "California",8 : "Colorado",
               9 : "Connecticut", 10 : "Delaware", 11: "District of Columbia",
               12 : "Florida", 13 :  "Georgia",15 : "Hawaii", 16 : "Idaho",
               17 : "Illinois", 18 : "Indiana", 19 : "Iowa", 20 : "Kansas",
               21 : "Kentucky", 22 : "Louisiana", 23 : "Maine" ,24 : "Maryland",
               25 : "Massachusetts", 26 : "Michigan" ,27 : "Minnesota", 28 : "Mississippi",
               29 : "Missouri",30 : "Montana",31 : "Nebraska",32 : "Nevada",33 : "New Hampshire ",34 : "New Jersey"
               ,35 : "New Mexico",36 : "New York",37 : "North Carolina",38 : "North Dakota",39 : "Ohio",40 : "Oklahoma"
               ,41 : "Oregon",42 : "Pennsylvania",44 : "Rhode Island",45 : "South Carolina",46 : "South Dakota",47 : "Tennessee"
               ,48 : "Texas",49 : "Utah",50 : "Vermont",51 : "Virginia",53 : "Washington",54 : "West Virginia ",55 : "Wisconsin"
               ,56 : "Wyoming",66 : "Guam",72 : "Puerto Rico"}

#Replacing gender codes with actual values
gender = { 1: "Male" , 2 : "Female"}

# Replacing Marital status codes with actual values
marital_status = { 1 : "Married", 2 : "Divorced", 3 : "Widowed", 4 : "Separated", 5 : "Never married", 
                  6 : "A member of an unmarried couple",9 : "Refused" }

# Replacing education values to categories
education = {1 : "No School", 2 : "Elementary", 
            3 : "Junior High",4 : "High School", 
             5 : "College 3yrs", 
             6 : "College 4yrs", 9 : "Refused"}

# Replacing internet usage to values
internet_values = {1 : "Yes", 2 : "No", 7 : "Don’t know/Not Sure", 9 : "Refused", "" :  "Not asked or Missing"}

# Replacing race values to it's categories
race = { 1 :"White only", 2 :"Black only", 3 :"Other race only", 4 :"Multiracial",
                5 :"Hispanic", 9 :"Don’t know/Not sure/Refused"}


# Replacing age values to it's categories
age_values = {1 : "18-24", 2 : "25-29", 3 : "30-34", 4 : "35-39", 
              5 : "40-44", 6 : "45-49", 7 : "50-54", 8 : "55-59", 
              9 : "60-64", 10: "65-69", 11: "70-74", 12: "75-79",
              13: ">80", 14: "Don’t know/Refused/Missing"}

# Replacing income values to it's categories
income_values = {1.0 : "< 10000", 2 : "10000-15000", 3:  "15000-20000", 
                 4.0 : "20000-25000" , 5 : "25000-35000" , 6 : "35000-50000", 
                 7.0 : "50000-75000" , 8 : " >75000", 77 : "Don’t know/Not sure", 99 : "Refused"
                }

# Replacing smoke values to it's categories
smoke_values = { 1 : "Every day", 2 : "Some days", 3 :  "Not at all", 
                7 : "Don´t Know/Not Sure" , 9 : "Refused", "" :  "Not asked or Missing"}

# Replacing heart disease values to actual state.
heart_disease_values = { 1 : 'Yes' , 2 : 'No' , 7 : 'Don’t know/Not sure' , 9: 'Refused'}

# Replacing BMI values to it's categories.
bmi_values = { 1: "Underweight", 2: "Normal Weight" , 3:"Overweight" , 4 : "Obese" , "" : "Refused"}


# Replacing BP values to actual state.
blood_pressure_values = {1: "Yes", 2 : "Yes" , 3: "No", 4: "Borderline" , 7: "Don´t know/Not Sure", 9:"Refused" , "" :"Not Asked"}

# Replacing Cholestrol values to it's state.
cholestrol_values = { 1: "Yes", 2:"No", 3:"Don’t know/Not Sure", 9: "Refused", "": "Not Asked"}

# Replacing Cancer values to it's state.
cancer_values = { 1: "Yes" , 2 : "No" , 7 : "Don’t know / Not sure", 9 : "Refused"}

# Replacing Kidney values it's state
kidney_values = { 1: "Yes" , 2: "No", 7 : "Don’t know / Not sure" , 9 : "Refused" }

# Replacing Stroke values to it's state.
stroke_values = {1 : "Yes", 2 : "No", 7 : "Don’t know/Not sure", 9 : "Refused"}

# Replacing Diabetic values to it's state.
diabetes_values = { 1: "Yes", 2: "Yes" , 3: "No" , 4: "No" , 7: "Don’t know/Not Sure" , 9: "Refused"}

behavioral_data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 441456 entries, 0.0 to 441455.0
Data columns (total 23 columns):
_STATE      441456 non-null float64
SEX         441456 non-null float64
MARITAL     441456 non-null float64
INTERNET    437146 non-null float64
_RACEGR3    441456 non-null float64
EDUCA       441456 non-null float64
_AGEG5YR    441456 non-null float64
WEIGHT2     410543 non-null float64
HTIN4       424196 non-null float64
_BMI5CAT    405058 non-null float64
BPHIGH4     441455 non-null float64
TOLDHI2     382302 non-null float64
INCOME2     438155 non-null float64
SMOKDAY2    184193 non-null float64
AVEDRNK2    210838 non-null float64
_VEGESUM    390339 non-null float64
_FRUTSUM    397745 non-null float64
CVDSTRK3    441456 non-null float64
PA1MIN_     289037 non-null float64
CVDCRHD4    441455 non-null float64
CHCOCNCR    441456 non-null float64
CHCKIDNY    441456 non-null float64
DIABETE3    441449 non-null float64
dtypes: float64(23)
memory usage: 80.8 MB


In [114]:
# Also different state has different number of records . Will keep uniform number of records from each state.
# Keeping 1500 records from each state

#behavioral_data_grouped = behavioral_data.groupby(["_STATE"]).apply(lambda x: x.sort_values(["_STATE"])).reset_index(drop=True)
# select top N rows within each continent
#behavioral_data_truncated = behavioral_data_grouped.groupby('_STATE').head(1500)


In [115]:

# Creating some new columns to store the text values
behavioral_data["State"] = ""
behavioral_data["Sex"] = ""
behavioral_data["Marital Status"] = ""
behavioral_data["Education"] = ""
behavioral_data["Internet"] = ""
behavioral_data["Race"] = ""
behavioral_data["Income"] = ""
behavioral_data["Smoking"] = ""
behavioral_data["Heart Disease"] = ""
behavioral_data["Kidney Disease"] = ""
behavioral_data["Blood Pressure"] = ""
behavioral_data["Diabetes"] = ""
behavioral_data["Cancer"] = ""
behavioral_data["Cholestrol"] = ""
behavioral_data["BMI"] = ""
behavioral_data["Stroke"] = ""
behavioral_data["Age"] = ""
behavioral_data.head()

Unnamed: 0,_STATE,SEX,MARITAL,INTERNET,_RACEGR3,EDUCA,_AGEG5YR,WEIGHT2,HTIN4,_BMI5CAT,...,Smoking,Heart Disease,Kidney Disease,Blood Pressure,Diabetes,Cancer,Cholestrol,BMI,Stroke,Age
0.0,1.0,2.0,1.0,2.0,1.0,4.0,9.0,280.0,70.0,4.0,...,,,,,,,,,,
1.0,1.0,2.0,2.0,1.0,1.0,6.0,7.0,165.0,68.0,3.0,...,,,,,,,,,,
2.0,1.0,2.0,2.0,2.0,1.0,4.0,11.0,158.0,71.0,2.0,...,,,,,,,,,,
3.0,1.0,2.0,1.0,2.0,1.0,4.0,9.0,180.0,67.0,3.0,...,,,,,,,,,,
4.0,1.0,2.0,1.0,1.0,1.0,5.0,9.0,142.0,64.0,2.0,...,,,,,,,,,,


In [116]:
%%time
# Converting Numeric data to Text form
behavioral_data['State'] = behavioral_data['_STATE'].map(lambda x: states_data.get(x, None))
behavioral_data["Sex"] = behavioral_data['SEX'].map(lambda x: gender.get(x, None))
behavioral_data["Marital Status"] = behavioral_data['MARITAL'].map(lambda x: marital_status.get(x, None))
behavioral_data["Education"] =  behavioral_data["EDUCA"].map(lambda x: education.get(x, None))
behavioral_data["Internet"] = behavioral_data["INTERNET"].map(lambda x: internet_values.get(x, None))
behavioral_data["Race"] = behavioral_data["_RACEGR3"].map(lambda x: race.get(x, None))
behavioral_data["Income"] = behavioral_data["INCOME2"].map(lambda x: income_values.get(x, None))
behavioral_data["Smoking"] = behavioral_data["SMOKDAY2"].map(lambda x: smoke_values.get(x, None))
behavioral_data["Heart Disease"] = behavioral_data["CVDCRHD4"].map(lambda x: heart_disease_values.get(x, None))
behavioral_data["Kidney Disease"] = behavioral_data["CHCKIDNY"].map(lambda x: kidney_values.get(x, None))
behavioral_data["Blood Pressure"] = behavioral_data["BPHIGH4"].map(lambda x: blood_pressure_values.get(x, None))
behavioral_data["Cancer"] = behavioral_data["CHCOCNCR"].map(lambda x: cancer_values.get(x, None))
behavioral_data["Diabetes"] = behavioral_data["DIABETE3"].map(lambda x: diabetes_values.get(x, None))
behavioral_data["Cholestrol"] = behavioral_data["TOLDHI2"].map(lambda x: cholestrol_values.get(x, None))
behavioral_data["BMI"] = behavioral_data["_BMI5CAT"].map(lambda x: bmi_values.get(x, None))
behavioral_data["Stroke"] = behavioral_data["CVDSTRK3"].map(lambda x: stroke_values.get(x, None))
behavioral_data["Age"] = behavioral_data["_AGEG5YR"].map(lambda x: age_values.get(x, None))
behavioral_data["HTIN4"] = behavioral_data["HTIN4"].map(lambda x: x * 0.08333)  # Converting to Feet from inches
behavioral_data["PA1MIN_"] = behavioral_data["PA1MIN_"] / 7   # Converting to Per day from Per Week
behavioral_data["_VEGESUM"] = round(behavioral_data["_VEGESUM"] / 100,2)
behavioral_data["_FRUTSUM"] = round(behavioral_data["_FRUTSUM"] / 100,2)


Wall time: 4.75 s


In [117]:
%%time
# Replacing Refused value with None
#behavioral_data['MARITAL'] = behavioral_data["MARITAL"].map(lambda x: np.nan if x==9 else x)
#behavioral_data['INTERNET'] = behavioral_data["INTERNET"].map(lambda x: np.nan if x == 9 else x)
#behavioral_data['_RACEGR3'] = behavioral_data["_RACEGR3"].map(lambda x: np.nan if x==9 else x)
#behavioral_data['EDUCA'] = behavioral_data["EDUCA"].map(lambda x: np.nan if x==9 else x)
#behavioral_data['SMOKDAY2'] = behavioral_data["SMOKDAY2"].map(lambda x: np.nan if x==9 else x)
#behavioral_data['AVEDRNK2'] = behavioral_data["AVEDRNK2"].map(lambda x: np.nan if x==9 else x)
#behavioral_data['CVDSTRK3'] = behavioral_data["CVDSTRK3"].map(lambda x: np.nan if x==9 else x)
#behavioral_data['CVDCRHD4'] = behavioral_data["CVDCRHD4"].map(lambda x: np.nan if x==9 else x)
#behavioral_data['CHCOCNCR'] = behavioral_data["CHCOCNCR"].map(lambda x: np.nan if x==9 else x)
#behavioral_data['CHCKIDNY'] = behavioral_data["CHCKIDNY"].map(lambda x: np.nan if x==9 else x)
#behavioral_data['DIABETE3'] = behavioral_data["DIABETE3"].map(lambda x: np.nan if x==9 else x)
# Dropping all NA rows

behavioral_data = behavioral_data.dropna()
behavioral_data = behavioral_data.reset_index(drop=True)

behavioral_data.describe()

Wall time: 1.38 s


In [118]:
# Dropping the old columns
behavioral_data = behavioral_data.drop(columns = ["_STATE", "SEX", "MARITAL","EDUCA", "INTERNET", "_RACEGR3", "INCOME2", "SMOKDAY2", "CVDCRHD4","CHCKIDNY","BPHIGH4", "CHCOCNCR", "TOLDHI2", "_BMI5CAT", "CVDSTRK3", "_AGEG5YR" ,"DIABETE3"])

behavioral_data.head()

Unnamed: 0,WEIGHT2,HTIN4,AVEDRNK2,_VEGESUM,_FRUTSUM,PA1MIN_,State,Sex,Marital Status,Education,...,Smoking,Heart Disease,Kidney Disease,Blood Pressure,Diabetes,Cancer,Cholestrol,BMI,Stroke,Age
0,128.0,4.9998,2.0,2.67,0.34,30.0,Alabama,Female,Widowed,High School,...,Every day,No,No,Yes,No,No,Yes,Overweight,No,70-74
1,172.0,5.8331,1.0,1.83,1.67,40.0,Alabama,Male,Married,College 4yrs,...,Not at all,No,No,No,Yes,No,No,Normal Weight,No,>80
2,135.0,5.33312,1.0,0.97,0.0,308.0,Alabama,Male,Married,College 3yrs,...,Every day,No,No,No,Yes,No,No,Normal Weight,No,50-54
3,190.0,5.99976,1.0,1.42,4.29,20.0,Alabama,Male,Married,College 3yrs,...,Every day,No,No,No,No,No,No,Overweight,No,35-39
4,212.0,5.91643,1.0,2.43,1.07,150.0,Alabama,Male,Married,College 4yrs,...,Not at all,Yes,No,Yes,No,No,Yes,Overweight,Yes,65-69


In [119]:
# Re-naming the columns
behavioral_data.rename(columns={"WEIGHT2" : "Weight(lbs)", "HTIN4" : "Height(ft)", "AVEDRNK2": "Alcohol/Day", "_VEGESUM" : "Vegetables/Day" ,"_FRUTSUM" : "Fruits/Day", "PA1MIN_" : "Physical Activity/Day(mints)"},inplace=True)

# Re-arranging the columns

behavioral_data = behavioral_data[["State", "Sex", "Marital Status","Age","Race","Weight(lbs)","Height(ft)", "Income","Vegetables/Day","Fruits/Day", "Physical Activity/Day(mints)", "Smoking","Alcohol/Day","BMI", "Blood Pressure", "Cholestrol", "Heart Disease", "Stroke", "Cancer","Diabetes"]]

behavioral_data.head()

Unnamed: 0,State,Sex,Marital Status,Age,Race,Weight(lbs),Height(ft),Income,Vegetables/Day,Fruits/Day,Physical Activity/Day(mints),Smoking,Alcohol/Day,BMI,Blood Pressure,Cholestrol,Heart Disease,Stroke,Cancer,Diabetes
0,Alabama,Female,Widowed,70-74,White only,128.0,4.9998,20000-25000,2.67,0.34,30.0,Every day,2.0,Overweight,Yes,Yes,No,No,No,No
1,Alabama,Male,Married,>80,White only,172.0,5.8331,>75000,1.83,1.67,40.0,Not at all,1.0,Normal Weight,No,No,No,No,No,Yes
2,Alabama,Male,Married,50-54,White only,135.0,5.33312,35000-50000,0.97,0.0,308.0,Every day,1.0,Normal Weight,No,No,No,No,No,Yes
3,Alabama,Male,Married,35-39,White only,190.0,5.99976,15000-20000,1.42,4.29,20.0,Every day,1.0,Overweight,No,No,No,No,No,No
4,Alabama,Male,Married,65-69,White only,212.0,5.91643,Refused,2.43,1.07,150.0,Not at all,1.0,Overweight,Yes,Yes,Yes,Yes,No,No


In [120]:
%%time
# Writing the cleane data to a new csv file.
output_file = "./Resources/behavioral_revised_data_final.csv"
behavioral_data.to_csv(output_file,index=0)


Wall time: 1.2 s


In [121]:
behavioral_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56830 entries, 0 to 56829
Data columns (total 20 columns):
State                           56830 non-null object
Sex                             56830 non-null object
Marital Status                  56830 non-null object
Age                             56830 non-null object
Race                            56830 non-null object
Weight(lbs)                     56830 non-null float64
Height(ft)                      56830 non-null float64
Income                          56830 non-null object
Vegetables/Day                  56830 non-null float64
Fruits/Day                      56830 non-null float64
Physical Activity/Day(mints)    56830 non-null float64
Smoking                         56830 non-null object
Alcohol/Day                     56830 non-null float64
BMI                             56830 non-null object
Blood Pressure                  56830 non-null object
Cholestrol                      56830 non-null object
Heart Disease        

In [122]:
behavioral_data.head()

Unnamed: 0,State,Sex,Marital Status,Age,Race,Weight(lbs),Height(ft),Income,Vegetables/Day,Fruits/Day,Physical Activity/Day(mints),Smoking,Alcohol/Day,BMI,Blood Pressure,Cholestrol,Heart Disease,Stroke,Cancer,Diabetes
0,Alabama,Female,Widowed,70-74,White only,128.0,4.9998,20000-25000,2.67,0.34,30.0,Every day,2.0,Overweight,Yes,Yes,No,No,No,No
1,Alabama,Male,Married,>80,White only,172.0,5.8331,>75000,1.83,1.67,40.0,Not at all,1.0,Normal Weight,No,No,No,No,No,Yes
2,Alabama,Male,Married,50-54,White only,135.0,5.33312,35000-50000,0.97,0.0,308.0,Every day,1.0,Normal Weight,No,No,No,No,No,Yes
3,Alabama,Male,Married,35-39,White only,190.0,5.99976,15000-20000,1.42,4.29,20.0,Every day,1.0,Overweight,No,No,No,No,No,No
4,Alabama,Male,Married,65-69,White only,212.0,5.91643,Refused,2.43,1.07,150.0,Not at all,1.0,Overweight,Yes,Yes,Yes,Yes,No,No
