## Behavioral Risk Factor Surveillance System

#### The following commands were executed to download the CSV files through the Kaggle API.

#### https://github.com/Kaggle/kaggle-api




In [None]:
!pip install kaggle

!kaggle datasets download -d cdc/behavioral-risk-factor-surveillance-system


#### Importing Dependencies

In [1]:
# Importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')


#### Cleaning up the Data Set.




In [2]:
# The behavioral-risk-factor-surveillance-system.zip had data for 5 years.
# Only the 2015 file is used for the analysis for now.

file_path = "./Resources/2015.csv"
behavioral_data_original = pd.read_csv(file_path, encoding="utf-8")

# Print the row/column counts, dtypes and memory footprint of the raw frame.
# describe() is intentionally not called before this line: only the last
# expression of a cell is rendered, so a non-final describe() would compute
# summary statistics for every column and then throw the result away.
behavioral_data_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441456 entries, 0 to 441455
Columns: 330 entries, _STATE to _AIDTST3
dtypes: float64(323), object(7)
memory usage: 1.1+ GB


In [3]:
# Preview the first five rows of the raw 2015 frame.
behavioral_data_original.head(n=5)

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENUM,...,_PAREC1,_PASTAE1,_LMTACT1,_LMTWRK1,_LMTSCL1,_RFSEAT2,_RFSEAT3,_FLSHOT6,_PNEUMO2,_AIDTST3
0,1.0,1.0,b'01292015',b'01',b'29',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0
1,1.0,1.0,b'01202015',b'01',b'20',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,2.0,2.0,3.0,3.0,4.0,2.0,2.0,,,2.0
2,1.0,1.0,b'02012015',b'02',b'01',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,
3,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,9.0
4,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0


In [4]:
# Keep only the columns required for the analysis.
#
# Column descriptions (BLANK in the codebook means "Not asked or Missing"):
#   _STATE   = US state / territory, stored as a numeric code
#   SEX      = Respondent sex
#   MARITAL  = Marital status
#   _RACEGR3 = Race/ethnicity group (codes are listed in the `race` mapping)
#   EDUCA    = What is the highest grade or year of school you completed?
#   INTERNET = Internet use in the past 30 days?
#   WTKG3    = Weight; presumably the computed kilogram value derived from WEIGHT2
#              ("About how much do you weigh without shoes?") - confirm in the codebook
#   HTIN4    = Height; presumably the computed value in inches derived from HEIGHT3
#              ("About how tall are you without shoes?") - confirm in the codebook
#   _BMI5    = Body mass index; presumably stored with implied decimal places - confirm
#   BPHIGH4  = Presumably "Ever told you have high blood pressure?" - confirm in the codebook
#   INCOME2  = Annual household income from all sources
#   SMOKDAY2 = Do you now smoke cigarettes every day, some days, or not at all?
#   ALCDAY5  = Days in past 30 had alcoholic beverage
#   AVEDRNK2 = Avg alcoholic drinks per day in past 30
#   FRUIT1   = How many times did you eat fruit?
#   _VEGESUM = Total vegetables consumed per day
#   _FRUTSUM = Total fruits consumed per day
#   PA1MIN_  = Minutes of total Physical Activity per week
#   CVDCRHD4 = Ever diagnosed with Angina or Coronary Heart Disease
#   CHCOCNCR = (Ever told) you had any other types of cancer?
#   CHCKIDNY = (Ever told) you have kidney disease?
#   DIABETE3 = (Ever told) you have diabetes
behavioral_data_reduced = behavioral_data_original[
    [
        "_STATE", "SEX", "MARITAL", "_RACEGR3", "EDUCA", "INTERNET",
        "WTKG3", "HTIN4", "_BMI5", "BPHIGH4",
        "INCOME2", "SMOKDAY2", "ALCDAY5", "AVEDRNK2",
        "FRUIT1", "_VEGESUM", "_FRUTSUM", "PA1MIN_",
        "CVDCRHD4", "CHCOCNCR", "CHCKIDNY", "DIABETE3",
    ]
]

behavioral_data_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441456 entries, 0 to 441455
Data columns (total 22 columns):
_STATE      441456 non-null float64
SEX         441456 non-null float64
MARITAL     441456 non-null float64
_RACEGR3    441456 non-null float64
EDUCA       441456 non-null float64
INTERNET    437146 non-null float64
WTKG3       410535 non-null float64
HTIN4       424196 non-null float64
_BMI5       405058 non-null float64
BPHIGH4     441455 non-null float64
INCOME2     438155 non-null float64
SMOKDAY2    184193 non-null float64
ALCDAY5     425525 non-null float64
AVEDRNK2    210838 non-null float64
FRUIT1      412306 non-null float64
_VEGESUM    390339 non-null float64
_FRUTSUM    397745 non-null float64
PA1MIN_     289037 non-null float64
CVDCRHD4    441455 non-null float64
CHCOCNCR    441456 non-null float64
CHCKIDNY    441456 non-null float64
DIABETE3    441449 non-null float64
dtypes: float64(22)
memory usage: 74.1 MB


In [5]:
# Attempted to query the CDC API, but the request was not successful.
# The code is kept commented out for reference only.
# baseURL = "https://sdp-v.services.cdc.gov/api/fhir/Questionnaire/S-1/_history/1"

# try:
#    response_json = requests.get(baseURL).json()
#    pprint(response_json)
# except Exception as e:
#    print(f"Bad JSON {e}")


In [6]:
# Lookup tables that translate the numeric survey codes into readable labels.
# Keys are the numeric codes; float codes such as 1.0 match the int keys
# because equal numbers hash identically in Python.
# Labels carry no leading/trailing whitespace so they can be grouped or joined
# on reliably.

# _STATE: state / territory code -> name
states_data = {
    1: "Alabama", 2: "Alaska", 4: "Arizona", 5: "Arkansas", 6: "California",
    8: "Colorado", 9: "Connecticut", 10: "Delaware", 11: "District of Columbia",
    12: "Florida", 13: "Georgia", 15: "Hawaii", 16: "Idaho", 17: "Illinois",
    18: "Indiana", 19: "Iowa", 20: "Kansas", 21: "Kentucky", 22: "Louisiana",
    23: "Maine", 24: "Maryland", 25: "Massachusetts", 26: "Michigan",
    27: "Minnesota", 28: "Mississippi", 29: "Missouri", 30: "Montana",
    31: "Nebraska", 32: "Nevada", 33: "New Hampshire", 34: "New Jersey",
    35: "New Mexico", 36: "New York", 37: "North Carolina", 38: "North Dakota",
    39: "Ohio", 40: "Oklahoma", 41: "Oregon", 42: "Pennsylvania",
    44: "Rhode Island", 45: "South Carolina", 46: "South Dakota",
    47: "Tennessee", 48: "Texas", 49: "Utah", 50: "Vermont", 51: "Virginia",
    53: "Washington", 54: "West Virginia", 55: "Wisconsin", 56: "Wyoming",
    66: "Guam", 72: "Puerto Rico",
}

# SEX
gender = {1: "Male", 2: "Female"}

# MARITAL
marital_status = {
    1: "Married",
    2: "Divorced",
    3: "Widowed",
    4: "Separated",
    5: "Never married",
    6: "A member of an unmarried couple",
    9: "Refused",
}

# EDUCA
education = {
    1: "Never attended school or only kindergarten",
    2: "Grades 1 through 8 (Elementary)",
    3: "Grades 9 through 11 (Some high school)",
    4: "Grade 12 or GED (High school graduate)",
    5: "College 1 year to 3 years (Some college or technical school)",
    6: "College 4 years or more (College graduate)",
    9: "Refused",
}

# INTERNET
# NOTE: the "BLANK" key is documentation only; it is a string and therefore
# never matches a numeric code read from the data.
internet_values = {
    1: "Yes",
    2: "No",
    7: "Don’t know/Not Sure",
    9: "Refused",
    "BLANK": "Not asked or Missing",
}

# _RACEGR3
race = {
    1: "White only",
    2: "Black only",
    3: "Other race only",
    4: "Multiracial",
    5: "Hispanic",
    9: "Don’t know/Not sure/Refused",
}

# Weight classification
# (presumably describes the raw WEIGHT2 question rather than WTKG3 - confirm)
# 50 - 0999 Weight (pounds)
# 7777 Don’t know/Not sure
# 9000 - 9998 Weight (kilograms)
# 9999 Refused
# BLANK Not asked or Missing

# Height classification
# (presumably describes the raw HEIGHT3 question rather than HTIN4 - confirm)
# 200 - 711 Height (ft/inches)
# 7777 Don’t know/Not sure
# 9000 - 9998 Height (meters/centimeters)
# 9999 Refused
# BLANK Not asked or Missing

# INCOME2
income_values = {
    1: "Less than $10,000",
    2: "$10,000 to $15,000",
    3: "$15,000 to less than $20,000",
    4: "$20,000 to $25,000",
    5: "$25,000 to $35,000",
    6: "$35,000 to $50,000",
    7: "$50,000 to $75,000",
    8: "> $75,000",
    77: "Don’t know/Not sure",
    99: "Refused",
}

# SMOKDAY2
# NOTE: the "BLANK" key is documentation only (see internet_values).
smoke_values = {
    1: "Every day",
    2: "Some days",
    3: "Not at all",
    7: "Don´t Know/Not Sure",
    9: "Refused",
    "BLANK": "Not asked or Missing",
}

# Alcohol intake in days (ALCDAY5)
# 101 - 199 : Days per week
# 201 - 299 : Days in past 30 days
# 777 : Don’t know/Not sure
# 888 : No drinks in past 30 days
# 999 : Refused
# BLANK : Not asked or Missing

# Average alcoholic consumption per day in last 30 days (AVEDRNK2)
# 1 - 76 : Number of drinks
# 77 : Don’t know/Not sure
# 99 : Refused
# BLANK : Not asked or Missing

# Fruit intake: how many times did you eat fruit? (FRUIT1)
# 101 - 199 Times per day
# 201 - 299 Times per week
# 300 Less than one time per month
# 301 - 399 Times per month
# 555 Never
# 777 Don’t know/Not sure
# 999 Refused
# BLANK Not asked or Missing

# Total vegetables consumed per day (_VEGESUM)
# 0 - 99998 Number of Vegetables consumed per day (two implied decimal places)
# BLANK Not asked or Missing

# Total fruits consumed per day (_FRUTSUM)
# 0 - 99998 Number of Fruits consumed per day (two implied decimal places)
# BLANK Not asked or Missing

# Minutes of total Physical Activity per week (PA1MIN_)
# 0 - 99999 Minutes of Activity per week
# BLANK Not asked or Missing

# Ever diagnosed with Angina or Coronary Heart Disease (CVDCRHD4)
# 1 Yes
# 2 No
# 7 Don’t know/Not sure
# 9 Refused
# BLANK Not asked or Missing
heart_disease_values = {
    1: 'Yes',
    2: 'No',
    7: 'Don’t know/Not sure',
    9: 'Refused',
}

# (Ever told) you had any other types of cancer? (CHCOCNCR)
# 1 Yes
# 2 No
# 7 Don’t know / Not sure
# 9 Refused
cancer_values = {
    1: "Yes",
    2: "No",
    7: "Don’t know / Not sure",
    9: "Refused",
}

# (Ever told) you have kidney disease? (CHCKIDNY)
# 1 Yes
# 2 No
# 7 Don’t know / Not sure
# 9 Refused
kidney_values = {
    1: "Yes",
    2: "No",
    7: "Don’t know / Not sure",
    9: "Refused",
}

# (Ever told) you have diabetes (DIABETE3)
# 1 Yes
# 2 Yes, but female told only during pregnancy
# 3 No
# 4 No, pre-diabetes or borderline diabetes
# 7 Don’t know/Not Sure
# 9 Refused
# BLANK Not asked or Missing


In [7]:
# Keep only the rows that have at least 3 non-NA values (thresh=3); drop the rest.
behavioral_data_reduced = behavioral_data_reduced.dropna(axis="index", thresh=3)

# info() is printed before the fill, so the non-null counts it reports still
# reflect the missing values.
behavioral_data_reduced.info()

# Replace every remaining NA with 0. After this step a missing answer can no
# longer be told apart from a genuine value of 0.
behavioral_data_reduced = behavioral_data_reduced.fillna(value=0)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 441456 entries, 0 to 441455
Data columns (total 22 columns):
_STATE      441456 non-null float64
SEX         441456 non-null float64
MARITAL     441456 non-null float64
_RACEGR3    441456 non-null float64
EDUCA       441456 non-null float64
INTERNET    437146 non-null float64
WTKG3       410535 non-null float64
HTIN4       424196 non-null float64
_BMI5       405058 non-null float64
BPHIGH4     441455 non-null float64
INCOME2     438155 non-null float64
SMOKDAY2    184193 non-null float64
ALCDAY5     425525 non-null float64
AVEDRNK2    210838 non-null float64
FRUIT1      412306 non-null float64
_VEGESUM    390339 non-null float64
_FRUTSUM    397745 non-null float64
PA1MIN_     289037 non-null float64
CVDCRHD4    441455 non-null float64
CHCOCNCR    441456 non-null float64
CHCKIDNY    441456 non-null float64
DIABETE3    441449 non-null float64
dtypes: float64(22)
memory usage: 77.5 MB


In [8]:
# Add empty string columns that will hold the text labels for the coded
# state, sex and marital-status values.
for text_column in ("State", "Sex", "Marital Status"):
    behavioral_data_reduced[text_column] = ""

behavioral_data_reduced.head()

Unnamed: 0,_STATE,SEX,MARITAL,_RACEGR3,EDUCA,INTERNET,WTKG3,HTIN4,_BMI5,BPHIGH4,...,_VEGESUM,_FRUTSUM,PA1MIN_,CVDCRHD4,CHCOCNCR,CHCKIDNY,DIABETE3,State,Sex,Marital Status
0,1.0,2.0,1.0,1.0,4.0,2.0,12701.0,70.0,4018.0,1.0,...,217.0,50.0,0.0,2.0,2.0,2.0,3.0,,,
1,1.0,2.0,2.0,1.0,6.0,1.0,7484.0,68.0,2509.0,3.0,...,78.0,24.0,168.0,2.0,2.0,2.0,3.0,,,
2,1.0,2.0,2.0,1.0,4.0,2.0,7167.0,71.0,2204.0,3.0,...,0.0,0.0,0.0,2.0,1.0,2.0,3.0,,,
3,1.0,2.0,1.0,1.0,4.0,2.0,8165.0,67.0,2819.0,1.0,...,20.0,100.0,0.0,2.0,1.0,2.0,3.0,,,
4,1.0,2.0,1.0,1.0,5.0,1.0,6441.0,64.0,2437.0,3.0,...,200.0,0.0,0.0,2.0,2.0,2.0,3.0,,,


In [None]:
# Convert the State, Sex and Marital Status codes to text in new columns, then
# drop the original code columns.
#
# Series.map performs the dictionary lookup for the whole column at once, which
# avoids a Python-level iterrows() loop with per-row .loc writes over every row
# of the frame. Codes that are absent from a lookup table become NaN (missing).
behavioral_data_reduced["State"] = behavioral_data_reduced["_STATE"].map(states_data)
behavioral_data_reduced["Sex"] = behavioral_data_reduced["SEX"].map(gender)
behavioral_data_reduced["Marital Status"] = behavioral_data_reduced["MARITAL"].map(marital_status)

# drop() returns a new frame, so the result has to be assigned back for the
# code columns to actually be removed.
behavioral_data_reduced = behavioral_data_reduced.drop(columns=["_STATE", "SEX", "MARITAL"])

In [None]:
# Preview the first five rows of the reduced frame.
behavioral_data_reduced.head(n=5)