In [1]:
# ----------------------------------------------------------------------
# Clean CSV for pollution data
# ----------------------------------------------------------------------

import pandas as pd
import datetime as dt

# Load the raw pollution measurements from disk into a DataFrame.
pollution_csv_path = "../data/pollution_original.csv"
data = pd.read_csv(pollution_csv_path)

In [2]:
# Keep only the date/location columns plus the mean reading and unit
# columns for NO2, SO2 and CO. (No index reset happens in this cell.)
columns_to_keep = [
    'Date Local', 'State', 'City',
    'NO2 Mean', 'NO2 Units',
    'SO2 Mean', 'SO2 Units',
    'CO Mean', 'CO Units',
]

# Discard every row that has a missing value in any of the kept columns.
# The surviving rows keep their original index labels.
data = data[columns_to_keep].dropna(axis=0, how='any')

In [3]:
# Convert the 'Date Local' column to datetime values. The explicit format
# means dates are expected as year/month/day separated by slashes.
parsed_dates = pd.to_datetime(data['Date Local'], format="%Y/%m/%d")
data['Date Local'] = parsed_dates

In [20]:
# Average the three pollutant readings for each (state, day) pair.
# The numeric columns are selected explicitly before calling mean():
# `data` also carries non-numeric columns (e.g. the '* Units' columns),
# and pandas >= 2.0 raises a TypeError when mean() is asked to aggregate
# those, whereas older versions silently dropped them.
pollutant_columns = ['NO2 Mean', 'SO2 Mean', 'CO Mean']
data_new = (
    data.groupby(['State', 'Date Local'])[pollutant_columns]
    .mean()
    .reset_index()  # turn the group keys back into ordinary columns
)
data_new.head()

Unnamed: 0,State,Date Local,NO2 Mean,SO2 Mean,CO Mean
0,Alabama,2013-12-01,17.208333,0.28539,0.262879
1,Alabama,2013-12-02,20.6875,0.531666,0.352812
2,Alabama,2013-12-03,14.9125,0.252632,0.237575
3,Alabama,2013-12-04,7.825,0.123052,0.115152
4,Alabama,2013-12-05,8.004762,-0.014285,0.117575


In [21]:
# Write the per-state daily averages to disk. The DataFrame index is
# written as the first (unnamed) CSV column because `index` is left at
# its default.
data_new.to_csv("../data/pollution_cleaned.csv")

In [6]:
# Record the measurement unit reported for each pollutant, read from the
# first remaining row of `data`. `.iloc[0]` is positional, so this still
# works when the row labelled 0 was removed by the earlier dropna(); a
# plain `[0]` is a label lookup and raises KeyError in that case.
# NOTE(review): presumably each pollutant uses a single unit throughout
# the file - confirm before relying on this for every row.
units = {
    pollutant: data[pollutant + ' Units'].iloc[0]
    for pollutant in ('NO2', 'SO2', 'CO')
}
units

{'CO': 'Parts per million',
 'NO2': 'Parts per billion',
 'SO2': 'Parts per billion'}

In [25]:
# Collect the distinct values of the 'State' column. Despite the variable
# name, these are states, not cities.
cities_set = set(data_new['State'])
cities_set

{'Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Country Of Mexico',
 'Delaware',
 'District Of Columbia',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Missouri',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Virginia',
 'Washington',
 'Wisconsin',
 'Wyoming'}

In [40]:
# ----------------------------------------------------------------------
# Clean CSV for CDI data (cdi_original.csv)
# ----------------------------------------------------------------------
# Parse the file in a single pass (low_memory=False) instead of in
# chunks. NOTE: this rebinds `data`, which the cells above used for the
# pollution DataFrame.
data = pd.read_csv(
    "../data/cdi_original.csv",
    low_memory=False,
)
data.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2016,2016,US,United States,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,59,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,
1,2016,2016,AL,Alabama,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,1,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,
2,2016,2016,AK,Alaska,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,2,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,
3,2016,2016,AZ,Arizona,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,4,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,
4,2016,2016,AR,Arkansas,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,5,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,


In [41]:
# List every distinct value that appears in the 'Question' column.
questions = set(data['Question'])
questions

{'Activity limitation due to arthritis among adults aged >= 18 years who have doctor-diagnosed arthritis',
 'Adults aged >= 18 years with arthritis who have taken a class to learn how to manage arthritis symptoms',
 'Adults with diagnosed diabetes aged >= 18 years who have taken a diabetes self-management course',
 'Alcohol use among youth',
 'Alcohol use before pregnancy',
 'All teeth lost among adults aged >= 65 years',
 'Amount of alcohol excise tax by beverage type (beer)',
 'Amount of alcohol excise tax by beverage type (distilled spirits)',
 'Amount of alcohol excise tax by beverage type (wine)',
 'Amount of tobacco product excise tax',
 'Amputation of a lower extremity attributable to diabetes',
 'Arthritis among adults aged >= 18 years',
 'Arthritis among adults aged >= 18 years who are obese',
 'Arthritis among adults aged >= 18 years who have diabetes',
 'Arthritis among adults aged >= 18 years who have heart disease',
 'Asthma mortality rate',
 'Asthma prevalence among women

In [54]:
# Build one boolean mask per filter: the adult current-asthma prevalence
# question, rows whose StratificationCategory1 is 'Overall', and every
# location except the nationwide 'United States' rows.
is_asthma_question = data['Question'] == 'Current asthma prevalence among adults aged >= 18 years'
is_overall = data['StratificationCategory1'] == 'Overall'
is_not_national = data['LocationDesc'] != 'United States'

# Renumber the selected rows *before* nulls are dropped below, so the
# index written to the CSV can have gaps wherever a row with a missing
# value is removed.
asthma_rows = data[is_asthma_question & is_overall & is_not_national].reset_index()

# Keep only the needed columns, drop rows with any null, and give the
# columns their final names.
asthma_df = (
    asthma_rows[['YearStart', 'LocationDesc', 'DataValue']]
    .dropna(how='any')
    .rename(columns={'YearStart': 'Year',
                     'LocationDesc': 'State',
                     'DataValue': 'Percent Asthma'})
)

# Export to CSV (the index is written as the first, unnamed column).
asthma_df.to_csv("../data/asthma_byState.csv")