In [2]:
# ----------------------------------------------------------------------
# Clean CSV for pollution data
# ----------------------------------------------------------------------

import pandas as pd
import datetime as dt
import numpy as np

In [2]:
# read in CSV
data = pd.read_csv("../data/pollution_original.csv")

# reduce to columns I care about, reset index, and delete old index
data = data[['Date Local', 'State', 'City', 
             'NO2 Mean', 'NO2 Units', 
             'SO2 Mean', 'SO2 Units', 
             'CO Mean', 'CO Units']]

data.rename(columns={'Date Local':'Date'}, inplace=True)

# drop any empty values
data = data.dropna(axis=0,how='any')
print('Done')

Done


In [3]:
data['Date'] = pd.to_datetime(data['Date'], format="%Y/%m/%d")
data_new = data.groupby([data.State, data.Date.dt.year]).mean().reset_index()
data_new.head()

Unnamed: 0,State,Date,NO2 Mean,SO2 Mean,CO Mean
0,Alabama,2013,12.065635,0.750311,0.211007
1,Alabama,2014,9.411746,1.186517,0.205921
2,Alabama,2015,9.07858,1.009429,0.221027
3,Alabama,2016,9.524873,0.835118,0.209514
4,Alaska,2014,11.65851,5.951531,0.455444


In [4]:
data_new.to_csv(path_or_buf="../data/pollution_cleaned.csv")
data_new.to_json(path_or_buf="../data/pollution_cleaned.json", orient='records')

In [5]:
units = {'NO2':data['NO2 Units'][0],
         'SO2':data['SO2 Units'][0],
         'CO':data['CO Units'][0]}
units

{'CO': 'Parts per million',
 'NO2': 'Parts per billion',
 'SO2': 'Parts per billion'}

In [6]:
cities_set = set(data_new['State'].values.flatten()) 
cities_set

{'Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Country Of Mexico',
 'Delaware',
 'District Of Columbia',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Missouri',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Virginia',
 'Washington',
 'Wisconsin',
 'Wyoming'}

In [7]:
# ----------------------------------------------------------------------
# Clean CSV for different health data
# ----------------------------------------------------------------------
# read in CSV
data = pd.read_csv("../data/cdi_original.csv", low_memory=False)
data.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2016,2016,US,United States,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,59,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,
1,2016,2016,AL,Alabama,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,1,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,
2,2016,2016,AK,Alaska,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,2,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,
3,2016,2016,AZ,Arizona,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,4,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,
4,2016,2016,AR,Arkansas,BRFSS,Alcohol,Binge drinking prevalence among adults aged >=...,,%,Crude Prevalence,...,5,ALC,ALC2_2,CRDPREV,OVERALL,OVR,,,,


In [8]:
# view all questions
questions = set(data['Question'].values.flatten()) 
questions

{'Activity limitation due to arthritis among adults aged >= 18 years who have doctor-diagnosed arthritis',
 'Adults aged >= 18 years with arthritis who have taken a class to learn how to manage arthritis symptoms',
 'Adults with diagnosed diabetes aged >= 18 years who have taken a diabetes self-management course',
 'Alcohol use among youth',
 'Alcohol use before pregnancy',
 'All teeth lost among adults aged >= 65 years',
 'Amount of alcohol excise tax by beverage type (beer)',
 'Amount of alcohol excise tax by beverage type (distilled spirits)',
 'Amount of alcohol excise tax by beverage type (wine)',
 'Amount of tobacco product excise tax',
 'Amputation of a lower extremity attributable to diabetes',
 'Arthritis among adults aged >= 18 years',
 'Arthritis among adults aged >= 18 years who are obese',
 'Arthritis among adults aged >= 18 years who have diabetes',
 'Arthritis among adults aged >= 18 years who have heart disease',
 'Asthma mortality rate',
 'Asthma prevalence among women

In [9]:
# filter dataframe to just overall asthma prevalance
asthma_df = data[data['Question'] ==  'Current asthma prevalence among adults aged >= 18 years']
asthma_df = asthma_df[asthma_df['StratificationCategory1'] == 'Overall']

# remove data points for US as a whole and reset index
asthma_df = asthma_df[asthma_df['LocationDesc'] != 'United States'].reset_index()

# reduce to necessary columns and drop null values
asthma_df = asthma_df[['YearStart', 'LocationDesc', 
                       'DataValue']].dropna(how='any')

#rename columns in place
asthma_df.rename(columns={'YearStart':'Year', 
                          'LocationDesc':'State', 
                          'DataValue':'Percent Asthma'}, 
                 inplace=True)

# export to CSV
asthma_df.to_csv(path_or_buf="../data/asthma_byState.csv")
asthma_df.to_json(path_or_buf="../data/asthma_byState.json", orient='records')

In [10]:
# create column to join on
asthma_df['Year2'] = asthma_df['Year'].apply(str)
asthma_df['key'] = asthma_df['Year2'] + asthma_df['State']
asthma_df.head()

Unnamed: 0,Year,State,Percent Asthma,Year2,key
0,2016,Alabama,9.7,2016,2016Alabama
1,2016,Alaska,8.8,2016,2016Alaska
2,2016,Arizona,9.4,2016,2016Arizona
3,2016,Arkansas,8.5,2016,2016Arkansas
4,2016,California,7.8,2016,2016California


In [11]:
data_new['Date2'] = data_new['Date'].apply(str)
data_new['key'] = data_new['Date2'] + data_new['State']
data_new.head()

Unnamed: 0,State,Date,NO2 Mean,SO2 Mean,CO Mean,Date2,key
0,Alabama,2013,12.065635,0.750311,0.211007,2013,2013Alabama
1,Alabama,2014,9.411746,1.186517,0.205921,2014,2014Alabama
2,Alabama,2015,9.07858,1.009429,0.221027,2015,2015Alabama
3,Alabama,2016,9.524873,0.835118,0.209514,2016,2016Alabama
4,Alaska,2014,11.65851,5.951531,0.455444,2014,2014Alaska


In [12]:
# create single dataframe
final_data = pd.merge(data_new, asthma_df, how='inner', on='key')

In [13]:
# remove unnecessary columns
final_data = final_data[['State_x', 'Year', 'Percent Asthma', 'NO2 Mean', 'SO2 Mean', 'CO Mean']]
final_data.rename(columns={'State_x':'State'}, inplace=True)
final_data.head()

Unnamed: 0,State,Year,Percent Asthma,NO2 Mean,SO2 Mean,CO Mean
0,Alabama,2013,8.5,12.065635,0.750311,0.211007
1,Alabama,2013,8.5,12.065635,0.750311,0.211007
2,Alabama,2014,9.5,9.411746,1.186517,0.205921
3,Alabama,2014,9.5,9.411746,1.186517,0.205921
4,Alabama,2015,9.9,9.07858,1.009429,0.221027


In [14]:
#export to json
final_data.to_csv(path_or_buf="../data/asthmaPollution_byState.csv")
final_data.to_json(path_or_buf="../data/asthmaPollution_byState.json", orient='records')

In [5]:
# ----------------------------------------------------------------------
# Clean CSV for different health data
# ----------------------------------------------------------------------
complete_data = pd.DataFrame()

for i in np.arange(39):
    new_data = pd.read_csv(f"../data/annual_aqi_by_county_{1980 + i}.csv").dropna()
    complete_data = pd.concat([complete_data, new_data], ignore_index=True)

In [None]:
complete_data.to_json(path_or_buf="../data/asthmaPollution_byState.json", orient='records')