In [10]:
import pandas as pd
import numpy as np

## State CDC DataFrame creation

In [55]:
# Load the raw state-level CSV export into a DataFrame.
state_data_path = "data/State_Custom_Data.csv"
df_states = pd.read_csv(state_data_path)

In [56]:
# The CDC export puts the text 'Insufficient Data' where a value is unavailable.
# Keep only the rows whose TOTAL DEATHS entry is not that placeholder.
has_total_deaths = df_states['TOTAL DEATHS'].ne('Insufficient Data')
df_states = df_states.loc[has_total_deaths]

In [57]:
# CDC writes numbers with comma separators; remove every comma so the values
# can be converted to numeric types later. The pattern is treated as a regex and
# is applied to string cells across the whole frame, not just numeric columns.
df_states = df_states.replace(regex=',', value='')


In [58]:
# Normalise the column headers so they contain no spaces: only the first letter
# stays upper-case, and both spaces and '&' become underscores
# (e.g. 'TOTAL DEATHS' -> 'Total_deaths').
new_column_names = [
    header.capitalize().replace(' ', '_').replace('&', '_')
    for header in df_states.columns
]
df_states.columns = new_column_names

In [59]:
# Convert columns to numeric types.

# Death counts become integers.
death_count_columns = ['Num_influenza_deaths', 'Num_pneumonia_deaths', 'Total_deaths']
df_states[death_count_columns] = df_states[death_count_columns].astype(int)

# Percent_p_i is rescaled from a percentage to a fraction.
df_states['Percent_p_i'] = df_states['Percent_p_i'].div(100)
# df_states['Threshold'] = df_states['Threshold']/100
# df_states['Baseline'] = df_states['Baseline']/100

# Percent_complete is text such as '90%'. The special value '> 100%' is mapped to
# '100%', the trailing '%' is stripped, and the result is stored as a fraction.
percent_complete_text = (
    df_states['Percent_complete']
    .str.replace('> 100%', '100%', regex=False)
    .str.rstrip('%')
)
df_states['Percent_complete'] = percent_complete_text.astype('float') / 100.0

In [60]:
# Add a Calendar_year column.
# The first four characters of Season are read as the season's starting year
# (e.g. '2012-13' -> 2012). Rows with Week >= 40 are assigned that year; rows
# with Week < 40 are assigned the following year.
df_states['First_year'] = df_states['Season'].map(lambda season: int(season[:4]))
df_states['Second_year'] = df_states['First_year'] + 1

week = df_states['Week']
conditions = [week < 40, week >= 40]
choices = [df_states['Second_year'], df_states['First_year']]
df_states['Calendar_year'] = np.select(conditions, choices).astype(int)

In [61]:
# Remove the columns that are no longer wanted: First_year, Second_year and
# Age_group. Sub_area is not dropped at this point.
df_states = df_states.drop(columns=['First_year', 'Second_year', 'Age_group'])
                 

In [62]:
# Order the rows by calendar year and then week, and renumber them 0..n-1.
# drop=True discards the old index directly instead of first inserting it as an
# 'index' column and deleting it afterwards; that round trip fails if the frame
# already has an 'index' column or if the index carries a name.
df_states = (
    df_states
    .sort_values(['Calendar_year', 'Week'])
    .reset_index(drop=True)
)


In [63]:
# Expose Sub_area under the name State. The values are copied into a State
# column first and the original Sub_area column is then removed.
df_states = (
    df_states
    .assign(State=df_states['Sub_area'])
    .drop(columns='Sub_area')
)


In [64]:
# Persist the cleaned frame as a CSV file so that other files can load it.
# NOTE: with to_csv's defaults the row index is written as an unnamed first column.
cleaned_states_path = 'data/states_cdc_pandas_df.csv'
df_states.to_csv(cleaned_states_path)

In [65]:
# Preview the first five rows of the cleaned frame.
df_states.head(n=5)

Unnamed: 0,Area,Season,Week,Percent_p_i,Num_influenza_deaths,Num_pneumonia_deaths,Total_deaths,Percent_complete,Calendar_year,State
0,State,2012-13,1,0.088,4,73,874,0.9,2013,Alabama
1,State,2012-13,1,0.082,0,5,61,0.75,2013,Alaska
2,State,2012-13,1,0.078,2,60,798,0.77,2013,Arizona
3,State,2012-13,1,0.119,3,55,488,0.814,2013,Arkansas
4,State,2012-13,1,0.1,5,386,3897,0.793,2013,California
