In [59]:
import pandas as pd
import numpy as np

## National CDC Dataframe creation

In [50]:
#read the national CDC csv file into the working dataframe
df_national = pd.read_csv(filepath_or_buffer="data/National_Custom_Data.csv")

In [51]:
#CDC puts the text 'Insufficient Data' in place of a value in some rows;
#keep only the rows whose 'TOTAL DEATHS' entry is not that placeholder
has_total_deaths = df_national['TOTAL DEATHS'].ne('Insufficient Data')
df_national = df_national.loc[has_total_deaths]

In [52]:
#CDC data has commas in numbers: delete every ',' found inside string values
#(the pattern is applied as a regex across all columns of the dataframe)
df_national = df_national.replace(regex=r',', value='')


In [53]:
#rename columns: capitalize (first letter upper, rest lower) and
#turn both spaces and '&' into underscores, e.g. 'TOTAL DEATHS' -> 'Total_deaths'
new_column_names = [
    original_name.capitalize().replace(' ','_').replace('&','_')
    for original_name in df_national.columns
]

df_national.columns = new_column_names

In [54]:
#convert appropriate columns to ints and floats

#death counts become integers
count_columns = ['Num_influenza_deaths','Num_pneumonia_deaths','Total_deaths']
df_national[count_columns] = df_national[count_columns].astype(int)

#these columns are divided by 100 (presumably percent -> fraction; confirm against the CDC file)
for percent_column in ('Percent_p_i', 'Threshold', 'Baseline'):
    df_national[percent_column] = df_national[percent_column]/100

#'Percent_complete' is text: map '> 100%' to '100%', strip the trailing '%',
#then convert to float and divide by 100
percent_complete_text = df_national['Percent_complete'].str.replace('> 100%', '100%', regex=False)
df_national['Percent_complete'] = percent_complete_text.str.rstrip('%').astype('float') / 100.0

In [55]:
#add Calendar_year column
#'Season' starts with a four-digit year; weeks 40 and later are assigned that year,
#weeks before 40 are assigned the following year
season_start_year = df_national['Season'].map(lambda season: int(season[:4]))
df_national['First_year'] = season_start_year
df_national['Second_year'] = season_start_year + 1

week_number = df_national['Week']
df_national['Calendar_year'] = np.select(
    [week_number < 40, week_number >= 40],
    [df_national['Second_year'], df_national['First_year']],
)
df_national['Calendar_year'] = df_national['Calendar_year'].astype(int)

In [56]:
#drop unused columns: the two helper year columns plus 'Sub_area' and 'Age_group'
unused_columns = ['First_year', 'Second_year', 'Sub_area', 'Age_group']
df_national = df_national.drop(columns=unused_columns)

                 

In [57]:
#sort rows by calendar year, then week, and renumber the index from 0;
#drop=True discards the old index directly instead of inserting it as an
#'index' column that then has to be dropped again
df_national = df_national.sort_values(['Calendar_year','Week']).reset_index(drop=True)


In [58]:
#save dataframe as csv file to be used in other files
#(index=True is the pandas default: the row index is written as the first, unnamed column)
df_national.to_csv('data/national_cdc_pandas_df.csv', index=True)