In [1]:
import pandas as pd
import numpy as np

## National CDC Dataframe creation

In [2]:
# Load the raw national CDC export; no dtype overrides are given, so pandas infers each column's type.
df_national = pd.read_csv(filepath_or_buffer="data/National_Custom_Data.csv")

In [3]:
# The CDC export uses the literal string 'Insufficient Data' as a placeholder value.
# Keep only the rows whose 'TOTAL DEATHS' entry is anything other than that placeholder.
df_national = df_national.loc[df_national['TOTAL DEATHS'].ne('Insufficient Data')]

In [4]:
# Numbers in the CDC export are written with commas. Delete every comma from every
# string cell in the frame (regex substitution; non-string cells are left untouched).
df_national = df_national.replace(regex=',', value='')


In [5]:
# Normalise the column names: upper-case the first character and lower-case the rest,
# then turn spaces and ampersands into underscores (e.g. 'TOTAL DEATHS' -> 'Total_deaths').
df_national.columns = [
    header.capitalize().replace(' ', '_').replace('&', '_')
    for header in df_national.columns
]

In [6]:
# --- Type conversion -------------------------------------------------------
# Death counts become integers.
death_count_cols = ['Num_influenza_deaths', 'Num_pneumonia_deaths', 'Total_deaths']
df_national[death_count_cols] = df_national[death_count_cols].astype(int)

# These three columns are divided by 100 (percentage value -> fraction).
for pct_col in ('Percent_p_i', 'Threshold', 'Baseline'):
    df_national[pct_col] = df_national[pct_col].div(100)

# Percent_complete is text ending in '%'; the special value '> 100%' is treated as '100%'
# before the percent sign is stripped and the number is scaled to a fraction.
df_national['Percent_complete'] = (
    df_national['Percent_complete']
    .str.replace('> 100%', '100%', regex=False)
    .str.rstrip('%')
    .astype('float')
    / 100.0
)

In [7]:
# Add Calendar_year.
# The first four characters of Season are parsed as the season's starting year.
# Weeks 40 and above are assigned to that year; weeks below 40 to the year after it.
df_national['First_year'] = df_national['Season'].map(lambda season: int(season[:4]))
df_national['Second_year'] = df_national['First_year'] + 1

week_in_second_year = df_national['Week'] < 40
week_in_first_year = df_national['Week'] >= 40

# np.select picks, per row, the choice paired with the first condition that holds
# (and falls back to 0 for a row where neither holds).
df_national['Calendar_year'] = np.select(
    [week_in_second_year, week_in_first_year],
    [df_national['Second_year'], df_national['First_year']],
)
df_national['Calendar_year'] = df_national['Calendar_year'].astype(int)

In [8]:
# Remove the two helper year columns together with Sub_area and Age_group, in one call.
df_national = df_national.drop(
    columns=['First_year', 'Second_year', 'Sub_area', 'Age_group']
)

                 

In [9]:
# Order rows by Calendar_year, then Week, and renumber the index from 0.
# drop=True discards the old index instead of keeping it as an 'index' column.
df_national = (
    df_national
    .sort_values(by=['Calendar_year', 'Week'])
    .reset_index(drop=True)
)


In [10]:
# Load the population table; the first column of the CSV becomes the row index.
df_population = pd.read_csv(filepath_or_buffer="data/population_df.csv", index_col=0)

In [11]:
# Attach the population columns to the national data with a left join, so every national
# row is kept. Rows match when Area equals Name and the Calendar_year values agree.
df_national_official = df_national.merge(
    df_population,
    how='left',
    left_on=['Area', 'Calendar_year'],
    right_on=['Name', 'Calendar_year'],
)



In [12]:
# 'Name' was only the right-hand join key of the merge, so remove it.
df_national_official = df_national_official.drop(columns='Name')

# Total deaths per 100,000 people: Total_deaths divided by (Population / 100000).
df_national_official['Deaths_per_hund_thou'] = df_national_official['Total_deaths'].div(
    df_national_official['Population'].div(100000)
)

In [14]:
# Pneumonia deaths as a fraction of total deaths.
df_national_official['Percent_p'] = df_national_official['Num_pneumonia_deaths'].div(
    df_national_official['Total_deaths']
)

In [16]:
# Persist the finished frame as CSV for use in other files.
# The row index is written too (pandas default), as the first column of the output.
df_national_official.to_csv(path_or_buf='data/national_cdc_pandas_df.csv')