In [1]:
# Import dependencies
import pandas as pd
from path import Path

In [2]:
# Load the cleaned source datasets
# Population density data: record the file location, then read it into a dataframe
pop_file_path = Path("./Resources/Flu_data/cleaned_population_density.csv")
pop_df = pd.read_csv(pop_file_path)

# Flu case data: record the file location, then read it into a dataframe
flu_file_path = Path("./Resources/Flu_data/cleaned_flu.csv")
flu_df = pd.read_csv(flu_file_path)

In [3]:
# Preview the first rows of the population dataframe to inspect its columns
pop_df.head()

Unnamed: 0,State,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 11,State.1,Land Area (sq mi),Unnamed: 14,2016 Population Density (persons/square mile),2017 Population Density (persons/square mile),2018 Population Density (persons/square mile),2019 Population Density (persons/square mile)
0,.Alabama,4785437.0,4799069.0,4815588.0,4830081.0,4841799.0,4852347.0,4863525.0,4874486.0,4887681.0,4903185.0,,Alabama,50645.39,,96.03,96.25,96.51,96.81
1,.Alaska,713910.0,722128.0,730443.0,737068.0,736283.0,737498.0,741456.0,739700.0,735139.0,731545.0,,Alaska,570640.61,,1.3,1.3,1.29,1.28
2,.Arizona,6407172.0,6472643.0,6554978.0,6632764.0,6730413.0,6829676.0,6941072.0,7044008.0,7158024.0,7278717.0,,Arizona,113593.91,,61.1,62.01,63.01,64.08
3,.Arkansas,2921964.0,2940667.0,2952164.0,2959400.0,2967392.0,2978048.0,2989918.0,3001345.0,3009733.0,3017804.0,,Arkansas,52035.35,,57.46,57.68,57.84,58.0
4,.California,37319502.0,37638369.0,37948800.0,38260787.0,38596972.0,38918045.0,39167117.0,39358497.0,39461588.0,39512223.0,,California,155779.03,,251.43,252.66,253.32,253.64


In [4]:
# Preview the first rows of the flu dataframe to inspect its columns
flu_df.head()

Unnamed: 0,state,2016,2017,2018,2019
0,Alabama,11546,26878,63999,68724
1,Alaska,1063,3306,8352,8249
2,Arizona,22366,22543,24574,20504
3,Arkansas,2883,5249,5831,5939
4,California,44159,46097,52153,52210


In [5]:
# Clean population dataframe
# Keep only the state name and the 2016-2019 population estimates.
#
# Columns are picked by name instead of by position, so the result does not
# depend on the column order of the CSV. The rename runs first on purpose:
# rename() ignores labels that are not present, so re-running this cell on the
# already-cleaned frame yields the same five columns instead of silently
# dropping the population columns (which the positional drops did on re-run).
pop_df = pop_df.rename(columns={"2016": "population_2016",
                                "2017": "population_2017",
                                "2018": "population_2018",
                                "2019": "population_2019"})

pop_df = pop_df[["State",
                 "population_2016",
                 "population_2017",
                 "population_2018",
                 "population_2019"]]

pop_df.head()

In [6]:
# Clean flu dataframe
# Remove the rows for the jurisdictions that are excluded from the analysis
excluded_jurisdictions = ["New York City", "Virgin Islands", "Puerto Rico"]
is_excluded = flu_df["state"].isin(excluded_jurisdictions)
flu_df = flu_df.loc[~is_excluded]

# Label the yearly columns as case counts and renumber the rows from zero
case_column_names = {
    "2016": "cases_2016",
    "2017": "cases_2017",
    "2018": "cases_2018",
    "2019": "cases_2019",
}
flu_df = flu_df.rename(columns=case_column_names).reset_index(drop=True)

# Save the cleaned flu data (the row index is written as the first column)
flu_df.to_csv("./Resources/Flu_data/flu_cleaned.csv", index=True)

In [7]:
# Combine the population and flu dataframes
# The two tables are joined on the state name rather than concatenated side by
# side: a positional concat pairs rows purely by index, so any difference in
# row order or row set between the two files would silently attach one state's
# population to another state's case counts.
#
# The population table prefixes its state names with "." (e.g. ".Alabama"),
# so the prefix is stripped to build a key matching the flu table's "state".
pop_by_state = (
    pop_df
    .assign(state=pop_df["State"].str.lstrip(".").str.strip())
    .drop(columns="State")
    .dropna(subset=["state"])
)

# Left join keeps every flu row in its existing order; validate raises a
# MergeError if a state name appears more than once on either side.
pop_flu_df = flu_df.merge(pop_by_state, on="state", how="left", validate="one_to_one")

# Keep only the columns needed for the percentage calculation; copy() gives an
# independent frame so later column assignments do not warn about writing to a slice.
pop_flu_df = pop_flu_df[["state", "population_2016", "population_2017", "population_2018", "population_2019",
                         "cases_2016", "cases_2017", "cases_2018", "cases_2019"]].copy()

pop_flu_df.head()

In [8]:
# Create new columns with flu cases as a percentage of each state's population
# Each assignment is vectorised over every row, so it only needs to run once
# per year. Iterating over the dataframe itself yields column labels (not
# rows), which merely repeated the same four assignments on every pass.
for year in ("2016", "2017", "2018", "2019"):
    pop_flu_df[year + "_cases_percent"] = (
        pop_flu_df["cases_" + year] / pop_flu_df["population_" + year]
    ) * 100

pop_flu_df.head()

Unnamed: 0,state,population_2016,population_2017,population_2018,population_2019,cases_2016,cases_2017,cases_2018,cases_2019,2016_cases_percent,2017_cases_percent,2018_cases_percent,2019_cases_percent
0,Alabama,4863525.0,4874486.0,4887681.0,4903185.0,11546,26878,63999,68724,0.2374,0.551402,1.309394,1.40162
1,Alaska,741456.0,739700.0,735139.0,731545.0,1063,3306,8352,8249,0.143367,0.446938,1.136112,1.127613
2,Arizona,6941072.0,7044008.0,7158024.0,7278717.0,22366,22543,24574,20504,0.322227,0.320031,0.343307,0.281698
3,Arkansas,2989918.0,3001345.0,3009733.0,3017804.0,2883,5249,5831,5939,0.096424,0.174888,0.193738,0.196799
4,California,39167117.0,39358497.0,39461588.0,39512223.0,44159,46097,52153,52210,0.112745,0.117121,0.132161,0.132136


In [9]:
# Reduce the dataframe to the state name and the percentage columns
# The raw population and case counts were only needed to derive the percentages
population_cols = ["population_2016", "population_2017", "population_2018", "population_2019"]
case_cols = ["cases_2016", "cases_2017", "cases_2018", "cases_2019"]
cols_to_drop = population_cols + case_cols

pop_flu_df = pop_flu_df.drop(columns=cols_to_drop)

In [10]:
# Check the data type of each remaining column
pop_flu_df.dtypes

state                  object
2016_cases_percent    float64
2017_cases_percent    float64
2018_cases_percent    float64
2019_cases_percent    float64
dtype: object

In [11]:
# Export flu cases-by-percent data
# (the export is commented out; uncomment the next line to write the CSV)
# pop_flu_df.to_csv("./Resources/Flu_data/flu_percentages.csv", index=False)