In [1]:
# Import dependencies
import pandas as pd
from path import Path

In [2]:
# Load the raw ILINet export into a DataFrame and preview its first rows.
# NOTE(review): the preview shows the region type ("States") in an unnamed
# leading column, so pandas appears to be using the first CSV field as the
# index -- confirm that this is intended.
file_path = Path("./Resources/Flu_data/ILINet.csv")
flu_cases_df = pd.read_csv(file_path)
flu_cases_df.head()

Unnamed: 0,REGION,YEAR,WEEK,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,AGE 25-49,AGE 25-64,AGE 5-24,AGE 50-64,AGE 65,ILITOTAL,NUM. OF PROVIDERS,TOTAL PATIENTS
States,Alabama,2010,40,X,2.13477,X,X,X,X,X,X,249,35,11664
States,Alaska,2010,40,X,0.875146,X,X,X,X,X,X,15,7,1714
States,Arizona,2010,40,X,0.674721,X,X,X,X,X,X,172,49,25492
States,Arkansas,2010,40,X,0.696056,X,X,X,X,X,X,18,15,2586
States,California,2010,40,X,1.95412,X,X,X,X,X,X,632,112,32342


In [3]:
# Drop unnecessary columns (per the preview, REGION, YEAR and ILITOTAL remain)
drop_columns_list = [
    "WEEK",
    "% WEIGHTED ILI",
    "%UNWEIGHTED ILI",
    "AGE 0-4",
    "AGE 25-49",
    "AGE 5-24",
    "AGE 50-64",
    "AGE 25-64",
    "AGE 65",
    "NUM. OF PROVIDERS",
    "TOTAL PATIENTS",
]

# DataFrame.drop already returns a new frame, so flu_cases_df is left untouched
# and no extra .copy() is needed.
flu_cases_filtered_df = flu_cases_df.drop(columns=drop_columns_list)

# Sanity check on the size of the reduced frame
print(flu_cases_filtered_df.shape)

flu_cases_filtered_df.head()

(28968, 3)


Unnamed: 0,REGION,YEAR,ILITOTAL
States,Alabama,2010,249
States,Alaska,2010,15
States,Arizona,2010,172
States,Arkansas,2010,18
States,California,2010,632


In [4]:
# Keep only the rows for 2016-2019 (both bounds inclusive) and index them by REGION.
# Built as a single chained expression so nothing is mutated in place.
flu_year_filtered_df = (
    flu_cases_filtered_df
    .loc[flu_cases_filtered_df["YEAR"].between(2016, 2019)]
    .set_index("REGION")
)

flu_year_filtered_df.tail()

Unnamed: 0_level_0,YEAR,ILITOTAL
REGION,Unnamed: 1_level_1,Unnamed: 2_level_1
Wyoming,2019,181
Commonwealth of the Northern Mariana Islands,2019,X
Puerto Rico,2019,415
Virgin Islands,2019,9
New York City,2019,6853


In [5]:
### Replace the "X" placeholders with 0 and store ILITOTAL as integers
# NOTE(review): "X" shows up in the previews where a count is not available;
# replacing it with 0 means those rows add nothing to the totals summed later.
# Confirm that treating them as zero cases is the intended handling.
flu_nona_df = (
    flu_year_filtered_df
    .replace("X", 0)
    .astype({"ILITOTAL": int})  # the column holds a mix of strings and the 0 just inserted
)

flu_nona_df.head()

Unnamed: 0_level_0,YEAR,ILITOTAL
REGION,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,2016,293
Alaska,2016,5
Arizona,2016,644
Arkansas,2016,57
California,2016,1077


In [6]:
# Group by Region & Year: total ILI cases for every (REGION, YEAR) pair
flu_cases_by_year_df = flu_nona_df.groupby(["REGION", "YEAR"]).agg({"ILITOTAL": ['sum']})

# Reshape to one row per state and one column per year. Unstacking the summed
# column directly avoids the stack/reset_index/pivot round trip and its
# dependence on the auto-generated "level_2" column name.
flu_clean_df = (
    flu_cases_by_year_df[("ILITOTAL", "sum")]
    .unstack("YEAR")  # state/year pairs with no rows become NaN
    .rename_axis(index="state", columns="year")
)

flu_clean_df.head()

year,2016,2017,2018,2019
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,11546.0,26878.0,63999.0,68724.0
Alaska,1063.0,3306.0,8352.0,8249.0
Arizona,22366.0,22543.0,24574.0,20504.0
Arkansas,2883.0,5249.0,5831.0,5939.0
California,44159.0,46097.0,52153.0,52210.0


In [7]:
# Export the cleaned table to CSV; index=True writes the row index
# (the "state" labels) as the first column of the file.
cleaned_flu_path = "./Resources/Flu_data/cleaned_flu.csv"
flu_clean_df.to_csv(cleaned_flu_path, index=True)