In [1]:
# Import dependencies
import pandas as pd
from path import Path

In [2]:
# Read the ILINet CSV from the Flu_data resources folder into a DataFrame.
file_path = Path("./Resources/Flu_data/ILINet.csv")
flu_cases_df = pd.read_csv(file_path)

# Display the frame with a fresh positional index. The result is not assigned,
# so flu_cases_df itself is left unchanged.
flu_cases_df.reset_index(drop=True)

Unnamed: 0,REGION,YEAR,WEEK,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,AGE 25-49,AGE 25-64,AGE 5-24,AGE 50-64,AGE 65,ILITOTAL,NUM. OF PROVIDERS,TOTAL PATIENTS
0,Alabama,2010,40,X,2.13477,X,X,X,X,X,X,249,35,11664
1,Alaska,2010,40,X,0.875146,X,X,X,X,X,X,15,7,1714
2,Arizona,2010,40,X,0.674721,X,X,X,X,X,X,172,49,25492
3,Arkansas,2010,40,X,0.696056,X,X,X,X,X,X,18,15,2586
4,California,2010,40,X,1.95412,X,X,X,X,X,X,632,112,32342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28963,Wyoming,2021,4,X,0.820707,X,X,X,X,X,X,13,5,1584
28964,Commonwealth of the Northern Mariana Islands,2021,4,X,X,X,X,X,X,X,X,X,X,X
28965,Puerto Rico,2021,4,X,1.6129,X,X,X,X,X,X,12,3,744
28966,Virgin Islands,2021,4,X,0.113895,X,X,X,X,X,X,1,3,878


In [3]:
# Drop unnecessary columns so that only REGION, YEAR and ILITOTAL remain.
drop_columns_list = ["WEEK", "% WEIGHTED ILI", "%UNWEIGHTED ILI", "AGE 0-4", "AGE 25-49", "AGE 5-24",
                     "AGE 50-64", "AGE 25-64", "AGE 65", "NUM. OF PROVIDERS", "TOTAL PATIENTS"]

# drop() already returns a new frame, so no extra copy is needed before
# resetting the index.
flu_cases_filtered_df = flu_cases_df.drop(columns=drop_columns_list).reset_index(drop=True)

print(flu_cases_filtered_df.shape)

# Drop the rows of regions that do not match the other datasets.
# A single isin() filter replaces the repeated in-place drops; the surviving
# rows keep their original index labels, exactly as before.
excluded_regions = [
    "New York City",
    "Commonwealth of the Northern Mariana Islands",
    "Puerto Rico",
    "Virgin Islands",
]
is_excluded = flu_cases_filtered_df["REGION"].isin(excluded_regions)
flu_cases_filtered_df = flu_cases_filtered_df[~is_excluded].copy()

print(flu_cases_filtered_df)

(28968, 3)
              REGION  YEAR ILITOTAL
0            Alabama  2010      249
1             Alaska  2010       15
2            Arizona  2010      172
3           Arkansas  2010       18
4         California  2010      632
...              ...   ...      ...
28959       Virginia  2021     1167
28960     Washington  2021       82
28961  West Virginia  2021       47
28962      Wisconsin  2021       76
28963        Wyoming  2021       13

[27489 rows x 3 columns]


In [4]:
# Keep only the rows whose YEAR is between 2016 and 2019 (both inclusive).
year_mask = flu_cases_filtered_df["YEAR"].between(2016, 2019)

# Index the result by REGION. set_index() returns a new frame, so the
# filtered slice is never mutated in place.
flu_year_filtered_df = flu_cases_filtered_df.loc[year_mask].set_index("REGION")

flu_year_filtered_df.tail()

Unnamed: 0_level_0,YEAR,ILITOTAL
REGION,Unnamed: 1_level_1,Unnamed: 2_level_1
Virginia,2019,9140
Washington,2019,1907
West Virginia,2019,401
Wisconsin,2019,949
Wyoming,2019,181


In [5]:
### Replace "X" placeholder entries with 0 and store ILITOTAL as integers
flu_nona_df = (
    flu_year_filtered_df
    .replace("X", 0)
    .astype({"ILITOTAL": int})
)

flu_nona_df.head()

Unnamed: 0_level_0,YEAR,ILITOTAL
REGION,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,2016,293
Alaska,2016,5
Arizona,2016,644
Arkansas,2016,57
California,2016,1077


In [6]:
# Group by Region & Year and total the ILITOTAL counts of each group.
flu_cases_by_year_df = flu_nona_df.groupby(["REGION", "YEAR"]).agg({"ILITOTAL": ['sum']})

# Reshape to one row per state and one column per year.
# Selecting the summed column and unstacking YEAR gives the same table as the
# former stack -> reset_index -> drop("level_2") -> rename -> pivot sequence,
# without depending on the auto-generated "level_2" column name.
flu_clean_df = (
    flu_cases_by_year_df[("ILITOTAL", "sum")]
    .unstack("YEAR")
    .rename_axis(index="state", columns="year")
)

flu_clean_df.head()

year,2016,2017,2018,2019
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,11546,26878,63999,68724
Alaska,1063,3306,8352,8249
Arizona,22366,22543,24574,20504
Arkansas,2883,5249,5831,5939
California,44159,46097,52153,52210


In [7]:
# Show the column labels of flu_clean_df (the pivoted year columns).
# NOTE(review): the earlier comment said "Check Florida value", but nothing here looks up Florida.
flu_clean_df.columns

Int64Index([2016, 2017, 2018, 2019], dtype='int64', name='year')

In [8]:
# Export file to csv
# NOTE(review): the export call below is commented out, so running this cell writes nothing.

# flu_clean_df.to_csv("./Resources/Flu_data/cleaned_flu.csv", index=True)