In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import os

In [3]:
# Raw export of conditions contributing to COVID-19 deaths, by state and age
# group (provisional 2020-2022 figures, per the file name). The path is
# relative, so the CSV must sit in the notebook's working directory.
file_path = "Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2022.csv"

# index_col=False stops pandas from using the first column as the index, so
# the frame gets a default integer index.
covid_data_df = pd.read_csv(filepath_or_buffer=file_path, index_col=False)
covid_data_df.head(5)

Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths,Number of Mentions,Flag
0,10/16/2022,01/01/2020,10/15/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1430.0,1495.0,
1,10/16/2022,01/01/2020,10/15/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5647.0,5860.0,
2,10/16/2022,01/01/2020,10/15/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,14738.0,15345.0,
3,10/16/2022,01/01/2020,10/15/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,36674.0,38105.0,
4,10/16/2022,01/01/2020,10/15/2022,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,80438.0,83347.0,


In [4]:
# Discard the date, grouping, location, ICD-10 code, mention-count and flag
# columns. What remains is: Condition Group, Condition, Age Group and
# COVID-19 Deaths.
# NOTE(review): once 'Group' and 'State' are gone, rows can no longer be told
# apart by aggregation level or location - confirm this is intended.
covid_data_df = covid_data_df.drop(
    labels=[
        'Data As Of',
        'Start Date',
        'End Date',
        'Group',
        'Year',
        'Month',
        'State',
        'ICD10_codes',
        'Number of Mentions',
        'Flag',
    ],
    axis='columns',
)

In [5]:
# Remove every row that has a missing value in any of the remaining columns.
covid_data_df = covid_data_df.dropna(axis=0, how='any')

In [6]:
# Keep only rows whose age group is something other than 'All Ages'
# (presumably a roll-up across the individual age bands - verify).
covid_data_df = covid_data_df.loc[covid_data_df['Age Group'].ne('All Ages')]
covid_data_df

Unnamed: 0,Condition Group,Condition,Age Group,COVID-19 Deaths
0,Respiratory diseases,Influenza and pneumonia,0-24,1430.0
1,Respiratory diseases,Influenza and pneumonia,25-34,5647.0
2,Respiratory diseases,Influenza and pneumonia,35-44,14738.0
3,Respiratory diseases,Influenza and pneumonia,45-54,36674.0
4,Respiratory diseases,Influenza and pneumonia,55-64,80438.0
...,...,...,...,...
471921,COVID-19,COVID-19,Not stated,0.0
471922,COVID-19,COVID-19,Not stated,0.0
471923,COVID-19,COVID-19,Not stated,0.0
471924,COVID-19,COVID-19,Not stated,0.0


In [7]:
# Discard rows where the age group is recorded as 'Not stated'; they carry no
# usable age band.
covid_data_df = covid_data_df.loc[covid_data_df['Age Group'].ne('Not stated')]
covid_data_df

Unnamed: 0,Condition Group,Condition,Age Group,COVID-19 Deaths
0,Respiratory diseases,Influenza and pneumonia,0-24,1430.0
1,Respiratory diseases,Influenza and pneumonia,25-34,5647.0
2,Respiratory diseases,Influenza and pneumonia,35-44,14738.0
3,Respiratory diseases,Influenza and pneumonia,45-54,36674.0
4,Respiratory diseases,Influenza and pneumonia,55-64,80438.0
...,...,...,...,...
471887,COVID-19,COVID-19,85+,95.0
471888,COVID-19,COVID-19,85+,74.0
471889,COVID-19,COVID-19,85+,90.0
471890,COVID-19,COVID-19,85+,71.0


In [8]:
# Promote the age band to the row index. The resulting index is not unique:
# the same age band appears on many rows.
covid_data_df = covid_data_df.set_index(keys='Age Group')
covid_data_df.head(5)

Unnamed: 0_level_0,Condition Group,Condition,COVID-19 Deaths
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-24,Respiratory diseases,Influenza and pneumonia,1430.0
25-34,Respiratory diseases,Influenza and pneumonia,5647.0
35-44,Respiratory diseases,Influenza and pneumonia,14738.0
45-54,Respiratory diseases,Influenza and pneumonia,36674.0
55-64,Respiratory diseases,Influenza and pneumonia,80438.0


In [9]:
# Give the remaining columns identifier-friendly names.
# 'Age Group' is deliberately absent from the mapping: it is the index at this
# point (not a column), and DataFrame.rename(columns=...) silently ignores
# labels it cannot find, so listing it here never had any effect. The index
# therefore keeps the name 'Age Group' (and so does the saved CSV header).
# If an underscored index name is wanted, apply .rename_axis('Age_Group').
covid_data_df = covid_data_df.rename(
    columns={
        'Condition Group': 'Condition_Group',
        'COVID-19 Deaths': 'Deaths',
    }
)

In [10]:
# Show the first five rows to check the renamed columns.
covid_data_df.head(5)

Unnamed: 0_level_0,Condition_Group,Condition,Deaths
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-24,Respiratory diseases,Influenza and pneumonia,1430.0
25-34,Respiratory diseases,Influenza and pneumonia,5647.0
35-44,Respiratory diseases,Influenza and pneumonia,14738.0
45-54,Respiratory diseases,Influenza and pneumonia,36674.0
55-64,Respiratory diseases,Influenza and pneumonia,80438.0


In [11]:
# Store death counts as 64-bit integers instead of floats. The cast relies on
# the earlier dropna() having removed missing values, since NaN cannot be
# converted to int64.
covid_data_df = covid_data_df.astype(dtype={'Deaths': 'int64'})
covid_data_df.head(5)

Unnamed: 0_level_0,Condition_Group,Condition,Deaths
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-24,Respiratory diseases,Influenza and pneumonia,1430
25-34,Respiratory diseases,Influenza and pneumonia,5647
35-44,Respiratory diseases,Influenza and pneumonia,14738
45-54,Respiratory diseases,Influenza and pneumonia,36674
55-64,Respiratory diseases,Influenza and pneumonia,80438


In [13]:
# Bucket each row's death count into an ordered risk category.
# pd.cut uses right-closed intervals by default, so the edges below mean:
#   (-1, 0]      -> 'no risk'   (exactly zero deaths)
#   (0, 100]     -> 'low'
#   (100, 500]   -> 'medium'
#   (500, 1500]  -> 'high'
#   (1500, inf)  -> 'highest'
# The top bin is open-ended. A fixed upper edge (previously 280000) would
# leave Risk as NaN for any count above it, e.g. after a data refresh.
bins = [-1, 0, 100, 500, 1500, float('inf')]
labels = ['no risk', 'low', 'medium', 'high', 'highest']
covid_data_df['Risk'] = pd.cut(covid_data_df['Deaths'], bins=bins, labels=labels)
covid_data_df.head()

Unnamed: 0_level_0,Condition_Group,Condition,Deaths,Risk
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-24,Respiratory diseases,Influenza and pneumonia,1430,high
25-34,Respiratory diseases,Influenza and pneumonia,5647,highest
35-44,Respiratory diseases,Influenza and pneumonia,14738,highest
45-54,Respiratory diseases,Influenza and pneumonia,36674,highest
55-64,Respiratory diseases,Influenza and pneumonia,80438,highest


In [24]:
# Persist the cleaned frame next to the notebook. The index is written by
# default, so the age band becomes the first column of the CSV.
covid_data_df.to_csv(path_or_buf='clean_covid_data.csv')