In [2]:
import pandas as pd

# Load the CSV file
file_path = 'C:/Users/jeeva thangamani/Downloads/Provisional_COVID-19_death_counts_and_rates_by_month__jurisdiction_of_residence__and_demographic_characteristics_20240604.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the data
data.head()


Unnamed: 0,data_as_of,jurisdiction_residence,year,month,group,subgroup1,subgroup2,COVID_deaths,crude_COVID_rate,aa_COVID_rate,crude_COVID_rate_ann,aa_COVID_rate_ann,footnote
0,05/30/2024 12:00:00 AM,United States,2020,1,Sex,Female,,3.0,,,,,Rates for death counts <20 are unreliable.
1,05/30/2024 12:00:00 AM,United States,2020,1,Sex,Male,,3.0,,,,,Rates for death counts <20 are unreliable.
2,05/30/2024 12:00:00 AM,United States,2020,1,Age,0-4 years,,0.0,0.0,,0.0,,
3,05/30/2024 12:00:00 AM,United States,2020,1,Age,12-17 years,,0.0,0.0,,0.0,,
4,05/30/2024 12:00:00 AM,United States,2020,1,Age,18-29 years,,0.0,0.0,,0.0,,


In [5]:
# Descriptive statistics for COVID deaths
covid_deaths_desc = data['COVID_deaths'].describe()

# Total COVID deaths by year and jurisdiction
total_deaths_by_year_jurisdiction = data.groupby(['year', 'jurisdiction_residence'])['COVID_deaths'].sum().reset_index()

# Monthly COVID deaths for the United States
monthly_deaths_us = data[data['jurisdiction_residence'] == 'United States'].groupby(['year', 'month'])['COVID_deaths'].sum().reset_index()

# COVID deaths by age group
deaths_by_age_group = data[data['group'] == 'Age'].groupby(['year', 'subgroup1'])['COVID_deaths'].sum().reset_index()

# Display the results
print("Descriptive Statistics for COVID Deaths:")
print(covid_deaths_desc)

print("\nTotal COVID Deaths by Year and Jurisdiction:")
print(total_deaths_by_year_jurisdiction)

print("\nMonthly COVID Deaths for the United States:")
print(monthly_deaths_us)

print("\nCOVID Deaths by Age Group:")
print(deaths_by_age_group)


Descriptive Statistics for COVID Deaths:
count    35273.000000
mean       272.718623
std       1742.689101
min          0.000000
25%          0.000000
50%          0.000000
75%         45.000000
max      67990.000000
Name: COVID_deaths, dtype: float64

Total COVID Deaths by Year and Jurisdiction:
    year jurisdiction_residence  COVID_deaths
0   2020               Region 1       79061.0
1   2020              Region 10       28584.0
2   2020               Region 2      232295.0
3   2020               Region 3      140627.0
4   2020               Region 4      283274.0
5   2020               Region 5      267676.0
6   2020               Region 6      211282.0
7   2020               Region 7       73640.0
8   2020               Region 8       44806.0
9   2020               Region 9      194967.0
10  2020          United States     1562764.0
11  2021               Region 1       50773.0
12  2021              Region 10       55635.0
13  2021               Region 2      138158.0
14  2021    

In [6]:
#Project 3: Python Code for Predictive Analysis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Preprocessing the data
data = data.dropna(subset=['COVID_deaths'])  # Dropping rows with missing target values

# Features and target variable
features = ['year', 'month']
X = data[features]
y = data['COVID_deaths']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and training the model
model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Displaying actual vs predicted values
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results.head())


Mean Squared Error: 2761142.0850272174
       Actual   Predicted
2737      0.0  449.800814
33338     0.0  152.543400
41513     0.0  121.236674
11888   427.0  355.353043
12960  1553.0  351.874517


In [10]:
#Fine tuning the model
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error



# Dropping rows with missing target values
data = data.dropna(subset=['COVID_deaths'])

# Features and target variable
features = ['year', 'month', 'group', 'subgroup1']
X = data[features]
y = data['COVID_deaths']

# Preprocessing: One-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['group', 'subgroup1'])
    ],
    remainder='passthrough'
)

# Define the model
model = RandomForestRegressor(random_state=42)

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Making predictions
y_pred = best_model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Displaying actual vs predicted values
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results.head())


Mean Squared Error: 2327274.7024567192
       Actual    Predicted
2737      0.0     2.412858
33338     0.0     7.816824
41513     0.0     4.485431
11888   427.0   138.500608
12960  1553.0  2028.903191
