In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import time
from scipy.stats import linregress
import numpy as np
from scipy import stats

In [2]:
# Load the raw COVID-19 CSV into a DataFrame and preview the first rows
data_df = pd.read_csv(filepath_or_buffer="Resources/United_States_COVID-19.csv")
data_df.head(5)

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,03/11/2021,KS,297229,241035.0,56194.0,0,0.0,4851,,,0,0.0,03/12/2021 03:20:13 PM,Agree,
1,12/01/2021,ND,163565,135705.0,27860.0,589,220.0,1907,,,9,0.0,12/02/2021 02:35:20 PM,Agree,Not agree
2,01/02/2022,AS,11,,,0,0.0,0,,,0,0.0,01/03/2022 03:18:16 PM,,
3,11/22/2021,AL,841461,620483.0,220978.0,703,357.0,16377,12727.0,3650.0,7,3.0,11/22/2021 12:00:00 AM,Agree,Agree
4,05/30/2022,AK,251425,,,0,0.0,1252,,,0,0.0,05/31/2022 01:20:20 PM,,


In [3]:
# Keep only the columns used downstream; .copy() detaches the result from
# data_df so later column assignments do not touch the raw frame
reduced_data_df = data_df.loc[
    :, ["submission_date", "state", "tot_cases", "new_case"]
].copy()
reduced_data_df.head(5)

Unnamed: 0,submission_date,state,tot_cases,new_case
0,03/11/2021,KS,297229,0
1,12/01/2021,ND,163565,589
2,01/02/2022,AS,11,0
3,11/22/2021,AL,841461,703
4,05/30/2022,AK,251425,0


In [4]:
# Convert submission_date from text to datetime64.
# The Series is converted directly (instead of applying pd.to_datetime to a
# one-column DataFrame and assigning a DataFrame back into a column).
# The raw values shown above are month/day/year (e.g. 05/30/2022), so the
# format is stated explicitly rather than left to inference.
reduced_data_df["submission_date"] = pd.to_datetime(
    reduced_data_df["submission_date"], format="%m/%d/%Y"
)

In [5]:
# Order the rows chronologically (earliest submission date first)
reduced_data_df = reduced_data_df.sort_values(by="submission_date")

In [6]:
# State abbreviations grouped by region
West = ["WA", "OR", "CA", "NV", "ID", "MT", "WY", "UT", "CO", "AZ", "NM"]
Mid_West = ["ND", "SD", "NE", "KS", "MN", "IA", "MO", "WI", "IL", "IN", "MI", "OH"]
# Fixed: this list previously contained "RMI" (Republic of the Marshall Islands)
# where "RI" (Rhode Island) belongs, so Rhode Island rows received no region.
North_East = ["ME", "NH", "VT", "MA", "CT", "RI", "NY", "PA", "NJ"]
South = ["TX", "OK", "AR", "LA", "MS", "TN", "KY", "AL", "FL", "GA", "SC", "NC", "VA", "WV", "DC", "MD", "DE"]

# Flatten the four lists into a single state -> region lookup table
state_to_region = {
    state: region
    for region, states in (
        ("West", West),
        ("Mid_West", Mid_West),
        ("North_East", North_East),
        ("South", South),
    )
    for state in states
}

# Series.map with a dict yields NaN for any state that is not a key, which
# matches the previous fallback of np.nan for unlisted jurisdictions.
# NOTE(review): AK, HI and the territories (e.g. AS, VI) are in none of the
# lists above and therefore get NaN - confirm this exclusion is intended.
reduced_data_df["Region"] = reduced_data_df["state"].map(state_to_region)
reduced_data_df

Unnamed: 0,submission_date,state,tot_cases,new_case,Region
50494,2020-01-22,WA,0,0,West
32276,2020-01-22,WI,0,0,Mid_West
51294,2020-01-22,WV,0,0,South
8824,2020-01-22,UT,0,0,West
2152,2020-01-22,GA,0,0,South
...,...,...,...,...,...
37836,2022-10-18,NE,540345,956,Mid_West
23106,2022-10-18,MD,1261834,689,South
25286,2022-10-18,MO,1660094,0,Mid_West
18065,2022-10-18,VI,23269,13,


In [7]:

# Add calendar year and month columns derived from the submission date
submission_dt = reduced_data_df["submission_date"].dt
reduced_data_df["Year"] = submission_dt.year
reduced_data_df["Month"] = submission_dt.month
reduced_data_df

Unnamed: 0,submission_date,state,tot_cases,new_case,Region,Year,Month
50494,2020-01-22,WA,0,0,West,2020,1
32276,2020-01-22,WI,0,0,Mid_West,2020,1
51294,2020-01-22,WV,0,0,South,2020,1
8824,2020-01-22,UT,0,0,West,2020,1
2152,2020-01-22,GA,0,0,South,2020,1
...,...,...,...,...,...,...,...
37836,2022-10-18,NE,540345,956,Mid_West,2022,10
23106,2022-10-18,MD,1261834,689,South,2022,10
25286,2022-10-18,MO,1660094,0,Mid_West,2022,10
18065,2022-10-18,VI,23269,13,,2022,10


In [11]:
# Split the data into one DataFrame per region
region_labels = reduced_data_df["Region"]
West_data = reduced_data_df[region_labels == "West"]
Mid_West_data = reduced_data_df[region_labels == "Mid_West"]
South_data = reduced_data_df[region_labels == "South"]
North_East_data = reduced_data_df[region_labels == "North_East"]

# Preview the most recent rows for the West region
West_data.tail()

Unnamed: 0,submission_date,state,tot_cases,new_case,Region,Year,Month
390,2022-10-18,NM,623179,266,West,2022,10
50632,2022-10-18,CO,1665451,570,West,2022,10
32065,2022-10-18,NV,850204,0,West,2022,10
18554,2022-10-18,MT,312863,0,West,2022,10
28617,2022-10-18,CA,11309237,0,West,2022,10
