In [1]:
import sqlite3
import pandas as pd

In [2]:
# Load the whole covid table from the SQLite database into a DataFrame.
# The connection is left open on purpose: a later cell queries it again.
con = sqlite3.connect("../Data/data.db")
covid_df = pd.read_sql_query(sql="SELECT * FROM covid", con=con)
print(covid_df.dtypes)
print(covid_df.shape)
covid_df.head()

date                        object
state                       object
positiveIncrease             int64
totalTestResultsIncrease     int64
dtype: object
(18223, 4)


Unnamed: 0,date,state,positiveIncrease,totalTestResultsIncrease
0,2021-02-20 00:00:00,AK,0,0
1,2021-02-20 00:00:00,AL,774,5436
2,2021-02-20 00:00:00,AR,517,3060
3,2021-02-20 00:00:00,AZ,2047,45153
4,2021-02-20 00:00:00,CA,6668,192222


In [3]:
# Parse the date strings into datetime64 values so they can be compared
# against timestamps
covid_df = covid_df.assign(date=pd.to_datetime(covid_df['date']))
print(covid_df.dtypes)
covid_df.head()

date                        datetime64[ns]
state                               object
positiveIncrease                     int64
totalTestResultsIncrease             int64
dtype: object


Unnamed: 0,date,state,positiveIncrease,totalTestResultsIncrease
0,2021-02-20,AK,0,0
1,2021-02-20,AL,774,5436
2,2021-02-20,AR,517,3060
3,2021-02-20,AZ,2047,45153
4,2021-02-20,CA,6668,192222


In [4]:
# Define cutoff dates and encoding.
# Window i covers [start[i], end[i]) and runs from the 15th of one calendar
# month to the 15th of the next; it is labelled with the month it starts in
# (e.g. month 2 is 2020-02-15 inclusive to 2020-03-15 exclusive).
months = [2,3,4,5,6,7,8,9]
# Build the timestamps from explicit year/month/day parts instead of
# month-first strings, whose parsing is ambiguous.
cutoffs = [pd.Timestamp(year=2020, month=m, day=15) for m in range(2, 11)]
start = cutoffs[:-1]
end = cutoffs[1:]

# Include new month column.
# Start from an all-NaN column so the column always exists and a re-run of
# this cell cannot keep stale labels; NaN also makes the column float.
covid_df["month"] = float("nan")
for month, window_start, window_end in zip(months, start, end):
    in_window = (covid_df['date'] >= window_start) & (covid_df['date'] < window_end)
    covid_df.loc[in_window, "month"] = month

# Drop rows outside of daterange.
# Only the month column is inspected: a bare dropna() would also discard rows
# that merely have a missing value in some unrelated column.
covid_df = covid_df.dropna(subset=["month"])
print(covid_df.shape)

covid_df.head()

(11546, 5)


Unnamed: 0,date,state,positiveIncrease,totalTestResultsIncrease,month
6579,2020-10-14,AK,155,2388,9.0
6580,2020-10-14,AL,784,5014,9.0
6581,2020-10-14,AR,1079,10677,9.0
6582,2020-10-14,AZ,901,22286,9.0
6583,2020-10-14,CA,2666,91770,9.0


In [5]:
# Group by state and month and sum the daily increases.
# The columns to sum are selected explicitly: the frame still carries the
# datetime `date` column, which cannot be summed. Older pandas dropped it
# silently; pandas >= 2.0 raises a TypeError instead.
covid_df = (
    covid_df
    .groupby(['state', 'month'])[['positiveIncrease', 'totalTestResultsIncrease']]
    .sum()
    .reset_index()
)
print(covid_df.shape)
covid_df.head()

(408, 4)


Unnamed: 0,state,month,positiveIncrease,totalTestResultsIncrease
0,AK,2.0,0,136
1,AK,3.0,285,8204
2,AK,4.0,104,23414
3,AK,5.0,276,41705
4,AK,6.0,915,76006


In [17]:
# Calculate change in infections by state and month:
#     change = (cases this month - cases previous month) / cases previous month
# Only the months listed below receive a value (month 2 has no predecessor).
months = [3,4,5,6,7,8,9]
# Retained in case later cells rely on it; the computation below does not.
states = covid_df['state'].unique()

# Line every row up with the previous row of the same state. Sorting first
# guarantees "previous row" means the preceding month; assignment back into
# covid_df aligns on the index, so the original row order is untouched.
ordered = covid_df.sort_values(['state', 'month'])
by_state = ordered.groupby('state')
prev_cases = by_state['positiveIncrease'].shift(1)
prev_month = by_state['month'].shift(1)

# A change is defined only when the previous row really is month-1 for that
# state and its case count is non-zero (avoids division by zero). A missing
# state/month pair therefore yields NaN instead of raising.
has_valid_prev = (
    ordered['month'].isin(months)
    & (prev_month == ordered['month'] - 1)
    & (prev_cases != 0)
)
covid_df['change'] = (
    (ordered['positiveIncrease'] - prev_cases) / prev_cases
).where(has_valid_prev)

# Drop na - data for month 2, and month 3 where month 2 positiveIncrease is 0
covid_df = covid_df.dropna(subset=['change'])
print(covid_df.shape)
covid_df.head()

(355, 5)


Unnamed: 0,state,month,positiveIncrease,totalTestResultsIncrease,change
2,AK,4.0,104,23414,-0.635088
3,AK,5.0,276,41705,1.653846
4,AK,6.0,915,76006,2.315217
5,AK,7.0,2498,146456,1.730055
6,AK,8.0,2272,107484,-0.090472


In [27]:
# Fetch each state's population from the policy table, reusing the open
# database connection
pop_df = pd.read_sql_query(sql="SELECT state, population FROM policy", con=con)
pop_df.head()

Unnamed: 0,state,population
0,AL,4887871
1,AK,737438
2,AZ,7171646
3,AR,3013825
4,CA,39557045
5,CO,5695564
6,CT,3572665
7,DE,967171
8,DC,702455
9,FL,21299325


In [28]:
# Merge tables: attach the population to every state/month row.
# how='inner' (the pandas default, spelled out here) keeps only states present
# in both tables. validate='many_to_one' makes pandas raise a MergeError if a
# state appears more than once in pop_df, instead of silently duplicating
# rows and inflating every figure computed afterwards.
covid_df = pd.merge(covid_df, pop_df, on='state', how='inner', validate='many_to_one')
covid_df.head()

Unnamed: 0,state,month,positiveIncrease,totalTestResultsIncrease,change,population
0,AK,4.0,104,23414,-0.635088,737438
1,AK,5.0,276,41705,1.653846,737438
2,AK,6.0,915,76006,2.315217,737438
3,AK,7.0,2498,146456,1.730055,737438
4,AK,8.0,2272,107484,-0.090472,737438


In [33]:
# Add new cases per capita (monthly new cases divided by state population)
# and turn the float month labels back into integers
covid_df = (
    covid_df
    .assign(per_capita=covid_df['positiveIncrease'] / covid_df['population'])
    .astype({'month': 'int64'})
)
print(covid_df.shape)
print(covid_df.dtypes)
covid_df.head()

(355, 7)
state                        object
month                         int64
positiveIncrease              int64
totalTestResultsIncrease      int64
change                      float64
population                    int64
per_capita                  float64
dtype: object


Unnamed: 0,state,month,positiveIncrease,totalTestResultsIncrease,change,population,per_capita
0,AK,4,104,23414,-0.635088,737438,0.000141
1,AK,5,276,41705,1.653846,737438,0.000374
2,AK,6,915,76006,2.315217,737438,0.001241
3,AK,7,2498,146456,1.730055,737438,0.003387
4,AK,8,2272,107484,-0.090472,737438,0.003081
