In [36]:
import sqlite3
import pandas as pd
import statsmodels.api as sm
from statsmodels.tools import eval_measures
import numpy as np

In [2]:
# Import covid table as dataframe
con = sqlite3.connect("../Data/data.db")
covid_df = pd.read_sql_query("SELECT * FROM covid", con)
print(covid_df.dtypes)
print(covid_df.shape)
covid_df.head()

date                        object
state                       object
positiveIncrease             int64
totalTestResultsIncrease     int64
dtype: object
(18223, 4)


Unnamed: 0,date,state,positiveIncrease,totalTestResultsIncrease
0,2021-02-20 00:00:00,AK,0,0
1,2021-02-20 00:00:00,AL,774,5436
2,2021-02-20 00:00:00,AR,517,3060
3,2021-02-20 00:00:00,AZ,2047,45153
4,2021-02-20 00:00:00,CA,6668,192222


In [3]:
# Change date to datetime
covid_df['date'] = pd.to_datetime(covid_df['date'])
print(covid_df.dtypes)
covid_df.head()

date                        datetime64[ns]
state                               object
positiveIncrease                     int64
totalTestResultsIncrease             int64
dtype: object


Unnamed: 0,date,state,positiveIncrease,totalTestResultsIncrease
0,2021-02-20,AK,0,0
1,2021-02-20,AL,774,5436
2,2021-02-20,AR,517,3060
3,2021-02-20,AZ,2047,45153
4,2021-02-20,CA,6668,192222


In [4]:
# Define cutoff dates and encoding
months = [2,3,4,5,6,7,8,9]
start = [pd.Timestamp("2-15-2020"),
         pd.Timestamp("3-15-2020"),
         pd.Timestamp("4-15-2020"),
         pd.Timestamp("5-15-2020"),
         pd.Timestamp("6-15-2020"),
         pd.Timestamp("7-15-2020"),
         pd.Timestamp("8-15-2020"),
         pd.Timestamp("9-15-2020")]
end = [pd.Timestamp("3-15-2020"),
       pd.Timestamp("4-15-2020"),
       pd.Timestamp("5-15-2020"),
       pd.Timestamp("6-15-2020"),
       pd.Timestamp("7-15-2020"),
       pd.Timestamp("8-15-2020"),
       pd.Timestamp("9-15-2020"),
       pd.Timestamp("10-15-2020")]

# Include new month column
for i in range(len(months)):
    covid_df.loc[(covid_df['date']>=start[i]) & (covid_df['date']<end[i]), "month"] = months[i]

# Drop rows outside of daterange
covid_df = covid_df.dropna()
print(covid_df.shape)

covid_df.head()

(11546, 5)


Unnamed: 0,date,state,positiveIncrease,totalTestResultsIncrease,month
6579,2020-10-14,AK,155,2388,9.0
6580,2020-10-14,AL,784,5014,9.0
6581,2020-10-14,AR,1079,10677,9.0
6582,2020-10-14,AZ,901,22286,9.0
6583,2020-10-14,CA,2666,91770,9.0


In [5]:
# Groupby state and month and sum
covid_df = covid_df.groupby(['state','month']).sum()
covid_df = covid_df.reset_index()
print(covid_df.shape)
covid_df.head()

(408, 4)


Unnamed: 0,state,month,positiveIncrease,totalTestResultsIncrease
0,AK,2.0,0,136
1,AK,3.0,285,8204
2,AK,4.0,104,23414
3,AK,5.0,276,41705
4,AK,6.0,915,76006


In [6]:
# Calculate change in infections by state and month
months = [3,4,5,6,7,8,9]
states = covid_df['state'].unique()

for month in months:
    for state in states:
        curr_month = covid_df[(covid_df['month'] == month) & (covid_df['state'] == state)].positiveIncrease.item()
        prev_month = covid_df[(covid_df['month'] == month-1) & (covid_df['state'] == state)].positiveIncrease.item()
        if prev_month != 0:
            covid_df.loc[(covid_df['month'] == month) & (covid_df['state'] == state), 'change'] = \
            (curr_month-prev_month)/prev_month

# Drop na - data for month 2, and month 3 where month 2 positiveIncrease is 0
covid_df = covid_df.dropna()
print(covid_df.shape)
covid_df.head()

(355, 5)


Unnamed: 0,state,month,positiveIncrease,totalTestResultsIncrease,change
2,AK,4.0,104,23414,-0.635088
3,AK,5.0,276,41705,1.653846
4,AK,6.0,915,76006,2.315217
5,AK,7.0,2498,146456,1.730055
6,AK,8.0,2272,107484,-0.090472


In [7]:
# Get population of states
pop_df = pd.read_sql_query("SELECT state, population FROM policy", con)
pop_df.head()

Unnamed: 0,state,population
0,AL,4887871
1,AK,737438
2,AZ,7171646
3,AR,3013825
4,CA,39557045


In [8]:
# Merge tables
covid_df = pd.merge(covid_df, pop_df, on='state')
covid_df.head()

Unnamed: 0,state,month,positiveIncrease,totalTestResultsIncrease,change,population
0,AK,4.0,104,23414,-0.635088,737438
1,AK,5.0,276,41705,1.653846,737438
2,AK,6.0,915,76006,2.315217,737438
3,AK,7.0,2498,146456,1.730055,737438
4,AK,8.0,2272,107484,-0.090472,737438


In [9]:
# Calculate new cases per capita
covid_df['per_capita'] = covid_df['positiveIncrease']/covid_df['population']
covid_df = covid_df.astype({'month':'int64'})
print(covid_df.shape)
print(covid_df.dtypes)
covid_df.head()

(355, 7)
state                        object
month                         int64
positiveIncrease              int64
totalTestResultsIncrease      int64
change                      float64
population                    int64
per_capita                  float64
dtype: object


Unnamed: 0,state,month,positiveIncrease,totalTestResultsIncrease,change,population,per_capita
0,AK,4,104,23414,-0.635088,737438,0.000141
1,AK,5,276,41705,1.653846,737438,0.000374
2,AK,6,915,76006,2.315217,737438,0.001241
3,AK,7,2498,146456,1.730055,737438,0.003387
4,AK,8,2272,107484,-0.090472,737438,0.003081


In [10]:
covid_df.to_csv('cases.csv')

In [11]:
policy_df = pd.read_sql_query("SELECT * FROM policy", con)
# drop pop density and population columns 
policy_df = policy_df.drop(columns=['pop_density', 'population'])
policy_df.head()

Unnamed: 0,state,state_of_emergency,stay_at_home,stay_at_home_end,business_closure,business_closure_end,facemask_mandate,facemask_mandate_end,quaratine_mandate,quaratine_mandate_end
0,AL,2020-03-13 00:00:00,2020-04-04 00:00:00,2020-04-30 00:00:00,2020-03-28 00:00:00,2020-04-30 00:00:00,2020-07-16 00:00:00,,,
1,AK,2020-03-11 00:00:00,2020-03-28 00:00:00,2020-04-24 00:00:00,2020-03-24 00:00:00,2020-04-24 00:00:00,2020-04-24 00:00:00,2020-05-22 00:00:00,2020-03-25 00:00:00,2021-02-14 00:00:00
2,AZ,2020-03-11 00:00:00,2020-03-31 00:00:00,2020-05-16 00:00:00,2020-03-31 00:00:00,2020-05-08 00:00:00,,,,2020-05-12 00:00:00
3,AR,2020-03-11 00:00:00,,,2020-04-06 00:00:00,2020-05-04 00:00:00,2020-07-20 00:00:00,,,2020-06-15 00:00:00
4,CA,2020-03-04 00:00:00,2020-03-19 00:00:00,,2020-03-19 00:00:00,2020-05-08 00:00:00,2020-06-18 00:00:00,,,


In [12]:
# change columns to Datetime object
# convert date to only the month value
for i in range(1, policy_df.shape[1]):
    policy_df.iloc[:, i] = pd.to_datetime(policy_df.iloc[:, i], errors="coerce")
    policy_df.iloc[:, i] = policy_df.iloc[:, i].dt.month
policy_df.head()

Unnamed: 0,state,state_of_emergency,stay_at_home,stay_at_home_end,business_closure,business_closure_end,facemask_mandate,facemask_mandate_end,quaratine_mandate,quaratine_mandate_end
0,AL,3,4.0,4.0,3.0,4.0,7.0,,,
1,AK,3,3.0,4.0,3.0,4.0,4.0,5.0,3.0,2.0
2,AZ,3,3.0,5.0,3.0,5.0,,,,5.0
3,AR,3,,,4.0,5.0,7.0,,,6.0
4,CA,3,3.0,,3.0,5.0,6.0,,,


In [13]:
# create dummy variables 
policy_df = policy_df.fillna(0)
dates = list(policy_df.columns)
dates.pop(0)
policy_df[dates] = policy_df[dates].astype(int)

cats = ["state"]
for date in dates:
    for i in range(3,10):
        cats.append(date+"_{i}".format(i=i))


    
dummy_df = pd.get_dummies(policy_df, columns=dates)
dummy_df = dummy_df.T.reindex(cats).T.fillna(0)

dummy_df.head()


Unnamed: 0,state,state_of_emergency_3,state_of_emergency_4,state_of_emergency_5,state_of_emergency_6,state_of_emergency_7,state_of_emergency_8,state_of_emergency_9,stay_at_home_3,stay_at_home_4,...,quaratine_mandate_7,quaratine_mandate_8,quaratine_mandate_9,quaratine_mandate_end_3,quaratine_mandate_end_4,quaratine_mandate_end_5,quaratine_mandate_end_6,quaratine_mandate_end_7,quaratine_mandate_end_8,quaratine_mandate_end_9
0,AL,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,AK,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,AZ,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,AR,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,CA,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# convert to panel data
dummy_df = (pd.wide_to_long(dummy_df, stubnames=dates, j='month', i='state', sep='_')
         .reset_index(level=1, drop=True)
         .sort_index())

print(dummy_df.shape)
dummy_df.head()

(357, 9)


Unnamed: 0_level_0,state_of_emergency,stay_at_home,stay_at_home_end,business_closure,business_closure_end,facemask_mandate,facemask_mandate_end,quaratine_mandate,quaratine_mandate_end
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AK,0,0,0,0,0,0,0,0,0
AK,1,1,0,1,0,0,0,1,0
AK,0,0,0,0,0,0,0,0,0
AK,0,0,0,0,0,0,0,0,0
AK,0,0,0,0,0,0,0,0,0


In [37]:
months = []
for i in range(51):
    for j in range(3, 10):
        months.append(j)


# reorder the month column
dummy_df.insert(0, 'month', np.array(months))

print(dummy_df.shape)
dummy_df.head()

ValueError: cannot insert month, already exists

In [38]:
dummy_df

Unnamed: 0_level_0,month,state_of_emergency,stay_at_home,stay_at_home_end,business_closure,business_closure_end,facemask_mandate,facemask_mandate_end,quaratine_mandate,quaratine_mandate_end
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AK,3,0,0,0,0,0,0,0,0,0
AK,4,1,1,0,1,0,0,0,1,0
AK,5,0,0,0,0,0,0,0,0,0
AK,6,0,0,0,0,0,0,0,0,0
AK,7,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
WY,5,0,0,0,0,0,0,0,0,0
WY,6,0,0,0,0,1,0,0,0,1
WY,7,1,0,0,1,0,0,0,0,0
WY,8,0,0,0,0,0,0,0,0,0


<h2>grouping flights by destination state</h2>

In [18]:
flight_df = pd.read_sql_query("SELECT * FROM flight WHERE (month != 1.0) AND (month != 2.0)", con)

In [19]:
flight_df.to_csv('../Data/flights.csv')

In [20]:
arrivals = flight_df.groupby(['dest_state','month']).sum().reset_index()

In [21]:
arrivals.head(15)

Unnamed: 0,dest_state,month,monthly_pax
0,AK,3.0,138235.631149
1,AK,4.0,26092.783185
2,AK,5.0,52663.559269
3,AK,6.0,110763.657546
4,AK,7.0,158323.464182
5,AK,8.0,151104.177926
6,AK,9.0,124192.357892
7,AL,3.0,105264.222427
8,AL,4.0,10663.991963
9,AL,5.0,29081.545354


<h2>State-State Model</h2>

In [264]:
merge_1 = pd.merge(flight_df,covid_df,left_on=['dest_state','month'],right_on=['state','month'],how='inner')

In [265]:
merge_2 = pd.merge(merge_1,covid_df,left_on=['origin_state','month'],right_on=['state','month'],how='inner')

In [266]:
populations = pd.read_csv("../Data/statepop.csv")

In [267]:
merge_3 = pd.merge(merge_2,populations,left_on = 'dest_state',right_on = 'State')

In [268]:
merge_4 = pd.merge(merge_3,populations,left_on = 'origin_state',right_on = 'State')

In [269]:
merge_5 = merge_4[merge_4['origin_state'] != merge_4['dest_state']]

In [270]:
merge_5['dest_positive'] = merge_5['positiveIncrease_x'] / merge_5['totalTestResultsIncrease_x']
merge_5['origin_positive'] = merge_5['positiveIncrease_y'] / merge_5['totalTestResultsIncrease_y']
merge_5['dest_pop'] = merge_5['2018 Population_x']
merge_5['origin_pop'] = merge_5['2018 Population_y']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [271]:
cleaned_6 = merge_5[['origin_state','dest_state','month','monthly_pax','dest_pop','origin_pop','dest_positive','origin_positive']]

In [272]:
cleaned_6.head()

Unnamed: 0,origin_state,dest_state,month,monthly_pax,dest_pop,origin_pop,dest_positive,origin_positive
6,AK,AL,4.0,43.377867,4888949,738068,0.068684,0.004442
7,AK,AL,5.0,113.423683,4888949,738068,0.091143,0.006618
8,AK,AL,6.0,183.19845,4888949,738068,0.136851,0.012039
9,AK,AL,7.0,368.845306,4888949,738068,0.167651,0.017056
10,AK,AL,8.0,411.530764,4888949,738068,0.1713,0.021138


In [273]:
cleaned_6['per_cap_arrivals'] = (cleaned_6['monthly_pax'] / cleaned_6['dest_pop'])
cleaned_6['state_of_emergency'] = 0
cleaned_6['stay_at_home'] = 0
cleaned_6['business_closure'] = 0
cleaned_6['mask_mandate'] = 0
cleaned_6['quarantine_order'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [278]:
policy_df

Unnamed: 0,state,state_of_emergency,stay_at_home,stay_at_home_end,business_closure,business_closure_end,facemask_mandate,facemask_mandate_end,quaratine_mandate,quaratine_mandate_end
0,AL,3,4,4,3,4,7,0,0,0
1,AK,3,3,4,3,4,4,5,3,2
2,AZ,3,3,5,3,5,0,0,0,5
3,AR,3,0,0,4,5,7,0,0,6
4,CA,3,3,0,3,5,6,0,0,0
5,CO,3,3,4,3,5,7,0,0,0
6,CT,3,3,5,3,5,4,0,0,0
7,DE,3,3,6,3,5,4,0,3,6
8,DC,3,4,5,3,5,4,0,0,0
9,FL,3,4,5,4,5,0,0,0,8


In [258]:
def helpey(row,column):
    month = row['month']
    state = row['dest_state']
    
    policy_data = policy_df[policy_df['state'] == state]
    
    counter = 0
    
    end_str = column + '_end'
#     print(end_str)
    
    if (int(policy_data[column].iloc[0]) <= month) and (int(policy_data[column].iloc[0]) > 0):
        counter = 1
        
    if column == "state_of_emergency":
        pass
    elif (int(policy_data[end_str].iloc[0]) <= month) and (int(policy_data[end_str].iloc[0]) > 0):
        counter = 0
                
    return counter
    

In [279]:
cleaned_6['state_of_emergency'] = cleaned_6.apply((lambda x: helpey(x, 'state_of_emergency')), axis=1)
cleaned_6['stay_at_home'] = cleaned_6.apply((lambda x: helpey(x, 'stay_at_home')), axis=1)
cleaned_6['business_closure'] = cleaned_6.apply((lambda x: helpey(x, 'business_closure')), axis=1)
cleaned_6['mask_mandate'] = cleaned_6.apply((lambda x: helpey(x, 'facemask_mandate')), axis=1)
cleaned_6['quarantine_order'] = cleaned_6.apply((lambda x: helpey(x, 'quaratine_mandate')), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [None]:
cleaned_6

In [280]:
cleaned_6['per_cap_positive_arr'] = cleaned_6['per_cap_arrivals'] * cleaned_6['origin_positive']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [282]:
reg_state = cleaned_6[['dest_positive','origin_positive','per_cap_positive_arr','per_cap_arrivals','state_of_emergency','stay_at_home','business_closure','mask_mandate']]

In [372]:
model = sm.OLS(reg_state['dest_positive'], reg_state[['per_cap_positive_arr','stay_at_home','business_closure','mask_mandate']])
model2 = sm.OLS(reg_state['dest_positive'], reg_state[['origin_positive','per_cap_arrivals','stay_at_home','business_closure','mask_mandate']])
results = model.fit()
results2 = model2.fit()

In [370]:
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:          dest_positive   R-squared (uncentered):                   0.426
Model:                            OLS   Adj. R-squared (uncentered):              0.426
Method:                 Least Squares   F-statistic:                              2959.
Date:                Thu, 01 Apr 2021   Prob (F-statistic):                        0.00
Time:                        23:14:51   Log-Likelihood:                          19641.
No. Observations:               15929   AIC:                                 -3.927e+04
Df Residuals:                   15925   BIC:                                 -3.924e+04
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [373]:
print(results2.summary())

                                 OLS Regression Results                                
Dep. Variable:          dest_positive   R-squared (uncentered):                   0.512
Model:                            OLS   Adj. R-squared (uncentered):              0.512
Method:                 Least Squares   F-statistic:                              3345.
Date:                Thu, 01 Apr 2021   Prob (F-statistic):                        0.00
Time:                        23:16:10   Log-Likelihood:                          20933.
No. Observations:               15929   AIC:                                 -4.186e+04
Df Residuals:                   15924   BIC:                                 -4.182e+04
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                       coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------