In [224]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import sklearn as sk 

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Data exploration

In [291]:
# get data 
factors = pd.read_csv('datasets/covid-state-data-agg/COVID19_state.csv')
covid_cases = pd.read_csv('datasets/covid-cases-data/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv')
state_trips = pd.read_csv('datasets/bts-travel-data/State_trips.csv')
blue_red = pd.read_csv('datasets/BlueRedStates.csv').drop(columns=['BlueRed'])

In [293]:
# filter date and format date col 
ny_cols = ['NY', 'NYC']

def filterByYear(df, date_col, year):
    df[date_col] = pd.to_datetime(df[date_col], infer_datetime_format=True)
    return df[df[date_col].dt.strftime('%Y') == year]

covid_cases_cols = ['submission_date', 'state', 'new_case']
covid_cases = covid_cases[covid_cases_cols]
covid_cases = filterByYear(covid_cases, 'submission_date', '2020')

covid_cases[(covid_cases['submission_date'] == '2020-12-31')].sort_values('state')

Unnamed: 0,submission_date,state,new_case
964,2020-12-31,AK,372.0
26793,2020-12-31,AL,4784.0
2191,2020-12-31,AR,2708.0
38784,2020-12-31,AS,0.0
17831,2020-12-31,AZ,7721.0
14904,2020-12-31,CA,13410.0
33652,2020-12-31,CO,2858.0
15224,2020-12-31,CT,2045.0
29404,2020-12-31,DC,225.0
10103,2020-12-31,DE,626.0


In [294]:
# WAY1: merge NY and NYC data
cases_indexed = covid_cases.groupby(['submission_date','state']).sum()

indexes = cases_indexed.index
date_index = np.unique(np.array([ind[0] for ind in indexes]))

for date_ind in date_index: 
    cases_indexed.loc[date_ind, 'NY'] = sum(cases_indexed.loc[date_ind, 'NY'], cases_indexed.loc[date_ind, 'NYC'])

cases_indexed.reset_index(inplace=True, level=['submission_date', 'state'])
cases_indexed[(cases_indexed['submission_date'] == '2020-12-31') & (cases_indexed['state'].isin(ny_cols))]
# cases_indexed

new_cases = cases_indexed
new_cases[(new_cases['submission_date'] == '2020-12-31')].sort_values('state')

Unnamed: 0,submission_date,state,new_case
20640,2020-12-31,AK,372.0
20641,2020-12-31,AL,4784.0
20642,2020-12-31,AR,2708.0
20643,2020-12-31,AS,0.0
20644,2020-12-31,AZ,7721.0
20645,2020-12-31,CA,13410.0
20646,2020-12-31,CO,2858.0
20647,2020-12-31,CT,2045.0
20648,2020-12-31,DC,225.0
20649,2020-12-31,DE,626.0


In [277]:
# # WAY2: merge NYC with NY data
# covid_cases[(covid_cases['state'] == 'NY') | (covid_cases['state'] == 'NYC')].sort_values(['submission_date', 'state'])

# filt_NY = (covid_cases['state'] == 'NY') | (covid_cases['state'] == 'NYC')
# ny_cases = covid_cases.loc[filt_NY].groupby('submission_date').sum()\
#                     .reset_index().rename(columns={'new_case':'ny_cases'})

# merged = covid_cases.merge(ny_cases, on='submission_date', how='left')
# merged.sort_values(['submission_date', 'state'])

# merged[(merged['submission_date'] == '2020-12-31')].sort_values(['submission_date', 'state'])

# merged['state'] = np.where(merged['state'] == 'NY', merged['ny_cases'], merged['new_case'])
# merged = merged.drop(columns='ny_cases')

# merged[(merged['submission_date'] == '2020-12-31') ]

In [292]:
# state trips data 
state_trips_col = ['Date', 
                   'State Postal Code', 
                   'Number of Trips', 
                   'Number of Trips 5-10', 
                   'Number of Trips 25-50', 
                   'Number of Trips 100-250', 
                   'Number of Trips 250-500']
state_trips = state_trips[state_trips_col]
state_trips = filterByYear(state_trips, 'Date', '2020')

# check state num
state_trips.groupby(['Date']).size().reset_index(name='counts')

state_trips.head()

Unnamed: 0,Date,State Postal Code,Number of Trips,Number of Trips 5-10,Number of Trips 25-50,Number of Trips 100-250,Number of Trips 250-500
18615,2020-01-01,MS,7629798,1221867,396415,93571,22897
18616,2020-01-01,CA,112606885,15443353,5553845,839553,174752
18617,2020-01-01,CO,18346137,2712409,807692,141931,39387
18618,2020-01-01,CT,9636627,1491090,464270,49982,10008
18619,2020-01-01,DE,2665377,432958,140146,15988,1121


In [295]:
new_cases.rename(columns={'submission_date' : 'date'}, inplace=True)
state_trips.rename(columns={'Date' : 'date', 'State Postal Code' : 'state'}, inplace=True)

# get population by state
factors = factors[['State', 'Population']]
pop_by_state = blue_red.merge(factors, left_on='stateName', right_on='State')\
    .drop(columns=['stateName', 'State'])\
    .rename(columns={'stateCode' : 'state'})

# merge all dfs
new_case_trips = new_cases.merge(state_trips, on=['date', 'state'], how='inner')
final_data = new_case_trips.merge(pop_by_state, on='state', how='inner')
final_data.replace(',','', regex=True, inplace=True)


# normalize number of trips 
num_trips_cols = ['Number of Trips', 'Number of Trips 5-10', 'Number of Trips 25-50', 'Number of Trips 100-250', 'Number of Trips 250-500']
for col in num_trips_cols:
    final_data[col] = final_data[col].astype(float) / final_data['Population']

# normalize num of new cases
final_data['new_case'] = final_data['new_case'] / final_data['Population']

# drop 0 new cases 
index_to_drop = final_data[final_data['new_case'] == 0].index
final_data.drop(index_to_drop, inplace=True)

final_data[final_data['new_case'] == 0].count

<bound method DataFrame.count of Empty DataFrame
Columns: [date, state, new_case, Number of Trips, Number of Trips 5-10, Number of Trips 25-50, Number of Trips 100-250, Number of Trips 250-500, Population]
Index: []>

In [296]:
corr_matrix = final_data.corr()

corr_matrix

Unnamed: 0,new_case,Number of Trips,Number of Trips 5-10,Number of Trips 25-50,Number of Trips 100-250,Number of Trips 250-500,Population
new_case,1.0,-0.104669,-0.084028,0.051307,0.107897,0.126579,-0.051502
Number of Trips,-0.104669,1.0,0.842656,0.600038,0.129132,0.019568,-0.117291
Number of Trips 5-10,-0.084028,0.842656,1.0,0.479557,-0.055542,-0.04403,-0.028371
Number of Trips 25-50,0.051307,0.600038,0.479557,1.0,0.375825,0.164526,-0.14035
Number of Trips 100-250,0.107897,0.129132,-0.055542,0.375825,1.0,0.780432,-0.196152
Number of Trips 250-500,0.126579,0.019568,-0.04403,0.164526,0.780432,1.0,-0.108564
Population,-0.051502,-0.117291,-0.028371,-0.14035,-0.196152,-0.108564,1.0


In [269]:
num_trips = np.array(final_data['Number of Trips'].astype(float)).reshape(-1, 1)
num_new_cases = final_data['new_case']

X_train, X_test, y_train, y_test = train_test_split(num_trips, num_new_cases, test_size=0.20)

model = LinearRegression()
model.fit(X_train, y_train)

# make a prediction
y_train_hat = model.predict(X_train)
y_test_hat = model.predict(X_test)

# find error
training_error = mean_squared_error(y_train, y_train_hat)
testing_error = mean_squared_error(y_test, y_test_hat)

print(training_error)
print(testing_error)

5755589.044854301
6641170.457705222


Unnamed: 0,new_case,Number of Trips,Number of Trips 5-10,Number of Trips 25-50,Number of Trips 100-250,Number of Trips 250-500,Population
new_case,1.0,-0.18918,-0.113192,-0.068275,-0.081053,-0.044651,0.482867
Number of Trips,-0.18918,1.0,0.867849,0.602418,0.10907,0.047523,-0.100724
Number of Trips 5-10,-0.113192,0.867849,1.0,0.489367,-0.0671,-0.020159,-0.013639
Number of Trips 25-50,-0.068275,0.602418,0.489367,1.0,0.354798,0.159573,-0.135838
Number of Trips 100-250,-0.081053,0.10907,-0.0671,0.354798,1.0,0.781042,-0.199774
Number of Trips 250-500,-0.044651,0.047523,-0.020159,0.159573,0.781042,1.0,-0.112498
Population,0.482867,-0.100724,-0.013639,-0.135838,-0.199774,-0.112498,1.0
