In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import DB_NAME, DB_URL, DB_PORT, DB_NAME, USERNAME, PASSWORD

# Import Data

In [2]:
engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
    user = USERNAME,
    password = PASSWORD,
    host = DB_URL,
    port = DB_PORT,
    database = DB_NAME
)

engine = create_engine(engine_string)

In [3]:
# Pull tables from database.
# !!! Note read_sql_table() requires sqlalchemy v1.4
sparse_covid_df = pd.read_sql_table('sparse_county_covid',engine).drop(columns=['index'])
dense_covid_df = pd.read_sql_table('dense_county_covid',engine).drop(columns=['index'])
weather_df = pd.read_sql_table('county_weather',engine).drop(columns=['index'])
county_df = pd.read_sql_table('county_pop',engine)

In [4]:
#####################################################################
### !!! To be removed once database is updated
#####################################################################
# Drop harrisonburg county
dense_covid_df = dense_covid_df[dense_covid_df['county'] != 'harrisonburg']

# Transform Data

In [5]:
# Add past_delta14 columns to sparse and dense counties
sparse_covid_df['past_delta14'] = pd.Series(dtype=int)
dense_covid_df['past_delta14'] = pd.Series(dtype=int)

# Sparse county past_delta_14
for i in range(14,len(sparse_covid_df.index)):

    # Set past_delta_14
    past_cases = sparse_covid_df.iloc[i-14,3]
    present_cases = sparse_covid_df.iloc[i,3]
    delta14 = present_cases - past_cases

    sparse_covid_df.iloc[i,7] = delta14

# Dense county past delta14
for i in range(14,len(dense_covid_df.index)):

    # Set past_delta_14
    past_cases = dense_covid_df.iloc[i-14,3]
    present_cases = dense_covid_df.iloc[i,3]
    delta14 = present_cases - past_cases

    dense_covid_df.iloc[i,7] = delta14

In [6]:
# Sort weather data according to the sparse or dense county sets
sparse_counties = sparse_covid_df['county'].unique()
dense_counties = dense_covid_df['county'].unique()

# Sort weather for each set
sparse_weather_df = weather_df[weather_df['county'].isin(sparse_counties)]
dense_weather_df = weather_df[weather_df['county'].isin(dense_counties)]

# Sort county data for each set
sparse_county_df = county_df[county_df['county'].isin(sparse_counties)]
dense_county_df = county_df[county_df['county'].isin(dense_counties)]

In [7]:
# Merge weather data on sparse and dense data
sparse_weather_covid_df = sparse_covid_df.merge(sparse_weather_df,on=['date','county'])
dense_weather_covid_df = dense_covid_df.merge(dense_weather_df,on=['date','county'])

In [8]:
# Verify Sparse Data
sparse_weather_covid_df.head()

Unnamed: 0,date,county,state_x,total_cases,new_cases,future_delta7,future_delta14,past_delta14,state_y,temp_mean(c),precip_sum(mm),wind_max(km/h),min_humidity(%),max_humidity(%),mean_humidity(%)
0,2020-03-07,fairfax,virginia,1,1,9.0,21.0,,virginia,5.0,0.0,27.6,30.0,65.0,48.0
1,2020-03-08,fairfax,virginia,2,1,8.0,29.0,,virginia,6.8,0.0,16.7,24.0,67.0,47.0
2,2020-03-09,fairfax,virginia,4,2,6.0,39.0,,virginia,11.7,0.0,18.8,30.0,69.0,50.0
3,2020-03-10,fairfax,virginia,4,0,8.0,42.0,,virginia,14.6,2.1,29.8,49.0,95.0,69.0
4,2020-03-11,fairfax,virginia,4,0,10.0,72.0,,virginia,9.1,0.6,15.7,63.0,91.0,73.0


In [9]:
# Verify Dense Data
dense_weather_covid_df.head()

Unnamed: 0,date,county,state_x,total_cases,new_cases,future_delta7,future_delta14,past_delta14,state_y,temp_mean(c),precip_sum(mm),wind_max(km/h),min_humidity(%),max_humidity(%),mean_humidity(%)
0,2020-03-11,baltimore,maryland,1,1,6.0,50.0,,maryland,9.1,0.2,20.9,54.0,88.0,72.0
1,2020-03-12,baltimore,maryland,1,0,12.0,80.0,,maryland,9.9,0.0,14.8,57.0,93.0,77.0
2,2020-03-13,baltimore,maryland,2,1,11.0,101.0,,maryland,14.5,7.6,24.7,34.0,98.0,69.0
3,2020-03-14,baltimore,maryland,3,1,16.0,138.0,,maryland,9.4,2.0,20.1,32.0,81.0,45.0
4,2020-03-15,baltimore,maryland,3,0,25.0,159.0,,maryland,7.1,6.9,14.7,46.0,92.0,75.0


In [10]:
# Merge county data on sparse and dense data
sparse_weather_covid_county_df = sparse_weather_covid_df.merge(county_df,on=['county'])
dense_weather_covid_county_df = dense_weather_covid_df.merge(county_df,on=['county'])

In [11]:
# Set rows to keep
# Set Feature Categories
weather_features = ['temp_mean(c)','precip_sum(mm)','wind_max(km/h)','min_humidity(%)','max_humidity(%)','mean_humidity(%)']
county_features = ['total_pop','pop_dens(/sqmi)','avg_household']
covid_features = ['new_cases','past_delta14']
target = ['future_delta14']

In [12]:
# Select only those columns to keep
sparse_weather_covid_county_df = sparse_weather_covid_county_df[target + weather_features + county_features + covid_features]
dense_weather_covid_county_df = dense_weather_covid_county_df[target + weather_features + county_features + covid_features]

In [13]:
# Ensure columns are of string type
sparse_weather_covid_county_df.columns = sparse_weather_covid_county_df.columns.astype(str)
dense_weather_covid_county_df.columns = dense_weather_covid_county_df.columns.astype(str)

In [14]:
# Drop rows with null data
sparse = sparse_weather_covid_county_df.dropna()
dense = dense_weather_covid_county_df.dropna()

In [15]:
# Verify Sparse Data
sparse.head()

Unnamed: 0,future_delta14,temp_mean(c),precip_sum(mm),wind_max(km/h),min_humidity(%),max_humidity(%),mean_humidity(%),total_pop,pop_dens(/sqmi),avg_household,new_cases,past_delta14
14,365.0,10.6,2.6,18.3,43.0,86.0,55.0,1143529,1129,2.88,6,21.0
15,395.0,6.9,0.0,15.9,41.0,73.0,49.0,1143529,1129,2.88,9,29.0
16,445.0,5.4,8.8,14.7,73.0,100.0,94.0,1143529,1129,2.88,12,39.0
17,486.0,8.3,0.0,12.0,57.0,95.0,77.0,1143529,1129,2.88,3,42.0
18,494.0,6.4,10.4,16.6,76.0,100.0,95.0,1143529,1129,2.88,30,72.0


In [16]:
# Verify Dense Data
dense.head()

Unnamed: 0,future_delta14,temp_mean(c),precip_sum(mm),wind_max(km/h),min_humidity(%),max_humidity(%),mean_humidity(%),total_pop,pop_dens(/sqmi),avg_household,new_cases,past_delta14
14,815.0,7.2,9.1,21.5,80.0,97.0,89.0,614700,2932,2.48,9,50.0
15,898.0,9.0,0.0,19.5,61.0,98.0,78.0,614700,2932,2.48,30,80.0
16,969.0,13.0,1.4,13.8,57.0,95.0,77.0,614700,2932,2.48,22,101.0
17,1032.0,11.5,15.1,16.9,80.0,96.0,90.0,614700,2932,2.48,38,138.0
18,1095.0,12.5,0.0,15.9,77.0,97.0,91.0,614700,2932,2.48,21,159.0


In [17]:
print(f"Sparse County Rows: {len(sparse_weather_covid_county_df.index)}")
print(f"Dense County Rows: {len(dense_weather_covid_county_df.index)}")

Sparse County Rows: 2484
Dense County Rows: 1665


# Run Models

## Sparse Data Model
### All Features

In [18]:
# Split data into Training and Testing Sets

# Use All Features
X = sparse[weather_features + covid_features + county_features]
y = sparse[target]

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

# Use linear regression model
sparse_model = LinearRegression()

# Sparse 14-day new case prediction model
sparse_model.fit(X_train,y_train)
training_score = sparse_model.score(X_train,y_train)
testing_score = sparse_model.score(X_test,y_test)

# View performance, coefficients and y-intercept
print("------- Sparse 14-day New Case Prediction --------")
print()
print("---------------- All Features -------------------")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")
print()
print('----------------- Coefficients ------------------')
for i in range(0,len(X.columns)):
    print(f"{sparse_model.feature_names_in_[i]} : {sparse_model.coef_[0][i]}")
print(f"y-intercept : {sparse_model.intercept_[0]}")

------- Sparse 14-day New Case Prediction --------

---------------- All Features -------------------
Training Score: 0.5367456890157535
Testing Score: 0.5566740266113842

----------------- Coefficients ------------------
temp_mean(c) : 2.96048759006881
precip_sum(mm) : -5.457495537796933
wind_max(km/h) : -21.520117440684782
min_humidity(%) : 2.584218930029477
max_humidity(%) : -22.22979980544879
mean_humidity(%) : 3.773491152123109
new_cases : 2.8031090530779434
past_delta14 : 0.02849509216347285
total_pop : 0.0017810463358680453
pop_dens(/sqmi) : -0.05733752599488673
avg_household : -1562.715453233936
y-intercept : 6031.225348957668


## Dense Data Model
### All Features

In [19]:
# Split data into Training and Testing Sets

# Use All features
X = dense[weather_features + covid_features + county_features]
y = dense[target]

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

# Use linear regression model
dense_model = LinearRegression()

# Dense 14-day new case prediction model
dense_model.fit(X_train,y_train)
training_score = dense_model.score(X_train,y_train)
testing_score = dense_model.score(X_test,y_test)

# View performance, coefficients and y-intercept
print("------- Dense 14-day New Case Prediction --------")
print("---------------- All Features -------------------")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")
print()
print('----------------- Coefficients ------------------')
for i in range(0,len(X.columns)):
    print(f"{dense_model.feature_names_in_[i]} : {dense_model.coef_[0][i]}")
print(f"y-intercept : {dense_model.intercept_[0]}")

------- Dense 14-day New Case Prediction --------
---------------- All Features -------------------
Training Score: 0.8420089148406973
Testing Score: 0.878762253727533

----------------- Coefficients ------------------
temp_mean(c) : -52.21513397808549
precip_sum(mm) : -17.154606989892965
wind_max(km/h) : 35.55696163626145
min_humidity(%) : -57.11231628525269
max_humidity(%) : -67.04202930471176
mean_humidity(%) : 117.435919071596
new_cases : 11.623105523204234
past_delta14 : 0.003197723419449334
total_pop : 0.000566408966656271
pop_dens(/sqmi) : 0.09371789903942264
avg_household : -447.46035715439683
y-intercept : 1857.1410588417311
