# Preliminary Predictive Modeling

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import recall_score, confusion_matrix

# from typing import Union, Type, List, Dict
from typing import List, Tuple

pd.set_option("display.max_columns", None)

In [2]:
from pipeline_to_sql import make_postgres_conn

## Functions

In [3]:
def return_dummified_df(df: pd.DataFrame, dummy_columns: List[str], 
        drop_first: bool=False) -> pd.DataFrame:
    return pd.get_dummies(df, columns=dummy_columns, drop_first=drop_first)

In [4]:
def return_X_y_dfs(
        df:pd.DataFrame, y_column: str='death_yn'
        ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    return df.loc[:, df.columns != y_column], df.loc[:, y_column]

## Loading data

In [5]:
no_nulls_query = """
            SELECT *
            FROM no_null_data;
            """

In [6]:
conn = make_postgres_conn('covid_cases')

In [7]:
df = pd.read_sql(no_nulls_query, conn)
conn.close()

In [8]:
df.head(1)

Unnamed: 0,case_month,res_state,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn,low_income_score,perc_tribal_ct,perc_rural_ct
0,2020-06,KS,"SHAWNEE, KS",20177,18 to 49 years,Female,White,Hispanic/Latino,3.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,No,0.0,0.0,0.023256


In [9]:
df.shape[0]

61650

## Data transformation

In [10]:
df['death_yn'] = np.where(df['death_yn']=='Yes', 1, 0)
df['low_income_score'] = df['low_income_score'].replace(
    [0.0, 7.0, 15.0], ['Low', 'Medium', 'High'])
df['case_month'] = pd.to_datetime(df['case_month']).dt.month_name()

In [11]:
columns_drop = ['res_county', 'county_fips_code', 'process', 'current_status']
columns_dummy = ['case_month', 'res_state', 'age_group', 'sex', 'race', 'ethnicity', 
    'exposure_yn', 'symptom_status', 'hosp_yn', 'icu_yn', 'underlying_conditions_yn', 
    'low_income_score']

In [12]:
df = df.loc[df['current_status']=='Laboratory-confirmed case', :]

In [13]:
df.reset_index(drop=True, inplace=True)

In [14]:
df.shape[0]

57515

In [15]:
df

Unnamed: 0,case_month,res_state,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn,low_income_score,perc_tribal_ct,perc_rural_ct
0,June,KS,"SHAWNEE, KS",20177,18 to 49 years,Female,White,Hispanic/Latino,3.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,No,Low,0.000000,0.023256
1,December,NV,"CLARK, NV",32003,18 to 49 years,Male,Asian,Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.014374,0.030801
2,April,OH,"MONTGOMERY, OH",39113,18 to 49 years,Female,White,Hispanic/Latino,0.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Medium,0.000000,0.006536
3,June,KS,"SHAWNEE, KS",20177,18 to 49 years,Female,White,Hispanic/Latino,0.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.000000,0.023256
4,April,KS,"LYON, KS",20111,18 to 49 years,Female,White,Hispanic/Latino,0.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Medium,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57510,March,NV,"CLARK, NV",32003,65+ years,Male,White,Non-Hispanic/Latino,0.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,Yes,No,0,Yes,Low,0.014374,0.030801
57511,March,NV,"CLARK, NV",32003,65+ years,Male,White,Non-Hispanic/Latino,0.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.014374,0.030801
57512,March,NV,"CLARK, NV",32003,65+ years,Male,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.014374,0.030801
57513,March,NV,"CLARK, NV",32003,65+ years,Male,White,Non-Hispanic/Latino,0.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.014374,0.030801


In [16]:
df.drop(columns=columns_drop, inplace=True)

In [17]:
# for logistic regression: drop_first=True, for random forest: drop_first=False
return_dummified_df(df, columns_dummy, drop_first=False).head()

Unnamed: 0,case_positive_specimen_interval,case_onset_interval,death_yn,perc_tribal_ct,perc_rural_ct,case_month_April,case_month_August,case_month_December,case_month_February,case_month_January,case_month_July,case_month_June,case_month_March,case_month_May,case_month_November,case_month_October,case_month_September,res_state_AK,res_state_CO,res_state_FL,res_state_IA,res_state_ID,res_state_KS,res_state_KY,res_state_LA,res_state_MA,res_state_NC,res_state_NV,res_state_OH,res_state_UT,res_state_VT,age_group_0 - 17 years,age_group_18 to 49 years,age_group_50 to 64 years,age_group_65+ years,sex_Female,sex_Male,race_American Indian/Alaska Native,race_Asian,race_Black,race_Multiple/Other,race_Native Hawaiian/Other Pacific Islander,race_White,ethnicity_Hispanic/Latino,ethnicity_Non-Hispanic/Latino,exposure_yn_Yes,symptom_status_Symptomatic,hosp_yn_No,hosp_yn_Yes,icu_yn_No,icu_yn_Yes,underlying_conditions_yn_No,underlying_conditions_yn_Yes,low_income_score_High,low_income_score_Low,low_income_score_Medium
0,3.0,0.0,0,0.0,0.023256,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,1,1,1,0,1,0,1,0,0,1,0
1,0.0,0.0,0,0.014374,0.030801,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,1,1,1,0,1,0,0,1,0,1,0
2,0.0,0.0,0,0.0,0.006536,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1
3,0.0,0.0,0,0.0,0.023256,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,1,1,1,0,1,0,0,1,0,1,0
4,0.0,0.0,0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1


## Dummy Classifier

In [18]:
data_rf = return_dummified_df(df, columns_dummy, drop_first=False)

In [19]:
X_rf, y_rf = return_X_y_dfs(data_rf)

In [20]:
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf)

In [71]:
model_dum = DummyClassifier(strategy='stratified')
model_dum.fit(X_rf_train, y_rf_train)

DummyClassifier(strategy='stratified')

In [72]:
y_dum_pred = model_dum.predict(X_rf_test)

In [73]:
recall_score(y_rf_test, y_dum_pred)

0.015822784810126583

## Logistic Regression

In [44]:
data_log = return_dummified_df(df, columns_dummy, drop_first=True)

In [45]:
X_log, y_log = return_X_y_dfs(data_log)

In [46]:
X_log_train, X_log_test, y_log_train, y_log_test = train_test_split(X_log, y_log)

In [51]:
# model_log = LogisticRegressionCV(Cs=10, cv=1, solver='saga')
model_log = LogisticRegression(solver='saga')

In [52]:
model_log.fit(X_log_train, y_log_train)



LogisticRegression(solver='saga')

In [53]:
yhat_log = model_log.predict(X_log_test)

In [54]:
recall_score(y_log_test, yhat_log)

0.33636363636363636

In [55]:
model_log.coef_

array([[-0.03157049, -0.01492853, -0.2995607 , -1.09264555, -0.58636019,
         0.4354178 , -0.56434987, -0.2302762 , -0.25157831, -0.94481393,
        -0.577062  , -0.14791968,  0.19587075, -0.72587984, -0.98997727,
         0.34374012, -0.032094  , -0.25870803, -0.31125822, -0.86417356,
        -0.00595285,  0.03060142,  1.50386845, -0.0034648 , -0.86074917,
        -1.23914152, -0.80081502, -0.04029353, -2.56917377, -0.65776075,
         2.4252576 ,  0.14088023, -0.26059636, -0.93742026, -0.64869608,
        -0.34451084, -0.35293931, -0.38288476,  1.23308797,  1.82097258,
        -0.31622593, -0.67837892, -0.44853858]])

## Random Forest

In [None]:
model_rf = RandomForestClassifier(n_estimators=100, criterion='gini')

In [None]:
model_rf.fit(X_rf_train, y_rf_train)

In [None]:
yhat_rf = model_rf.predict(X_rf_test)

In [None]:
recall_score(y_rf_test, yhat_rf)