# Preliminary Predictive Modeling

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

# from typing import Union, Type, List, Dict
from typing import List, Tuple

pd.set_option("display.max_columns", None)

In [2]:
from pipeline_to_sql import make_postgres_conn

## Functions

In [3]:
def return_dummified_df(df: pd.DataFrame, dummy_columns: List[str],
        drop_first: bool=False) -> pd.DataFrame:
    """One-hot encode the selected columns of a DataFrame.

    Args:
        df: Frame containing the columns to encode. It is not modified.
        dummy_columns: Labels of the columns handed to ``pd.get_dummies``.
        drop_first: Forwarded to ``pd.get_dummies``; when True the first
            level of every encoded column is left out.

    Returns:
        A new DataFrame in which each column in ``dummy_columns`` is replaced
        by its indicator columns; the other columns are carried over as-is.
    """
    encoding_options = {'columns': dummy_columns, 'drop_first': drop_first}
    dummified = pd.get_dummies(df, **encoding_options)
    return dummified

In [4]:
def return_X_y_dfs(
        df:pd.DataFrame, y_column: str='death_yn'
        ) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a DataFrame into a feature frame and a target column.

    Args:
        df: Frame holding both the feature columns and the target column.
        y_column: Label of the target column.

    Returns:
        ``(X, y)``: ``X`` is ``df`` restricted to every column whose label
        differs from ``y_column``; ``y`` is the ``y_column`` column itself
        (a Series when that label is unique in ``df``).

    Raises:
        KeyError: If ``y_column`` is not a column of ``df``.
    """
    y = df.loc[:, y_column]
    X = df.loc[:, df.columns != y_column]
    return X, y

## Loading data

In [5]:
# SQL text that selects every column and row of the `no_null_data` table.
no_nulls_query = """
            SELECT *
            FROM no_null_data;
            """

In [6]:
# Open a connection through the project helper from `pipeline_to_sql`.
# NOTE(review): 'covid_cases' is presumably the database name — confirm in the helper.
conn = make_postgres_conn('covid_cases')

In [7]:
# Read the full query result into a DataFrame. The connection is closed in a
# `finally` block so it is released even when the read raises.
try:
    df = pd.read_sql(no_nulls_query, conn)
finally:
    conn.close()

In [8]:
# Show a single row to check the column layout of the loaded table.
df.head(1)

Unnamed: 0,case_month,res_state,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn,low_income_score,perc_tribal_ct,perc_rural_ct
0,2020-06,KS,"SHAWNEE, KS",20177,18 to 49 years,Female,White,Hispanic/Latino,3.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,No,0.0,0.0,0.023256


In [14]:
# Number of rows in the frame.
df.shape[0]

61650

## Data transformation

In [9]:
# Encode the target as 1 ('Yes') / 0 (anything else). The guard keeps this cell
# safe to re-run: once the column is numeric, comparing it to 'Yes' again would
# silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df['death_yn']):
    df['death_yn'] = np.where(df['death_yn'] == 'Yes', 1, 0)

# Relabel the numeric scores 0 / 7 / 15 as 'Low' / 'Medium' / 'High'. Values
# outside the mapping (including ones already relabelled) pass through unchanged.
df['low_income_score'] = df['low_income_score'].replace(
    {0.0: 'Low', 7.0: 'Medium', 15.0: 'High'})

# Reduce the 'YYYY-MM' case month to the month name only (the year is discarded).
df['case_month'] = pd.to_datetime(df['case_month']).dt.month_name()

In [10]:
# Columns removed from the frame before modelling.
columns_drop = [
    'res_county',
    'county_fips_code',
    'process',
    'current_status',
]

# Categorical columns that get one-hot encoded.
columns_dummy = [
    'case_month',
    'res_state',
    'age_group',
    'sex',
    'race',
    'ethnicity',
    'exposure_yn',
    'symptom_status',
    'hosp_yn',
    'icu_yn',
    'underlying_conditions_yn',
    'low_income_score',
]

In [16]:
# Keep only the rows whose status is 'Laboratory-confirmed case'.
df = df[df['current_status'].eq('Laboratory-confirmed case')]

In [17]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
# Reassigning instead of `inplace=True` avoids mutating the frame behind
# any other reference to it.
df = df.reset_index(drop=True)

In [18]:
# Display the frame (pandas truncates long frames when rendering).
df

Unnamed: 0,case_month,res_state,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn,low_income_score,perc_tribal_ct,perc_rural_ct
0,June,KS,"SHAWNEE, KS",20177,18 to 49 years,Female,White,Hispanic/Latino,3.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,No,Low,0.000000,0.023256
1,December,NV,"CLARK, NV",32003,18 to 49 years,Male,Asian,Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.014374,0.030801
2,April,OH,"MONTGOMERY, OH",39113,18 to 49 years,Female,White,Hispanic/Latino,0.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Medium,0.000000,0.006536
3,June,KS,"SHAWNEE, KS",20177,18 to 49 years,Female,White,Hispanic/Latino,0.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.000000,0.023256
4,April,KS,"LYON, KS",20111,18 to 49 years,Female,White,Hispanic/Latino,0.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Medium,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57510,March,NV,"CLARK, NV",32003,65+ years,Male,White,Non-Hispanic/Latino,0.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,Yes,No,0,Yes,Low,0.014374,0.030801
57511,March,NV,"CLARK, NV",32003,65+ years,Male,White,Non-Hispanic/Latino,0.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.014374,0.030801
57512,March,NV,"CLARK, NV",32003,65+ years,Male,White,Non-Hispanic/Latino,0.0,0.0,Routine surveillance,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.014374,0.030801
57513,March,NV,"CLARK, NV",32003,65+ years,Male,White,Non-Hispanic/Latino,0.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,0,Yes,Low,0.014374,0.030801


In [None]:
# Remove the columns listed in `columns_drop`. `errors='ignore'` makes the cell
# safe to re-run: on a second run the columns are already gone, which would
# otherwise raise KeyError. Reassigning replaces the in-place mutation.
df = df.drop(columns=columns_drop, errors='ignore')

In [None]:
# Preview the one-hot encoded frame; the result is displayed, not stored.
# Use drop_first=True for logistic regression (drops the first level of each
# encoded column) and drop_first=False for random forest.
return_dummified_df(df, columns_dummy, drop_first=False)

In [None]:
# One-hot encode, then split into features and target using the default
# target column 'death_yn'. The (X, y) pair is only displayed, not assigned.
return_X_y_dfs(return_dummified_df(df, columns_dummy, drop_first=False))

## Logistic Regression