# Preliminary Predictive Modeling

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve

# import shap

from typing import List, Tuple, Union

import joblib

pd.set_option("display.max_columns", None)

In [2]:
from pipeline_to_sql import make_postgres_conn

## Functions

In [3]:
def transform_df(df: pd.DataFrame, columns2drop: List[str]) -> pd.DataFrame:
    """Recode the raw case data and keep only laboratory-confirmed cases.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least 'death_yn', 'low_income_score',
        'case_month' and 'current_status'.
    columns2drop : List[str]
        Columns removed from the result after filtering.

    Returns
    -------
    pd.DataFrame
        Filtered frame with a fresh 0..n-1 index where
        - 'death_yn' is 1 for 'Yes' and 0 for anything else,
        - 'low_income_score' values 0.0/7.0/15.0 become 'Low'/'Medium'/'High',
        - 'case_month' holds the month name parsed from the original value.
    """
    df_ = df.copy()
    df_['death_yn'] = np.where(df_['death_yn'] == 'Yes', 1, 0)
    df_['low_income_score'] = df_['low_income_score'].replace(
        [0.0, 7.0, 15.0], ['Low', 'Medium', 'High'])
    df_['case_month'] = pd.to_datetime(df_['case_month']).dt.month_name()

    confirmed = df_['current_status'] == 'Laboratory-confirmed case'
    df_ = df_.loc[confirmed, :].reset_index(drop=True)
    # Drop the columns the caller asked for; the previous version silently
    # relied on a notebook-level global named `columns_drop` instead.
    return df_.drop(columns=columns2drop)

In [4]:
def return_dummified_df(df: pd.DataFrame, dummy_columns: List[str], 
        drop_first: bool=False) -> pd.DataFrame:
    """One-hot encode `dummy_columns` of `df` with `pd.get_dummies`.

    `drop_first` is forwarded unchanged; when True the first level of each
    encoded column is omitted. Columns not listed are passed through as-is.
    """
    encoded = pd.get_dummies(
        data=df,
        columns=dummy_columns,
        drop_first=drop_first,
    )
    return encoded

In [5]:
def return_X_y_arrays(
        df:pd.DataFrame, y_column: str='death_yn'
        ) -> Tuple[np.ndarray, np.ndarray]:
    """Split `df` into a feature array and a target array.

    Returns `(X, y)` where `X` holds the values of every column except
    `y_column` (original column order kept) and `y` holds the values of
    `y_column`.
    """
    is_feature = df.columns != y_column
    features = df.loc[:, is_feature].values
    target = df.loc[:, y_column].values
    return features, target

In [6]:
def get_sql_data(
        db_name: str, query: str,
        params: Union[List, Tuple, dict, None]=None) -> pd.DataFrame:
    """Run `query` against the database `db_name` and return the result.

    Parameters
    ----------
    db_name : str
        Name handed to `make_postgres_conn` to open the connection.
    query : str
        SQL text passed to `pd.read_sql`.
    params : list, tuple, dict or None
        Values bound to the placeholders in `query`. Any falsy value
        (None, empty container) means the query is run without parameters.

    Returns
    -------
    pd.DataFrame
        The rows returned by the query.
    """
    conn = make_postgres_conn(db_name)
    try:
        # `params or None` preserves the original rule that a falsy `params`
        # is treated the same as not passing parameters at all.
        return pd.read_sql(query, conn, params=params or None)
    finally:
        # Always release the connection, even when the query itself fails.
        conn.close()

In [7]:
def transform_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows without a county and impute the remaining gaps.

    Works on a copy of `df`. Rows whose 'res_county' is missing are removed
    first; afterwards the categorical columns are filled with their most
    frequent value and the two interval columns with their median, both
    computed on the rows that remain.
    """
    mode_filled = ('age_group', 'sex', 'race', 'ethnicity')
    median_filled = ('case_positive_specimen_interval', 'case_onset_interval')

    working = df.copy()
    working = working[working['res_county'].notna()]

    for column in mode_filled:
        most_common = working[column].mode()[0]
        working[column] = working[column].fillna(most_common)

    for column in median_filled:
        middle = working[column].median()
        working[column] = working[column].fillna(middle)

    return working

In [8]:
def plot_precision_recall_curve(
        model, X_test, y_test, label='Model Name', title='Your Title'):
    """Draw a precision-recall curve for `model` on the given test data.

    The curve is built from the second column of `model.predict_proba(X_test)`.
    A dashed horizontal 'Random' line is drawn at the share of entries in
    `y_test` equal to 1. Nothing is returned; the figure is left as the
    current matplotlib figure.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, positive_scores)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Baseline: fraction of test labels that are 1.
    positive_share = len(y_test[y_test == 1]) / len(y_test)
    ax.plot([0, 1], [positive_share] * 2, linestyle='--', label='Random')
    ax.plot(recall, precision, marker='.', label=label)

    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(title)
    fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.85))

In [9]:
def plot_random_forest_feature_importance_mdi(
        data, model, title='Your Title', y_column='death_yn', top_n=10):
    """Plot the largest `feature_importances_` of `model` as horizontal bars.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns, minus `y_column`, label the importances.
        Assumed to list the features in the same order the model was
        fitted on -- confirm against the caller.
    model
        Fitted estimator exposing `feature_importances_`.
    title : str
        Axes title.
    y_column : str
        Target column excluded from the feature labels
        (default 'death_yn', the previously hard-coded value).
    top_n : int
        Number of highest-scoring features to show
        (default 10, the previously hard-coded value).
    """
    # NOTE(review): this label appears in the legend while the x-axis says
    # 'Mean Decrease Impurity' -- confirm which wording is intended.
    score_label = 'Fraction of Samples Affected'

    feature_names = data.columns[data.columns != y_column]
    feat_scores = pd.DataFrame(
        {score_label: model.feature_importances_}, index=feature_names)

    # Keep the top_n largest scores, then order ascending for plotting.
    feat_scores = feat_scores.sort_values(by=score_label, ascending=False)[:top_n]
    feat_scores = feat_scores.sort_values(by=score_label)

    fig, ax = plt.subplots(figsize=(10, 7))
    feat_scores.plot(kind='barh', ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Feature')
    ax.set_xlabel('Mean Decrease Impurity');

## Loading data

In [10]:
no_nulls_query = """
            SELECT *
            FROM no_null_data;
            """

In [11]:
df = get_sql_data('covid_cases', no_nulls_query)

OperationalError: could not connect to server: Connection refused
	Is the server running on host "localhost" (::1) and accepting
	TCP/IP connections on port 5432?
could not connect to server: Connection refused
	Is the server running on host "localhost" (127.0.0.1) and accepting
	TCP/IP connections on port 5432?


In [None]:
df.shape[0]

## Data transformation

In [None]:
# Columns removed from each dataset by transform_df (passed as `columns2drop`).
columns_drop = ['res_county', 'county_fips_code', 'process', 'current_status']
# Columns expanded into indicator (dummy) variables by return_dummified_df.
columns_dummy = ['case_month', 'res_state', 'age_group', 'sex', 'race', 'ethnicity', 
'exposure_yn', 'symptom_status', 'hosp_yn', 'icu_yn', 'underlying_conditions_yn', 
'low_income_score']

In [None]:
df.shape[0]

In [None]:
df = transform_df(df, columns2drop=columns_drop)

In [None]:
df.shape[0]

## Dummy Classifier

In [None]:
data_rf = return_dummified_df(df, columns_dummy, drop_first=False)

In [None]:
X_rf, y_rf = return_X_y_arrays(data_rf)

In [None]:
y_rf

In [None]:
# Fixed seed so the same rows land in train/test on every kernel restart.
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
    X_rf, y_rf, test_size=0.2, random_state=42)

In [None]:
# Seeded: the 'stratified' strategy draws its predictions at random, so an
# unseeded baseline reports different recall/precision on every run.
model_dum = DummyClassifier(strategy='stratified', random_state=42)
model_dum.fit(X_rf_train, y_rf_train)

In [None]:
y_dum_pred = model_dum.predict(X_rf_test)

In [None]:
def print_recall_precision(y_test, y_pred):
    """Print recall and then precision of `y_pred` against `y_test`.

    Each score is computed with the matching scikit-learn metric and
    printed on its own line with five decimal places.
    """
    report_lines = (
        ('Recall:    {:0.5f}', recall_score),
        ('Precision: {:0.5f}', precision_score),
    )
    for template, scorer in report_lines:
        print(template.format(scorer(y_test, y_pred)))

In [None]:
print_recall_precision(y_rf_test, y_dum_pred)

## Logistic Regression

In [None]:
data_log = return_dummified_df(df, columns_dummy, drop_first=True)

In [None]:
data_log.shape

In [None]:
X_log, y_log = return_X_y_arrays(data_log)

In [None]:
X_log[:, :2]

In [None]:
# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# test rows' min/max leak into the features the model is trained on.
X_log_train, X_log_test, y_log_train, y_log_test = train_test_split(
    X_log, y_log, test_size=0.2, random_state=42)

# NOTE(review): assumes the first two columns of X_log are the continuous
# features that need scaling -- confirm against data_log.columns.
scaler_mms = MinMaxScaler()
X_log_train[:, :2] = scaler_mms.fit_transform(X_log_train[:, :2])
# The test split is only transformed with the statistics learned on train.
X_log_test[:, :2] = scaler_mms.transform(X_log_test[:, :2])

In [None]:
model_log = LogisticRegression(solver='lbfgs', max_iter=500)

In [None]:
model_log.fit(X_log_train, y_log_train)

In [None]:
yhat_log = model_log.predict(X_log_test)
# log_probs = model_log.predict_proba(X_log_test)[:, 1]
# log_prec, log_rec, _ = precision_recall_curve(y_log_test, log_probs)

In [None]:
# fig, ax = plt.subplots(figsize=(10, 6))
# no_skill = len(y_log_test[y_log_test==1]) / len(y_log_test)
# ax.plot([0, 1], [no_skill, no_skill], linestyle='--', label='Random')
# ax.plot(log_rec, log_prec, marker='.', label='Logistic')
# ax.set_xlabel('Recall')
# ax.set_ylabel('Precision')
# ax.set_title('Precision-Recall Plot: Logistic, 57.5k rows, No Null and Missing Values')
# fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.85));

In [None]:
# Title previously rendered as "Logistic , 57.5k rows" (stray space before
# the comma) because of how the two string pieces were concatenated.
plot_precision_recall_curve(
    model_log, X_log_test, y_log_test, label='Logistic',
    title=('Precision-Recall Plot: Logistic'
        + ', 57.5k rows, No Null and Missing Values'))
plt.savefig('./../images/prp-logistic-no-null-no-missing.png');

In [None]:
print_recall_precision(y_log_test, yhat_log)

In [None]:
# Confusion matrix for the logistic model on the no-null dataset.
fig, ax = plt.subplots(figsize=(10, 7))
cm_log = confusion_matrix(y_log_test, yhat_log, labels=model_log.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_log, display_labels=model_log.classes_)
disp.plot(ax=ax)
ax.set_title('Logistic, 57.5k rows, No Null and Missing Values', fontsize=18)
ax.tick_params(axis='both', labelsize=16)
ax.set_ylabel(ax.get_ylabel(), fontsize=16)
ax.set_xlabel(ax.get_xlabel(), fontsize=16)

for labels in disp.text_.ravel():
    labels.set_fontsize(16)

# Turn the grid off explicitly; plt.grid(None) only *toggles* it, which
# hides the grid solely because the ggplot style switched it on first.
ax.grid(False)
plt.tight_layout()
plt.savefig('./../images/cm-logistic-no-null-no-missing.png');

## Random Forest

In [None]:
X_rf_train

In [None]:
y_rf_train

In [None]:
# Seeded so the fitted forest (and its metrics/importances) is reproducible.
model_rf = RandomForestClassifier(
    n_estimators=100, criterion='gini', random_state=42)

In [None]:
model_rf.fit(X_rf_train, y_rf_train)

In [None]:
yhat_rf = model_rf.predict(X_rf_test)
# rf_probs = model_rf.predict_proba(X_rf_test)[:, 1]
# log_prec, log_rec, _ = precision_recall_curve(y_rf_test, rf_probs)

In [None]:
print_recall_precision(y_rf_test, yhat_rf)

In [None]:
plot_precision_recall_curve(
    model_rf, X_rf_test, y_rf_test, label='Random Forest', 
    title=('Precision-Recall Plot: Random '
        + 'Forest, 57.5k rows, No Null and Missing Values'))
plt.savefig('./../images/prp-rforest-no-null-no-missing.png')

In [None]:
# Confusion matrix for the random forest on the no-null dataset.
fig, ax = plt.subplots(figsize=(10, 7))
cm_rf = confusion_matrix(y_rf_test, yhat_rf, labels=model_rf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=model_rf.classes_)
disp.plot(ax=ax)
ax.set_title('Random Forest, 57.5k rows, No Null and Missing Values', fontsize=18)
ax.tick_params(axis='both', labelsize=16)
ax.set_ylabel(ax.get_ylabel(), fontsize=16)
ax.set_xlabel(ax.get_xlabel(), fontsize=16)

for labels in disp.text_.ravel():
    labels.set_fontsize(16)

# Turn the grid off explicitly; plt.grid(None) only *toggles* it, which
# hides the grid solely because the ggplot style switched it on first.
ax.grid(False)
plt.tight_layout()
plt.savefig('./../images/cm-rforest-no-null-no-missing.png');

### Feature importance: mean decrease impurity

In [None]:
plot_random_forest_feature_importance_mdi(
    data_rf, model_rf, title=('Top 10 Feature Importance, Mean Decrease '
    + 'Impurity: 57.5k, No Null or Missing Values'))
plt.savefig('./../images/feature-importance-no-null-no-missing.png')

### Trying shap for random forest model

In [None]:
# explainer = shap.TreeExplainer(model_rf)
# shap_values_rf = explainer.shap_values(X_rf_test)

In [None]:
# shap.summary_plot(shap_values=shap_values_rf,
#                  features=X_rf_train,
#                  features_names=data_rf.loc[:, data_rf.columns != 'death_yn'],
#                  plot_type='bar')

## Using 'some' dataset and imputing values

In [None]:
some_nulls_query = """
            SELECT *
            FROM some_null_data;
            """

In [None]:
df_some = get_sql_data('covid_cases', some_nulls_query)

In [None]:
df_some.head(2)

In [None]:
df_some.shape[0]

In [None]:
df_some = transform_missing_data(df_some)

In [None]:
df_some.shape[0]

In [None]:
df_some = transform_df(df_some, columns2drop=columns_drop)

In [None]:
df_some.shape[0]

## Logistic with imputed values

In [None]:
data_some_log = return_dummified_df(df_some, columns_dummy, drop_first=True)

In [None]:
data_some_log.shape

In [None]:
data_some_log.columns

In [None]:
X_slog, y_slog = return_X_y_arrays(data_some_log)

In [None]:
# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# test rows' min/max leak into the features the model is trained on.
X_slog_train, X_slog_test, y_slog_train, y_slog_test = train_test_split(
    X_slog, y_slog, test_size=0.2, random_state=42)

# NOTE(review): assumes the first two columns of X_slog are the continuous
# features that need scaling -- confirm against data_some_log.columns.
scaler_mms = MinMaxScaler()
X_slog_train[:, :2] = scaler_mms.fit_transform(X_slog_train[:, :2])
# The test split is only transformed with the statistics learned on train.
X_slog_test[:, :2] = scaler_mms.transform(X_slog_test[:, :2])

In [None]:
model_slog = LogisticRegression(solver='lbfgs', max_iter=500)

In [None]:
model_slog.fit(X_slog_train, y_slog_train)

In [None]:
yhat_slog = model_slog.predict(X_slog_test)

In [None]:
print_recall_precision(y_slog_test, yhat_slog)

In [None]:
# Confusion matrix for the logistic model on the imputed 'some' dataset.
fig, ax = plt.subplots(figsize=(10, 7))
cm_slog = confusion_matrix(y_slog_test, yhat_slog, labels=model_slog.classes_)
disp_slog = ConfusionMatrixDisplay(confusion_matrix=cm_slog, display_labels=model_slog.classes_)
disp_slog.plot(ax=ax)
ax.set_title('Logistic, 203K rows, Imputed Null with Missing Values', fontsize=18)

ax.tick_params(axis='both', labelsize=16)
ax.set_ylabel(ax.get_ylabel(), fontsize=16)
ax.set_xlabel(ax.get_xlabel(), fontsize=16)

for labels in disp_slog.text_.ravel():
    labels.set_fontsize(16)

# Turn the grid off explicitly; plt.grid(None) only *toggles* it, which
# hides the grid solely because the ggplot style switched it on first.
ax.grid(False)
plt.tight_layout()
plt.savefig('./../images/cm-logistic-imputed-null-keep-missing.png');

In [None]:
plot_precision_recall_curve(
    model_slog, X_slog_test, y_slog_test, label='Logistic',
    title=('Precision-Recall Plot: Logistic'
        + ', 203K rows, Imputed Null with Missing Values'))
plt.savefig('./../images/prp-logistic-imputed-null-with-missing.png')

## Random forest with imputed values

In [None]:
data_srf = return_dummified_df(df_some, columns_dummy, drop_first=False)

In [None]:
X_srf, y_srf = return_X_y_arrays(data_srf)

In [None]:
# Fixed seed so the same rows land in train/test on every kernel restart.
X_srf_train, X_srf_test, y_srf_train, y_srf_test = train_test_split(
    X_srf, y_srf, test_size=0.2, random_state=42)

In [None]:
X_srf_train

In [None]:
y_srf_train

In [None]:
# Seeded so the fitted forest (and its metrics/importances) is reproducible.
model_srf = RandomForestClassifier(
    n_estimators=100, criterion='gini', random_state=42)

In [None]:
model_srf.fit(X_srf_train, y_srf_train)

In [None]:
yhat_srf = model_srf.predict(X_srf_test)

In [None]:
print_recall_precision(y_srf_test, yhat_srf)

In [None]:
# Confusion matrix for the random forest on the imputed 'some' dataset.
fig, ax = plt.subplots(figsize=(10, 7))
cm_srf = confusion_matrix(y_srf_test, yhat_srf, labels=model_srf.classes_)
disp_s = ConfusionMatrixDisplay(confusion_matrix=cm_srf, display_labels=model_srf.classes_)
disp_s.plot(ax=ax)
ax.set_title('Random Forest, 203K rows, Imputed Null with Missing Values', fontsize=18)
ax.tick_params(axis='both', labelsize=16)
ax.set_ylabel(ax.get_ylabel(), fontsize=16)
ax.set_xlabel(ax.get_xlabel(), fontsize=16)

for labels in disp_s.text_.ravel():
    labels.set_fontsize(16)

# Turn the grid off explicitly; plt.grid(None) only *toggles* it, which
# hides the grid solely because the ggplot style switched it on first.
ax.grid(False)
plt.tight_layout()
plt.savefig('./../images/cm-rforest-imputed-null-keep-missing.png');

In [None]:
# Title previously rendered as "Random Forest 203K rows" -- the comma that
# every other plot title has after the model name was missing.
plot_precision_recall_curve(
    model_srf, X_srf_test, y_srf_test, label='Random Forest',
    title=('Precision-Recall Plot: Random Forest, '
        + '203K rows, Imputed Null with Missing Values'))
plt.savefig('./../images/prp-rforest-imputed-null-with-missing-values.png')

In [None]:
plot_random_forest_feature_importance_mdi(
    data_srf, model_srf, title=('Top 10 Feature Importance, Mean Decrease '
    + 'Impurity: 203k rows, Imputed Null with Missing Values'))
plt.savefig('./../images/feature-importance-imputed-null-with-missing.png')

In [None]:
# NOTE(review): sys.exit() raises SystemExit, so a top-to-bottom run stops
# here and the cells below are not executed. Presumably a deliberate guard
# to skip the 3M-row 'all' dataset section -- confirm, and remove this cell
# when that section should run.
import sys
sys.exit()

## Using 'all' dataset and imputing values
Removing high null/missing columns

In [None]:
num_rows = 3_000_000

In [None]:
num_rows_string = f'{num_rows/1_000_000}M'

In [None]:
all_query = """
            SELECT *
            FROM all_case_data
            LIMIT %s;
            """

In [None]:
df_all = get_sql_data('covid_cases', all_query, params=[num_rows])

In [None]:
df_all.shape[0]

In [None]:
df_all.head(2)

In [None]:
df_all = transform_missing_data(df_all)

In [None]:
df_all.shape[0]

In [None]:
df_all = transform_df(df_all, columns2drop=columns_drop)

In [None]:
df_all.shape[0]

In [None]:
df_all = df_all.drop(
    columns=['underlying_conditions_yn', 'icu_yn', 'exposure_yn', 
        'case_positive_specimen_interval'])

In [None]:
columns_dummy_all = ['case_month', 'res_state', 'age_group', 'sex', 'race', 'ethnicity', 'symptom_status', 'hosp_yn', 'low_income_score']

## Logistic with all dataset

In [None]:
data_all_log = return_dummified_df(df_all, columns_dummy_all, drop_first=True)

In [None]:
data_all_log.shape

In [None]:
data_all_log.columns

In [None]:
X_alog, y_alog = return_X_y_arrays(data_all_log)

In [None]:
# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# test rows' min/max leak into the features the model is trained on.
X_alog_train, X_alog_test, y_alog_train, y_alog_test = train_test_split(
    X_alog, y_alog, test_size=0.2, random_state=42)

# NOTE(review): 'case_positive_specimen_interval' was dropped from df_all
# earlier, so columns 0-1 may no longer both be continuous features --
# confirm which columns `[:, :2]` selects via data_all_log.columns.
scaler_mms = MinMaxScaler()
X_alog_train[:, :2] = scaler_mms.fit_transform(X_alog_train[:, :2])
# The test split is only transformed with the statistics learned on train.
X_alog_test[:, :2] = scaler_mms.transform(X_alog_test[:, :2])

In [None]:
model_alog = LogisticRegression(solver='lbfgs', max_iter=500)

In [None]:
model_alog.fit(X_alog_train, y_alog_train)

In [None]:
yhat_alog = model_alog.predict(X_alog_test)

In [None]:
print_recall_precision(y_alog_test, yhat_alog)

In [None]:
# Confusion matrix for the logistic model on the reduced 'all' dataset.
fig, ax = plt.subplots(figsize=(10, 7))
cm_alog = confusion_matrix(y_alog_test, yhat_alog, labels=model_alog.classes_)
disp_alog = ConfusionMatrixDisplay(confusion_matrix=cm_alog, display_labels=model_alog.classes_)
disp_alog.plot(ax=ax)
ax.set_title(f'Logistic, {num_rows_string} rows, High Missing Data Features Removed', fontsize=18)

ax.tick_params(axis='both', labelsize=16)
ax.set_ylabel(ax.get_ylabel(), fontsize=16)
ax.set_xlabel(ax.get_xlabel(), fontsize=16)

for labels in disp_alog.text_.ravel():
    labels.set_fontsize(16)

# Turn the grid off explicitly; plt.grid(None) only *toggles* it, which
# hides the grid solely because the ggplot style switched it on first.
ax.grid(False)
plt.tight_layout()
plt.savefig('./../images/cm-logistic-high-missing-features-removed.png');

In [None]:
plot_precision_recall_curve(
    model_alog, X_alog_test, y_alog_test, label='Logistic',
    title=('Precision-Recall Plot: Logistic, '
        + f'{num_rows_string} rows, High Missing Data Features Removed'))
plt.savefig('./../images/prp-logistic-high-missing-data-features-removed.png')

## Random forest with all dataset

In [None]:
data_arf = return_dummified_df(df_all, columns_dummy_all, drop_first=False)

In [None]:
X_arf, y_arf = return_X_y_arrays(data_arf)

In [None]:
# Fixed seed so the same rows land in train/test on every kernel restart.
X_arf_train, X_arf_test, y_arf_train, y_arf_test = train_test_split(
    X_arf, y_arf, test_size=0.2, random_state=42)

In [None]:
# Seeded so the fitted forest (and its metrics/importances) is reproducible.
model_arf = RandomForestClassifier(
    n_estimators=100, criterion='gini', random_state=42)

In [None]:
model_arf.fit(X_arf_train, y_arf_train)

In [None]:
yhat_arf = model_arf.predict(X_arf_test)

In [None]:
print_recall_precision(y_arf_test, yhat_arf)

In [None]:
# joblib.dump(model_arf, './../model/random-forest-high-null-features-removed.sav')

In [None]:
# Confusion matrix for the random forest on the reduced 'all' dataset.
fig, ax = plt.subplots(figsize=(10, 7))
cm_arf = confusion_matrix(y_arf_test, yhat_arf, labels=model_arf.classes_)
disp_a = ConfusionMatrixDisplay(confusion_matrix=cm_arf, display_labels=model_arf.classes_)
disp_a.plot(ax=ax)
ax.set_title(f'Random Forest, {num_rows_string} rows, High Missing Data Features Removed', fontsize=18)
ax.tick_params(axis='both', labelsize=16)
ax.set_ylabel(ax.get_ylabel(), fontsize=16)
ax.set_xlabel(ax.get_xlabel(), fontsize=16)

for labels in disp_a.text_.ravel():
    labels.set_fontsize(16)

# Turn the grid off explicitly; plt.grid(None) only *toggles* it, which
# hides the grid solely because the ggplot style switched it on first.
ax.grid(False)
plt.tight_layout()
plt.savefig('./../images/cm-rforest-high-missing-features-removed.png');

In [None]:
plot_precision_recall_curve(
    model_arf, X_arf_test, y_arf_test, label='Random Forest',
    title=('Precision-Recall Plot: Random Forest, '
        + f'{num_rows_string} rows, High Missing Data Features Removed'))
plt.savefig('./../images/prp-rforest-high-missing-data-features-removed.png')

In [None]:
plot_random_forest_feature_importance_mdi(
    data_arf, model_arf, title=('Top 10 Feature Importance, Mean Decrease '
    + f'Impurity: {num_rows_string} rows, High Missing Data Features Removed'))
plt.savefig('./../images/feature-importance-high-missing-features-removed.png')

In [None]:
# feat_scores_a.reset_index()['index'][feat_scores_a.reset_index()['index'].str.contains('res_state')].reset_index()