In [164]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [18]:
# !pip install imblearn

In [98]:
#!pip install mlxtend

In [165]:
import pandas as pd
import numpy as np
import scipy.stats as st
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet


from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report


from sklearn.datasets import make_classification

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions
%matplotlib inline



## Logistic Regression Redux

I had initially hoped that either the Random Forest or XGBoost models with over or undersampling might lend some support for feature selection that I could then use to dig into a logistic regression model, which seemed due to the binary nature of the problem to be the model best suited to this data.  Unfortunately, RF and XGB did not really lend any clarity here.  

I attempted to build a model with smaller subsets of the data, including respectively only hospitalized patients, and only immunosuppressed patients.  I additionally worked on some feature engineering, to try to draw out any correlations specifically around the `inmsupr` feature in the data.  I also attempted to use GridSearchCV to aid in finding the best possible combination of C and L1 versus L2 penalty for regularization. In addition, I applied oversampling and undersampling with the logistic regression model, in the hopes that this might provide better results.

However, after all of these experiments, it became clear that my original baseline logistic regression model, from my Minimum Viable Product in Notebook 04, had better validation results than any other model, and ultimately better test results as well.  

### Hospitalization Subset

In [20]:
covid_hosp = pd.read_json('covid_hosp.json', lines=True)

In [21]:
categoricals = [ 'sex','patient_type', 'pneumonia', 'diabetes', 'copd', 'asthma', 'inmsupr',
       'hypertension', 'other_disease', 'cardiovascular', 'obesity',
       'renal_chronic', 'tobacco', 'pregnancy', 'icu', 'intubed', 'covid_res',
       'contact_other_covid', 'passed']

In [22]:
covid_hosp[categoricals] = covid_hosp[categoricals].astype('category')

In [23]:
covid_hosp.shape

(120026, 24)

In [9]:
# train / test split
hosp_train, hosp_test = train_test_split(covid_hosp, test_size=0.2, random_state=33, stratify=covid_hosp['passed'].values)

In [10]:
# train / validation split
hosp_train, hosp_val = train_test_split(hosp_train, test_size=0.2, random_state=33, stratify=hosp_train['passed'].values)

### Immunosuppressed Subset

In [4]:
covid_immun = pd.read_json('covid_immun.json', lines=True)

In [8]:
covid_immun[categoricals] = covid_immun[categoricals].astype('category')

In [14]:
covid_immun.shape

(8904, 24)

In [11]:
# train / test split
immun_train, immun_test = train_test_split(covid_immun, test_size=0.2, random_state=33, stratify=covid_immun['passed'].values)

In [12]:
# train / validation split
immun_train, immun_val = train_test_split(immun_train, test_size=0.2, random_state=33, stratify=immun_train['passed'].values)

### Engineered Features


In [166]:
covid_feat = pd.read_json('covid_hosp_feats.json', lines=True)

In [167]:
covid_feat.shape

(120026, 29)

In [168]:
covid_feat.columns

Index(['id', 'sex', 'patient_type', 'entry_date', 'date_symptoms', 'date_died',
       'intubed', 'pneumonia', 'age', 'pregnancy', 'diabetes', 'copd',
       'asthma', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular',
       'obesity', 'renal_chronic', 'tobacco', 'contact_other_covid',
       'covid_res', 'icu', 'passed', 'comorb_count', 'imm_comorb',
       'imm_covid_pos', 'imm_other_dis', 'imm_lung_disease'],
      dtype='object')

In [169]:
covid_res = pd.get_dummies(covid_feat['covid_res'])

In [170]:
covid_res.head()

Unnamed: 0,0,1,2
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [171]:
covid_feat = pd.get_dummies(covid_feat, columns=['covid_res'], drop_first=True, prefix='covid_res')

In [172]:
covid_feat.head()

Unnamed: 0,id,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,...,contact_other_covid,icu,passed,comorb_count,imm_comorb,imm_covid_pos,imm_other_dis,imm_lung_disease,covid_res_1,covid_res_2
0,18be58,1,1,2020-06-19T00:00:00.000Z,2020-06-19T00:00:00.000Z,9999-99-99,0,1,57,0,...,1,0,0,3,0,0,0,0,1,0
1,0c3c05,0,1,2020-05-04T00:00:00.000Z,2020-04-28T00:00:00.000Z,9999-99-99,0,0,66,0,...,0,0,0,0,0,0,0,0,1,0
2,06861b,1,1,2020-05-05T00:00:00.000Z,2020-04-29T00:00:00.000Z,08-05-2020,0,0,55,0,...,0,0,1,2,0,0,0,0,1,0
3,1e0b21,0,1,2020-06-14T00:00:00.000Z,2020-06-14T00:00:00.000Z,9999-99-99,0,0,35,0,...,0,0,0,1,0,0,0,0,1,0
4,16b611,1,1,2020-04-20T00:00:00.000Z,2020-04-10T00:00:00.000Z,30-04-2020,0,1,44,0,...,0,0,1,0,0,0,0,0,1,0


In [173]:
covid_feat.columns

Index(['id', 'sex', 'patient_type', 'entry_date', 'date_symptoms', 'date_died',
       'intubed', 'pneumonia', 'age', 'pregnancy', 'diabetes', 'copd',
       'asthma', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular',
       'obesity', 'renal_chronic', 'tobacco', 'contact_other_covid', 'icu',
       'passed', 'comorb_count', 'imm_comorb', 'imm_covid_pos',
       'imm_other_dis', 'imm_lung_disease', 'covid_res_1', 'covid_res_2'],
      dtype='object')

In [175]:
categoricals = [ 'sex','patient_type', 'pneumonia', 'diabetes', 'copd', 'asthma', 'inmsupr',
       'hypertension', 'other_disease', 'cardiovascular', 'obesity',
       'renal_chronic', 'tobacco', 'pregnancy', 'icu', 'intubed',
       'contact_other_covid', 'passed', 'covid_res_1', 'covid_res_2']

In [176]:
covid_feat[categoricals] = covid_feat[categoricals].astype('category')

In [179]:
# train / test split

feat_train, feat_test = train_test_split(covid_feat, test_size=0.2, random_state=33, stratify=covid_feat['passed'].values)

In [180]:
# train / validation split
feat_train, feat_val = train_test_split(feat_train, test_size=0.2, random_state=33, stratify=feat_train['passed'].values)

In [72]:
features = ['sex', 'intubed', 'pneumonia', 'age', 'pregnancy', 'diabetes', 'copd',
       'asthma', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular',
       'obesity', 'renal_chronic', 'tobacco', 'contact_other_covid', 'icu', 'comorb_count', 'imm_comorb',
       'imm_covid_pos', 'imm_other_dis', 'imm_lung_disease','covid_res_1', 'covid_res_2']

In [94]:
# baseline, no penalty
feat_lr = LogisticRegression(solver='saga', max_iter=1000)
xtrain = feat_train[features]
ytrain = feat_train['passed']

scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
feat_lr.fit(xtrain, ytrain)

LogisticRegression(max_iter=1000, solver='saga')

In [78]:
xval = scaler.transform(feat_val[features])
yval = feat_val['passed']
lr_preds = feat_lr.predict(xval)
lr_conf = confusion_matrix(yval, lr_preds)
lr_conf

array([[13271,   821],
       [ 3725,  1387]])

In [54]:
import statsmodels.api as sm

In [74]:
# statsmodels for comparison
feat_sm = sm.Logit(ytrain, sm.add_constant(xtrain))
feat_sm = feat_sm.fit()

Optimization terminated successfully.
         Current function value: 0.490793
         Iterations 12


In [75]:
feat_sm.summary()

0,1,2,3
Dep. Variable:,passed,No. Observations:,76816.0
Model:,Logit,Df Residuals:,76792.0
Method:,MLE,Df Model:,23.0
Date:,"Mon, 08 Feb 2021",Pseudo R-squ.:,0.153
Time:,08:38:45,Log-Likelihood:,-37701.0
converged:,True,LL-Null:,-44509.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.2797,0.010,-124.569,0.000,-1.300,-1.260
x1,0.1315,0.009,14.094,0.000,0.113,0.150
x2,0.3896,0.009,41.344,0.000,0.371,0.408
x3,0.2787,0.010,28.268,0.000,0.259,0.298
x4,0.6125,0.012,52.736,0.000,0.590,0.635
x5,-0.0991,0.021,-4.647,0.000,-0.141,-0.057
x6,0.0425,2.35e+05,1.81e-07,1.000,-4.6e+05,4.6e+05
x7,-0.0129,1.07e+05,-1.21e-07,1.000,-2.09e+05,2.09e+05
x8,-0.0493,8.17e+04,-6.03e-07,1.000,-1.6e+05,1.6e+05


In [76]:
feat_lr.intercept_, feat_lr.coef_

(array([-1.27966392]),
 array([[ 0.1314572 ,  0.3895093 ,  0.27866462,  0.61238222, -0.09905339,
          0.04249466, -0.01288198, -0.04931456,  0.09618228,  0.02576786,
          0.05556881, -0.01486286,  0.04605757,  0.07194404, -0.01636137,
         -0.28688245, -0.0094236 ,  0.04720759, -0.03764559,  0.00089342,
         -0.00074813, -0.02153534,  0.45165612, -0.13105201]]))

In [79]:
# scoring on training set
print(classification_report(ytrain, feat_lr.predict(xtrain)))

              precision    recall  f1-score   support

           0       0.78      0.94      0.85     56369
           1       0.63      0.27      0.38     20447

    accuracy                           0.76     76816
   macro avg       0.70      0.61      0.62     76816
weighted avg       0.74      0.76      0.73     76816



In [80]:
# scoring on validation set
print(classification_report(yval, feat_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.78      0.94      0.85     14092
           1       0.63      0.27      0.38      5112

    accuracy                           0.76     19204
   macro avg       0.70      0.61      0.62     19204
weighted avg       0.74      0.76      0.73     19204



In [82]:
feat_lr.predict_proba(xval)[:10]

array([[0.98244384, 0.01755616],
       [0.80990819, 0.19009181],
       [0.41001012, 0.58998988],
       [0.92090796, 0.07909204],
       [0.75076525, 0.24923475],
       [0.95976289, 0.04023711],
       [0.85676989, 0.14323011],
       [0.93328275, 0.06671725],
       [0.79761172, 0.20238828],
       [0.77265747, 0.22734253]])

### Addressing Imbalanced Data in the Feature Engineered Set

#### Basic Oversampling

In [96]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=33)
x_resampled, y_resampled = ros.fit_sample(xtrain, ytrain)
Counter(y_resampled)

Counter({0: 56369, 1: 56369})

In [106]:
# oversampled, no penalty
osample_lr = LogisticRegression(solver='saga', max_iter=1000)

scaler = StandardScaler()
x_resampled = scaler.fit_transform(x_resampled)
osample_lr.fit(x_resampled, y_resampled)

LogisticRegression(max_iter=1000, solver='saga')

In [107]:
osample_preds = osample_lr.predict(xval)
osample_conf = confusion_matrix(yval, osample_preds)
osample_conf

array([[8190, 5902],
       [1087, 4025]])

In [108]:
# scoring on training set
print(classification_report(y_resampled, osample_lr.predict(x_resampled)))

              precision    recall  f1-score   support

           0       0.70      0.67      0.68     56369
           1       0.68      0.71      0.70     56369

    accuracy                           0.69    112738
   macro avg       0.69      0.69      0.69    112738
weighted avg       0.69      0.69      0.69    112738



In [109]:
# scoring on validation set - looks like overfitting to the training data
# worse than without oversampling
print(classification_report(yval, osample_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70     14092
           1       0.41      0.79      0.54      5112

    accuracy                           0.64     19204
   macro avg       0.64      0.68      0.62     19204
weighted avg       0.76      0.64      0.66     19204



#### Synthetic Minority Oversampling Technique (SMOTE)

In [110]:
from imblearn.over_sampling import SMOTE
x_smoted, y_smoted = SMOTE(random_state=32).fit_sample(xtrain, ytrain)
Counter(y_smoted)

Counter({0: 56369, 1: 56369})

In [111]:
# SMOTE oversampled, no penalty
SMOTE_lr = LogisticRegression(solver='saga', max_iter=1000)

scaler = StandardScaler()
x_smoted = scaler.fit_transform(x_smoted)
SMOTE_lr.fit(x_smoted, y_smoted)

LogisticRegression(max_iter=1000, solver='saga')

In [112]:
SMOTE_preds = SMOTE_lr.predict(xval)
SMOTE_conf = confusion_matrix(yval, SMOTE_preds)
SMOTE_conf

array([[8194, 5898],
       [1090, 4022]])

In [113]:
# scoring on training set
print(classification_report(y_smoted, SMOTE_lr.predict(x_smoted)))

              precision    recall  f1-score   support

           0       0.70      0.67      0.68     56369
           1       0.68      0.71      0.70     56369

    accuracy                           0.69    112738
   macro avg       0.69      0.69      0.69    112738
weighted avg       0.69      0.69      0.69    112738



In [114]:
# scoring on validation set
# same as basic oversampling, worse than non-sampled
print(classification_report(yval, SMOTE_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70     14092
           1       0.41      0.79      0.54      5112

    accuracy                           0.64     19204
   macro avg       0.64      0.68      0.62     19204
weighted avg       0.76      0.64      0.66     19204



#### ADASYN (Adaptive Synthetic Oversampling)

In [116]:
from imblearn.over_sampling import ADASYN
X_adasyn, y_adasyn = ADASYN(random_state=33).fit_sample(xtrain, ytrain)
Counter(y_adasyn)

Counter({0: 56369, 1: 59101})

In [117]:
# ADASYN oversampled, no penalty
ADASYN_lr = LogisticRegression(solver='saga', max_iter=1000)

scaler = StandardScaler()
X_adasyn = scaler.fit_transform(X_adasyn)
ADASYN_lr.fit(X_adasyn, y_adasyn)

LogisticRegression(max_iter=1000, solver='saga')

In [118]:
ADASYN_preds = ADASYN_lr.predict(xval)
ADASYN_conf = confusion_matrix(yval, ADASYN_preds)
ADASYN_conf

array([[7726, 6366],
       [ 948, 4164]])

In [120]:
# scoring on training set
print(classification_report(y_adasyn, ADASYN_lr.predict(X_adasyn)))

              precision    recall  f1-score   support

           0       0.67      0.61      0.64     56369
           1       0.66      0.71      0.68     59101

    accuracy                           0.66    115470
   macro avg       0.66      0.66      0.66    115470
weighted avg       0.66      0.66      0.66    115470



In [121]:
# scoring on validation set
# nice improvement on recall, but took a hit on precision
# maybe worth working on the penalization
# look
print(classification_report(yval, ADASYN_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.89      0.55      0.68     14092
           1       0.40      0.81      0.53      5112

    accuracy                           0.62     19204
   macro avg       0.64      0.68      0.61     19204
weighted avg       0.76      0.62      0.64     19204



In [141]:
for feature, coef in zip(feat_train[features], ADASYN_lr.coef_.T):
    print(feature, ':', coef)

sex : [0.12977655]
intubed : [0.32463395]
pneumonia : [0.23820629]
age : [0.52438052]
pregnancy : [-0.07979762]
diabetes : [0.04497495]
copd : [-0.00560068]
asthma : [-0.04568401]
inmsupr : [0.10976532]
hypertension : [0.01437715]
other_disease : [0.05476367]
cardiovascular : [-0.01118642]
obesity : [0.03436354]
renal_chronic : [0.06398638]
tobacco : [-0.01713429]
contact_other_covid : [-0.27611447]
icu : [-0.00817442]
comorb_count : [0.0443561]
imm_comorb : [-0.03196222]
imm_covid_pos : [-0.017127]
imm_other_dis : [0.00394484]
imm_lung_disease : [-0.02537926]
covid_res_1 : [0.36981852]
covid_res_2 : [-0.10330525]


In [137]:
print(list(zip(features, ADASYN_lr.coef_)))

[('sex', array([ 0.12977655,  0.32463395,  0.23820629,  0.52438052, -0.07979762,
        0.04497495, -0.00560068, -0.04568401,  0.10976532,  0.01437715,
        0.05476367, -0.01118642,  0.03436354,  0.06398638, -0.01713429,
       -0.27611447, -0.00817442,  0.0443561 , -0.03196222, -0.017127  ,
        0.00394484, -0.02537926,  0.36981852, -0.10330525]))]


In [139]:
ADASYN_lr.coef_.shape

(1, 24)

In [142]:
# ADASYN oversampled, L2
ADASYN_L2_lr = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0, max_iter=1000)

#scaler = StandardScaler()
#X_adasyn = scaler.fit_transform(X_adasyn)
ADASYN_L2_lr.fit(X_adasyn, y_adasyn)

LogisticRegression(l1_ratio=0, max_iter=1000, penalty='elasticnet',
                   solver='saga')

In [143]:
ADASYN_L2_preds = ADASYN_L2_lr.predict(xval)
ADASYN_L2_conf = confusion_matrix(yval, ADASYN_L2_preds)
ADASYN_L2_conf

array([[7725, 6367],
       [ 948, 4164]])

In [144]:
ADASYN_preds = ADASYN_lr.predict(xval)
ADASYN_conf = confusion_matrix(yval, ADASYN_preds)
ADASYN_conf

array([[7726, 6366],
       [ 948, 4164]])

In [145]:
print(classification_report(yval, ADASYN_L2_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.89      0.55      0.68     14092
           1       0.40      0.81      0.53      5112

    accuracy                           0.62     19204
   macro avg       0.64      0.68      0.61     19204
weighted avg       0.76      0.62      0.64     19204



In [146]:
print(classification_report(yval, ADASYN_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.89      0.55      0.68     14092
           1       0.40      0.81      0.53      5112

    accuracy                           0.62     19204
   macro avg       0.64      0.68      0.61     19204
weighted avg       0.76      0.62      0.64     19204



In [147]:
# ADASYN oversampled, L1
ADASYN_L1_lr = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=1, max_iter=1000)

#scaler = StandardScaler()
#X_adasyn = scaler.fit_transform(X_adasyn)
ADASYN_L1_lr.fit(X_adasyn, y_adasyn)

LogisticRegression(l1_ratio=1, max_iter=1000, penalty='elasticnet',
                   solver='saga')

In [148]:
ADASYN_L1_preds = ADASYN_L1_lr.predict(xval)
ADASYN_L1_conf = confusion_matrix(yval, ADASYN_L1_preds)
ADASYN_L1_conf

array([[7728, 6364],
       [ 948, 4164]])

In [149]:
print(classification_report(yval, ADASYN_L1_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.89      0.55      0.68     14092
           1       0.40      0.81      0.53      5112

    accuracy                           0.62     19204
   macro avg       0.64      0.68      0.61     19204
weighted avg       0.76      0.62      0.64     19204



In [150]:
print(classification_report(yval, ADASYN_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.89      0.55      0.68     14092
           1       0.40      0.81      0.53      5112

    accuracy                           0.62     19204
   macro avg       0.64      0.68      0.61     19204
weighted avg       0.76      0.62      0.64     19204



In [151]:
# ADASYN oversampled, L1-L2 balanced
ADASYN_bal_lr = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.5, max_iter=1000)

#scaler = StandardScaler()
#X_adasyn = scaler.fit_transform(X_adasyn)
ADASYN_bal_lr.fit(X_adasyn, y_adasyn)

LogisticRegression(l1_ratio=0.5, max_iter=1000, penalty='elasticnet',
                   solver='saga')

In [152]:
ADASYN_bal_preds = ADASYN_bal_lr.predict(xval)
ADASYN_bal_conf = confusion_matrix(yval, ADASYN_bal_preds)
ADASYN_bal_conf

array([[7728, 6364],
       [ 948, 4164]])

In [153]:
print(classification_report(yval, ADASYN_bal_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.89      0.55      0.68     14092
           1       0.40      0.81      0.53      5112

    accuracy                           0.62     19204
   macro avg       0.64      0.68      0.61     19204
weighted avg       0.76      0.62      0.64     19204



#### Undersampling

In [122]:
from imblearn.under_sampling import RandomUnderSampler
x_under, y_under = RandomUnderSampler(random_state=33).fit_sample(xtrain, ytrain)
Counter(y_under)

Counter({0: 20447, 1: 20447})

In [123]:
# Undersampled, no penalty
under_lr = LogisticRegression(solver='saga', max_iter=1000)

scaler = StandardScaler()
x_under = scaler.fit_transform(x_under)
under_lr.fit(x_under, y_under)

LogisticRegression(max_iter=1000, solver='saga')

In [124]:
under_preds = under_lr.predict(xval)
under_conf = confusion_matrix(yval, under_preds)
under_conf

array([[8214, 5878],
       [1078, 4034]])

In [125]:
# scoring on training set
print(classification_report(y_under, under_lr.predict(x_under)))

              precision    recall  f1-score   support

           0       0.70      0.67      0.68     20447
           1       0.68      0.71      0.70     20447

    accuracy                           0.69     40894
   macro avg       0.69      0.69      0.69     40894
weighted avg       0.69      0.69      0.69     40894



In [126]:
# scoring on validation set
# very close to baseline model
print(classification_report(yval, under_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70     14092
           1       0.41      0.79      0.54      5112

    accuracy                           0.64     19204
   macro avg       0.65      0.69      0.62     19204
weighted avg       0.76      0.64      0.66     19204



In [127]:
# base model scoring on validation set
print(classification_report(yval, feat_lr.predict(xval)))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70     14092
           1       0.41      0.79      0.54      5112

    accuracy                           0.64     19204
   macro avg       0.64      0.68      0.62     19204
weighted avg       0.76      0.64      0.66     19204



### GridSearchCV for parameter tuning

In [190]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, log_loss, make_scorer
import matplotlib.pyplot as plt

In [191]:
xtraings = feat_train[strict_features]
ytraings = feat_train['passed']

scaler = StandardScaler()
xtraings = scaler.fit_transform(xtraings)

In [193]:
xvalgs = scaler.transform(feat_val[strict_features])
yvalgs = feat_val['passed']



In [208]:
logreg=LogisticRegression(solver='saga')

In [209]:
params = {
    'C': np.logspace(-3,3,7),
    'penalty':["l1", "l2"],
    'random_state':[33]
}
score = make_scorer(recall_score)


In [210]:
gridsearch = GridSearchCV(logreg, params, cv=5)
gridsearch.fit(xtraings, ytraings)
print("Best parameters: ", gridsearch.best_params_)

best_estim=gridsearch.best_estimator_
print("Best estimator: ",best_estim)

best_estim.fit(xtraings, ytraings)

ytr_pred = best_estim.predict(xtraings)
print("Training scores: ", classification_report(ytraings, best_estim.predict(xtraings)))

Best parameters:  {'C': 0.01, 'penalty': 'l1', 'random_state': 33}
Best estimator:  LogisticRegression(C=0.01, penalty='l1', random_state=33, solver='saga')
Training scores:                precision    recall  f1-score   support

           0       0.76      0.95      0.84     56369
           1       0.56      0.17      0.26     20447

    accuracy                           0.74     76816
   macro avg       0.66      0.56      0.55     76816
weighted avg       0.71      0.74      0.69     76816



In [219]:
gs_preds = best_estim.predict(xvalgs)
gs_conf = confusion_matrix(yvalgs, gs_preds)
gs_conf

array([[13425,   667],
       [ 4255,   857]])

In [None]:
x_ax = range(len(yvalgs))
plt.scatter(x_ax, yvalgs, s=5, color='blue', label='original')
plt.plot(x_ax, yvalgs, lw=0.8, color='red', label='predicted')
plt.legend()
plt.show();

In [212]:
# baseline repeated with strict_features - 
lm_1 = LogisticRegression(solver='saga',  # same model as statsmodel,try default later
                         C=100000)  # no regularization
features = ['age', 'sex', 'patient_type', 'hypertension', 'obesity', 'inmsupr','copd', 'other_disease', 'renal_chronic','tobacco']
X_train  = covid_feat[strict_features]
y_train = covid_feat['passed']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

lm_1.fit(X_train, y_train)

LogisticRegression(C=100000, solver='saga')

In [214]:
print("Training scores: ", classification_report(y_train, lm_1.predict(X_train)))

Training scores:                precision    recall  f1-score   support

           0       0.76      0.95      0.84     88077
           1       0.56      0.18      0.27     31949

    accuracy                           0.74    120026
   macro avg       0.66      0.56      0.56    120026
weighted avg       0.71      0.74      0.69    120026



In [215]:
covid_feat.shape

(120026, 30)

In [216]:
covid_feat.columns

Index(['id', 'sex', 'patient_type', 'entry_date', 'date_symptoms', 'date_died',
       'intubed', 'pneumonia', 'age', 'pregnancy', 'diabetes', 'copd',
       'asthma', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular',
       'obesity', 'renal_chronic', 'tobacco', 'contact_other_covid', 'icu',
       'passed', 'comorb_count', 'imm_comorb', 'imm_covid_pos',
       'imm_other_dis', 'imm_lung_disease', 'covid_res_1', 'covid_res_2'],
      dtype='object')

In [217]:
best_estim.coef_
for feature, coef in zip(covid_feat[strict_features], best_estim.coef_.T):
    print(feature, ':', coef)

sex : [0.13623335]
age : [0.60187323]
pregnancy : [-0.08171028]
diabetes : [0.07299871]
copd : [-0.00296904]
asthma : [-0.04281237]
inmsupr : [0.]
hypertension : [0.03932866]
other_disease : [0.04393191]
cardiovascular : [0.]
obesity : [0.05849964]
renal_chronic : [0.07024165]
tobacco : [-0.00742825]
contact_other_covid : [-0.21310149]
comorb_count : [0.00056435]
imm_comorb : [-0.00400084]
imm_covid_pos : [0.]
imm_other_dis : [0.]
imm_lung_disease : [0.0554473]
covid_res_1 : [0.47292332]
covid_res_2 : [-0.12292091]
