In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet


from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report


from sklearn.datasets import make_classification

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


## Feature Engineering

Because I was particularly interested in the relationship between immunosuppression and COVID death risk, I engineered several features to draw out interactions between immunosuppression and the other categories of co-morbidity.

In [2]:
# Read the line-delimited JSON file (one record per line) into a DataFrame.
covid_clean = pd.read_json(path_or_buf='covid_no_nan.json', lines=True)

In [3]:
# Sanity check: (rows, columns) of the frame just loaded.
covid_clean.shape

(561096, 24)

In [5]:
# Preview the first five rows to eyeball column names and value encodings.
covid_clean.head()

Unnamed: 0,id,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,...,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu,passed
0,033f0a,1,0,2020-06-24T00:00:00.000Z,2020-06-19T00:00:00.000Z,9999-99-99,0,0,46,0,...,0,0,0,0,0,0,0,1,0,0
1,18be58,1,1,2020-06-19T00:00:00.000Z,2020-06-19T00:00:00.000Z,9999-99-99,0,1,57,0,...,1,0,0,0,1,1,1,1,0,0
2,1e7e67,0,0,2020-06-08T00:00:00.000Z,2020-06-01T00:00:00.000Z,9999-99-99,0,0,65,0,...,1,0,0,0,0,0,0,1,0,0
3,003bcb,1,0,2020-06-08T00:00:00.000Z,2020-06-05T00:00:00.000Z,9999-99-99,0,0,39,0,...,0,0,0,0,0,0,1,1,0,0
4,1a8351,1,0,2020-06-08T00:00:00.000Z,2020-06-02T00:00:00.000Z,9999-99-99,0,0,45,0,...,0,0,0,0,0,1,0,1,0,0


In [6]:
# Columns that are cast to pandas' 'category' dtype in the cells below.
# The order is kept exactly as originally written.
categoricals = [
    'sex', 'patient_type', 'pneumonia',
    'diabetes', 'copd', 'asthma', 'inmsupr',
    'hypertension', 'other_disease', 'cardiovascular', 'obesity',
    'renal_chronic', 'tobacco', 'pregnancy',
    'icu', 'intubed', 'covid_res',
    'contact_other_covid', 'passed',
]

In [7]:
# Cast every column named in `categoricals` to the 'category' dtype, in place.
for cat_col in categoricals:
    covid_clean[cat_col] = covid_clean[cat_col].astype('category')

In [13]:
# Read the line-delimited JSON file (one record per line) into a DataFrame.
covid_hosp = pd.read_json(path_or_buf='covid_hosp.json', lines=True)

In [9]:
# Cast every column named in `categoricals` to the 'category' dtype, in place.
for cat_col in categoricals:
    covid_hosp[cat_col] = covid_hosp[cat_col].astype('category')

In [10]:
# Sanity check: (rows, columns) of the covid_hosp frame.
covid_hosp.shape

(120026, 24)

In [11]:
# List the available columns before building the engineered features.
covid_hosp.columns

Index(['id', 'sex', 'patient_type', 'entry_date', 'date_symptoms', 'date_died',
       'intubed', 'pneumonia', 'age', 'pregnancy', 'diabetes', 'copd',
       'asthma', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular',
       'obesity', 'renal_chronic', 'tobacco', 'contact_other_covid',
       'covid_res', 'icu', 'passed'],
      dtype='object')

In [17]:
# Per-patient count of the comorbidity flags listed below.
#
# The flag columns are cast to 'category' dtype earlier in the notebook, and
# pandas raises a TypeError on `+` between categorical Series. The original
# chained addition therefore only succeeded when `covid_hosp` had been
# reloaded out of order; it fails on Restart & Run All. Casting the selected
# columns to int first gives the same result for integer columns and also
# works for categorical ones.
comorbidity_cols = [
    'diabetes', 'copd', 'asthma', 'hypertension',
    'other_disease', 'cardiovascular', 'renal_chronic',
]
covid_hosp['comorb_count'] = covid_hosp[comorbidity_cols].astype('int64').sum(axis=1)

In [18]:
# Preview the first rows where more than two of the counted comorbidities are present.
covid_hosp.loc[covid_hosp['comorb_count'] > 2].head()

Unnamed: 0,id,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,...,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu,passed,comorb_count
0,18be58,1,1,2020-06-19T00:00:00.000Z,2020-06-19T00:00:00.000Z,9999-99-99,0,1,57,0,...,0,0,0,1,1,1,1,0,0,3
17,09a930,0,1,2020-05-16T00:00:00.000Z,2020-05-14T00:00:00.000Z,9999-99-99,0,1,67,0,...,0,0,0,1,0,0,1,0,0,3
39,04ce29,0,1,2020-04-21T00:00:00.000Z,2020-04-19T00:00:00.000Z,27-04-2020,0,1,52,0,...,1,1,0,1,0,0,1,1,1,5
57,19dcfa,1,1,2020-05-08T00:00:00.000Z,2020-05-08T00:00:00.000Z,19-05-2020,0,1,50,0,...,0,0,1,0,1,0,1,0,1,3
92,9.60E+97,0,1,2020-05-21T00:00:00.000Z,2020-05-21T00:00:00.000Z,28-05-2020,0,1,71,0,...,0,0,0,1,0,0,1,0,1,3


In [25]:
# Interaction flag: 1 when the patient is immunosuppressed AND has at least
# one counted comorbidity, otherwise 0.
is_immunosuppressed = covid_hosp['inmsupr'] == 1
has_comorbidity = covid_hosp['comorb_count'] >= 1
covid_hosp['imm_comorb'] = np.where(is_immunosuppressed & has_comorbidity, 1, 0)

In [26]:
# Confirm the new `imm_comorb` column was appended.
covid_hosp.head()

Unnamed: 0,id,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,...,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu,passed,comorb_count,imm_comorb
0,18be58,1,1,2020-06-19T00:00:00.000Z,2020-06-19T00:00:00.000Z,9999-99-99,0,1,57,0,...,0,0,1,1,1,1,0,0,3,0
1,0c3c05,0,1,2020-05-04T00:00:00.000Z,2020-04-28T00:00:00.000Z,9999-99-99,0,0,66,0,...,0,0,0,0,0,1,0,0,0,0
2,06861b,1,1,2020-05-05T00:00:00.000Z,2020-04-29T00:00:00.000Z,08-05-2020,0,0,55,0,...,0,0,0,1,0,1,0,1,2,0
3,1e0b21,0,1,2020-06-14T00:00:00.000Z,2020-06-14T00:00:00.000Z,9999-99-99,0,0,35,0,...,0,0,0,0,0,1,0,0,1,0
4,16b611,1,1,2020-04-20T00:00:00.000Z,2020-04-10T00:00:00.000Z,30-04-2020,0,1,44,0,...,0,1,0,0,0,1,0,1,0,0


In [28]:
# Interaction flag: 1 when `inmsupr` and `covid_res` are both 1, otherwise 0.
is_immunosuppressed = covid_hosp['inmsupr'] == 1
is_covid_positive = covid_hosp['covid_res'] == 1
covid_hosp['imm_covid_pos'] = np.where(is_immunosuppressed & is_covid_positive, 1, 0)

In [29]:
# Confirm the new `imm_covid_pos` column was appended.
covid_hosp.head()

Unnamed: 0,id,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,...,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu,passed,comorb_count,imm_comorb,imm_covid_pos
0,18be58,1,1,2020-06-19T00:00:00.000Z,2020-06-19T00:00:00.000Z,9999-99-99,0,1,57,0,...,0,1,1,1,1,0,0,3,0,0
1,0c3c05,0,1,2020-05-04T00:00:00.000Z,2020-04-28T00:00:00.000Z,9999-99-99,0,0,66,0,...,0,0,0,0,1,0,0,0,0,0
2,06861b,1,1,2020-05-05T00:00:00.000Z,2020-04-29T00:00:00.000Z,08-05-2020,0,0,55,0,...,0,0,1,0,1,0,1,2,0,0
3,1e0b21,0,1,2020-06-14T00:00:00.000Z,2020-06-14T00:00:00.000Z,9999-99-99,0,0,35,0,...,0,0,0,0,1,0,0,1,0,0
4,16b611,1,1,2020-04-20T00:00:00.000Z,2020-04-10T00:00:00.000Z,30-04-2020,0,1,44,0,...,1,0,0,0,1,0,1,0,0,0


In [31]:
# Interaction flag: 1 when `inmsupr` and `other_disease` are both 1, otherwise 0.
is_immunosuppressed = covid_hosp['inmsupr'] == 1
has_other_disease = covid_hosp['other_disease'] == 1
covid_hosp['imm_other_dis'] = np.where(is_immunosuppressed & has_other_disease, 1, 0)

In [32]:
# Interaction flag: 1 when `inmsupr` is 1 AND at least one of the
# pneumonia / asthma / copd / tobacco flags is 1, otherwise 0.
is_immunosuppressed = covid_hosp['inmsupr'] == 1
has_lung_factor = (
    (covid_hosp['pneumonia'] == 1)
    | (covid_hosp['asthma'] == 1)
    | (covid_hosp['copd'] == 1)
    | (covid_hosp['tobacco'] == 1)
)
covid_hosp['imm_lung_disease'] = np.where(is_immunosuppressed & has_lung_factor, 1, 0)

In [33]:
# Confirm the `imm_other_dis` and `imm_lung_disease` columns were appended.
covid_hosp.head()

Unnamed: 0,id,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,...,tobacco,contact_other_covid,covid_res,icu,passed,comorb_count,imm_comorb,imm_covid_pos,imm_other_dis,imm_lung_disease
0,18be58,1,1,2020-06-19T00:00:00.000Z,2020-06-19T00:00:00.000Z,9999-99-99,0,1,57,0,...,1,1,1,0,0,3,0,0,0,0
1,0c3c05,0,1,2020-05-04T00:00:00.000Z,2020-04-28T00:00:00.000Z,9999-99-99,0,0,66,0,...,0,0,1,0,0,0,0,0,0,0
2,06861b,1,1,2020-05-05T00:00:00.000Z,2020-04-29T00:00:00.000Z,08-05-2020,0,0,55,0,...,1,0,1,0,1,2,0,0,0,0
3,1e0b21,0,1,2020-06-14T00:00:00.000Z,2020-06-14T00:00:00.000Z,9999-99-99,0,0,35,0,...,0,0,1,0,0,1,0,0,0,0
4,16b611,1,1,2020-04-20T00:00:00.000Z,2020-04-10T00:00:00.000Z,30-04-2020,0,1,44,0,...,0,0,1,0,1,0,0,0,0,0


In [34]:
# Persist the frame, including the engineered columns, as line-delimited JSON
# (one record per line).
covid_hosp.to_json(
    path_or_buf='covid_hosp_feats.json',
    orient='records',
    lines=True,
)