In [None]:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Load the pickled training/validation feature frames and target vectors.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
(
  q1features_train_1,
  q1features_valid_1,
  q1target_train_1,
  q1target_valid_1,
) = map(
  pd.read_pickle,
  (
    "q1features_train_1.pickle",
    "q1features_valid_1.pickle",
    "q1target_train_1.pickle",
    "q1target_valid_1.pickle",
  ),
)

In [None]:
# Impute missing values in the validation features, one column at a time,
# using statistics taken from the TRAINING frame:
#   * None entries are first replaced by NaN
#   * category / datetime64[ns] columns are filled with the first mode of
#     the matching training column
#   * Int64 / int64 columns are cast to float64
#   * float64 columns (including the ones just cast) are filled with the mean
#     of the matching training column
# Columns of any other dtype only get the None -> NaN replacement.
for name in list(q1features_valid_1.columns):
  column = q1features_valid_1[name].replace([None], np.nan)

  kind = column.dtype
  if kind == 'category' or kind == 'datetime64[ns]':
    column = column.fillna(q1features_train_1[name].mode()[0])

  kind = column.dtype
  if kind == 'Int64' or kind == 'int64':
    column = column.astype('float64')

  # Re-read the dtype: an integer column has just become float64.
  if column.dtype == 'float64':
    column = column.fillna(q1features_train_1[name].mean())

  q1features_valid_1[name] = column

In [None]:
# Impute missing values in the training features, one column at a time,
# using each column's own statistics:
#   * None entries are first replaced by NaN
#   * category / datetime64[ns] columns are filled with their first mode
#   * Int64 / int64 columns are cast to float64
#   * float64 columns (including the ones just cast) are filled with their mean
# Columns of any other dtype only get the None -> NaN replacement.
for name in list(q1features_train_1.columns):
  column = q1features_train_1[name].replace([None], np.nan)

  kind = column.dtype
  if kind == 'category' or kind == 'datetime64[ns]':
    column = column.fillna(column.mode()[0])

  kind = column.dtype
  if kind == 'Int64' or kind == 'int64':
    column = column.astype('float64')

  # Re-read the dtype: an integer column has just become float64.
  if column.dtype == 'float64':
    column = column.fillna(column.mean())

  q1features_train_1[name] = column

In [None]:
#Normalization of validation set
# Min-max scale every Int64 / int64 / float64 validation column using the
# minimum and range of the matching TRAINING column, so both frames share one
# scale. This cell must run before the training frame is normalized, because
# that step overwrites the raw training values used here.
for col in q1features_valid_1.columns.tolist():
  valid_values = q1features_valid_1[col]
  if (valid_values.dtype == 'Int64' or valid_values.dtype == 'int64' or valid_values.dtype == 'float64'):
    # Vectorized reductions, computed once per column.
    train_min = q1features_train_1[col].min()
    train_range = q1features_train_1[col].max() - train_min
    # A constant training column has zero range; divide by 1 instead so the
    # result stays finite (values become x - min) rather than NaN/inf.
    if train_range == 0:
      train_range = 1
    q1features_valid_1[col] = (valid_values - train_min) / train_range

In [None]:
#Normalization of training set
# Min-max scale every Int64 / int64 / float64 training column to [0, 1]
# using the column's own minimum and range.
for col in q1features_train_1.columns.tolist():
  train_values = q1features_train_1[col]
  if (train_values.dtype == 'Int64' or train_values.dtype == 'int64' or train_values.dtype == 'float64'):
    # Vectorized reductions, computed once per column.
    train_min = train_values.min()
    train_range = train_values.max() - train_min
    # A constant column has zero range; divide by 1 instead so the column
    # becomes all zeros rather than NaN (0/0).
    if train_range == 0:
      train_range = 1
    q1features_train_1[col] = (train_values - train_min) / train_range

In [None]:
#The SMOTE algorithm cannot handle datetime, so converting to number of
#days since January 1st, year one (the value returned by .toordinal()),
#stored as float64.
#
# The conversion is done on the whole column at once. Writing the integers
# back row by row with .loc would put ints into a datetime64 column (an
# incompatible-dtype assignment) and costs one indexed write per row.
q1features_train_1['INTERVIEWDATE'] = (
  q1features_train_1['INTERVIEWDATE']
  .map(lambda stamp: stamp.toordinal())
  .astype('float64')
)

q1features_valid_1['INTERVIEWDATE'] = (
  q1features_valid_1['INTERVIEWDATE']
  .map(lambda stamp: stamp.toordinal())
  .astype('float64')
)

In [None]:
# One-hot encode the categorical training columns.
# Each 'category' column is replaced by float dummy columns named
# "<column>_<level>"; the first level is dropped (drop_first=True). The
# dummies are appended after the remaining columns, in the original column
# order of their source columns.
train_categoricals = [
  name for name in q1features_train_1.columns.tolist()
  if q1features_train_1[name].dtype == 'category'
]
train_dummy_frames = [
  pd.get_dummies(q1features_train_1[name], prefix=name, drop_first=True, dtype='float')
  for name in train_categoricals
]
q1features_train_1 = pd.concat(
  [q1features_train_1.drop(train_categoricals, axis=1)] + train_dummy_frames,
  axis=1,
)

In [None]:
# One-hot encode the categorical validation columns.
# Each 'category' column is replaced by float dummy columns named
# "<column>_<level>"; the first level is dropped (drop_first=True). The
# dummies are appended after the remaining columns, in the original column
# order of their source columns.
# NOTE(review): the validation frame is encoded independently of the training
# frame; nothing here checks that both end up with the same dummy columns.
# Presumably both frames carry identical category levels - TODO confirm.
valid_categoricals = [
  name for name in q1features_valid_1.columns.tolist()
  if q1features_valid_1[name].dtype == 'category'
]
valid_dummy_frames = [
  pd.get_dummies(q1features_valid_1[name], prefix=name, drop_first=True, dtype='float')
  for name in valid_categoricals
]
q1features_valid_1 = pd.concat(
  [q1features_valid_1.drop(valid_categoricals, axis=1)] + valid_dummy_frames,
  axis=1,
)

In [None]:
# Encode the training target as floats by casting to bool and then to float
# (False -> 0.0, True -> 1.0).
q1target_train_1 = q1target_train_1.astype('bool').astype('float')

In [None]:
# Encode the validation target as floats by casting to bool and then to float
# (False -> 0.0, True -> 1.0).
q1target_valid_1 = q1target_valid_1.astype('bool').astype('float')

In [None]:
#Removing highly correlated variables
# vifcalcs is bound to the same DataFrame object as q1features_train_1 (no
# copy is made), so the in-place column drops applied to q1features_train_1
# further down are also visible through vifcalcs.
vifcalcs=q1features_train_1

In [None]:
# Compute the variance inflation factor (VIF) of every column in vifcalcs and
# collect the results in a two-column table: "feature" (column name) and
# "VIF" (its factor).
# NOTE(review): no constant column is added to vifcalcs in the cells shown;
# VIFs computed without an intercept term can differ from the usual
# definition - confirm this is intended.
vif_design = vifcalcs.values
vif_data = pd.DataFrame({
  "feature": vifcalcs.columns,
  "VIF": [variance_inflation_factor(vif_design, position)
          for position in range(vif_design.shape[1])],
})

In [None]:
# List every feature whose VIF is strictly greater than 10, one per line,
# as "<feature> <VIF>".
high_vif = vif_data[vif_data['VIF'] > 10]
for feature_name, vif_value in zip(high_vif['feature'], high_vif['VIF']):
  print(feature_name, vif_value)

In [None]:
#Iteratively removed attributes starting with highest VIF until all
#VIF values are below ten.

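#Note that BMI5CAT and SMOKER3 were kept in the model even though their VIF
#is > 10, since VIF >20 (TODO: this contradicts the first half of the
#sentence - confirm whether "< 20" was meant) and there is possible evidence
#of a relationship between these variables and long-term COVID outcomes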

In [None]:
# Drop every training column whose name starts with one of the prefixes
# below, one prefix at a time and in this order. The drop is done in place,
# so every other reference to this DataFrame object sees the reduced frame.
for prefix in (
  'IDAY',
  'QSTVER',
  'STATE',
  '_AGE65YR',
  'IMONTH',
  'INTERVIEWDATE',
  '_AGE_G',
  '_AGE80',
  'HTM4',
  'WTKG3',
  'USENOW3',
  'EDUCA',
  'SLEPTIM1',
  'PRIMINSR',
  'FMONTH',
  'SMOKE100',
):
  unwanted = q1features_train_1.columns[q1features_train_1.columns.str.startswith(prefix)]
  q1features_train_1.drop(unwanted, axis=1, inplace=True)

In [None]:
# Drop the same prefix-matched columns from the validation set: every
# validation column whose name starts with one of the prefixes below is
# removed in place, one prefix at a time and in this order.
for prefix in (
  'IDAY',
  'QSTVER',
  'STATE',
  '_AGE65YR',
  'IMONTH',
  'INTERVIEWDATE',
  '_AGE_G',
  '_AGE80',
  'HTM4',
  'WTKG3',
  'USENOW3',
  'EDUCA',
  'SLEPTIM1',
  'PRIMINSR',
  'FMONTH',
  'SMOKE100',
):
  unwanted = q1features_valid_1.columns[q1features_valid_1.columns.str.startswith(prefix)]
  q1features_valid_1.drop(unwanted, axis=1, inplace=True)

In [None]:
#SMOTE Algorithm
# Oversample the training data with SMOTE (fixed seed) and keep the resampled
# features/target under the *_SM names used by the models below.
from imblearn.over_sampling import SMOTE

# The sampler is named `smote`, not `sm`: a later cell runs
# `import statsmodels.api as sm`, which would rebind `sm` and make this cell
# fail if it were re-run afterwards.
smote = SMOTE(random_state = 2)

# np.ravel flattens the target to a 1-D array without relying on the
# deprecated Series.ravel() method.
q1features_train_1_SM, q1target_train_1_SM = smote.fit_resample(q1features_train_1, np.ravel(q1target_train_1))

Logistic Regression

In [None]:
import statsmodels.api as sm
from sklearn import metrics

In [None]:
# Fit a statsmodels logistic regression on the SMOTE-resampled training data.
# NOTE(review): no constant column is added to the features in the cells
# shown, so the model has no intercept term unless a column is constant -
# confirm this is intended.
logit_model = sm.Logit(endog=q1target_train_1_SM, exog=q1features_train_1_SM)
log_reg = logit_model.fit()

In [None]:
# Display the fitted model's summary table; use it to obtain the regression
# coefficients and p-values.
log_reg.summary()

In [None]:
#Making predictions on the validation set using the logistic regression model
# The fitted model is bound to `log_reg`; the name `logReg` is not defined
# anywhere in the cells shown, so using it raised a NameError.
# The result holds predicted probabilities; they are thresholded into class
# labels in a later cell.
logRegPrediction = log_reg.predict(q1features_valid_1)

In [None]:
from sklearn import metrics

In [None]:
# Turn the predicted probabilities into class labels: strictly greater than
# 0.5 becomes 1, everything else becomes 0.
predicted_positive = logRegPrediction > 0.5
logRegPrediction = np.where(predicted_positive, 1, 0)

In [None]:
# Confusion matrix of the logistic regression labels against the validation
# target; the bare last line displays it.
logRegMatrix = metrics.confusion_matrix(
  y_true=q1target_valid_1,
  y_pred=logRegPrediction,
)
logRegMatrix

Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Build a Gaussian Naive Bayes classifier with default settings; it is fitted
# in the next cell.
model = GaussianNB()

In [None]:
# Fit the Naive Bayes classifier on the SMOTE-resampled training data and
# keep the value returned by fit() as nbModel.
nbModel = model.fit(
  X=q1features_train_1_SM,
  y=q1target_train_1_SM,
)

In [None]:
# Predict class labels for the validation features with the Naive Bayes model.
nbPrediction = nbModel.predict(X=q1features_valid_1)

In [None]:
# Confusion matrix of the Naive Bayes labels against the validation target;
# the bare last line displays it.
nbMatrix = metrics.confusion_matrix(
  y_true=q1target_valid_1,
  y_pred=nbPrediction,
)
nbMatrix

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Constructing random forest model
# random_state is fixed so that repeated runs of the notebook build the same
# forest and report the same confusion matrix; 2 is the seed value already
# used for the SMOTE step.
rf = RandomForestClassifier(random_state=2)
rf.fit(q1features_train_1_SM, q1target_train_1_SM)

In [None]:
# Predict class labels for the validation features with the random forest.
rfPrediction = rf.predict(X=q1features_valid_1)

In [None]:
# Confusion matrix of the random forest labels against the validation target;
# the bare last line displays it.
rfMatrix = metrics.confusion_matrix(
  y_true=q1target_valid_1,
  y_pred=rfPrediction,
)
rfMatrix

Gradient-Boosted Trees

In [None]:
from xgboost import XGBClassifier

In [None]:
# Build a gradient-boosted tree classifier with default settings and fit it
# on the SMOTE-resampled training data.
bst = XGBClassifier()
bst.fit(
  X=q1features_train_1_SM,
  y=q1target_train_1_SM,
)

In [None]:
# Predict class labels for the validation features with the boosted model.
bstPrediction = bst.predict(X=q1features_valid_1)

In [None]:
# Confusion matrix of the boosted-tree labels against the validation target;
# the bare last line displays it.
bstMatrix = metrics.confusion_matrix(
  y_true=q1target_valid_1,
  y_pred=bstPrediction,
)
bstMatrix