In [None]:
import numpy as np
import pandas as pd

import statistics
import random

from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import fbeta_score
from sklearn import metrics

import statsmodels.api as sm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

In [None]:
#Load the four pickled objects: training/validation features and target, then test features and target
#NOTE(review): unpickling can execute arbitrary code, so these files should only come from a trusted source
pickle_files = (
    "q1features_trainvalid.pickle",
    "q1target_trainvalid.pickle",
    "q1features_test.pickle",
    "q1target_test.pickle",
)
(q1features_trainvalid,
 q1target_trainvalid,
 q1features_test,
 q1target_test) = [pd.read_pickle(pickle_file) for pickle_file in pickle_files]

Attributes with zero variance were removed from the model; these attributes did not contribute any information to the model.

In [None]:
#Removing zero variance attributes (training set)
#A column with exactly one distinct non-missing value carries no information
distinct_counts = q1features_trainvalid.nunique(axis=0)
has_variation = distinct_counts.ne(1)
q1features_trainvalid = q1features_trainvalid.loc[:, has_variation]

In [None]:
#Removing zero variance attributes (test set)
#The decision of which attributes to keep must come from the training set only.
#Filtering the test set on its own statistics could keep or drop different
#attributes than the training set, leaving the two frames with mismatched
#columns. The training set has already been filtered above, so the test set
#simply keeps the attributes that survived there.
kept_in_training = q1features_test.columns.isin(q1features_trainvalid.columns)
q1features_test = q1features_test.loc[:, kept_in_training]

Attributes with more than 10% missing data were dropped from the dataset. Dr. Iris Eekhout and colleagues in their 2013 paper titled "Missing data in a multi-item instrument were best handled by multiple imputation at the item score level" (https://doi.org/10.1016/j.jclinepi.2013.09.009) found that single-imputation methods can result in biased estimates when the dataset has a higher proportion of missing data. Single-imputation methods will be applied later in the analysis, hence the removal of these attributes with a high proportion of missing data at this step.

In [None]:
#Removing attributes with more than 10% missing data (training set)
#Share of missing entries per attribute, computed for all columns at once
missing_counts = q1features_trainvalid.isna().sum()
missing_share = missing_counts/len(q1features_trainvalid)
exceeds_limit = missing_share > 0.10
q1features_trainvalid = q1features_trainvalid.loc[:, ~exceeds_limit]

In [None]:
#Removing attributes with more than 10% missing data (test set)
#Applying the 10% rule to the test set's own missingness could remove a
#different set of attributes than was removed from the training set. The
#training set has already been filtered above, so the test set keeps exactly
#the attributes that were retained there; the remaining gaps are imputed
#later from training-set statistics.
kept_in_training = q1features_test.columns.isin(q1features_trainvalid.columns)
q1features_test = q1features_test.loc[:, kept_in_training]

Imputation of missing data was performed. The alternative was to use complete case analysis, which involves removing from the dataset any rows that were missing any attributes. Complete case analysis leads to a loss of information and can lead to bias if the missing data is not missing completely at random.

Single-imputation methods were used; missing categorical values were imputed using the mode of that attribute and missing numeric values were imputed using the median. The test data set was imputed using the mode and median values of the training data set. To avoid data leakage, the training data set was not imputed using test set data.


In [None]:
#Imputing test set
#Every fill value is taken from the training set so that no test-set
#statistic influences the imputation.
for col in q1features_test.columns.tolist():
  q1features_test[col] = q1features_test[col].replace([None], np.nan)
  #NOTE(review): the datetime check looks at the training column's dtype while
  #the category check looks at the test column's dtype - confirm this is intended
  if (q1features_test[col].dtype == 'category' or q1features_trainvalid[col].dtype =='datetime64[ns]'):
    #Series.mode ignores missing values and returns the modes in sorted order,
    #so the first entry is a reproducible fill value even when several values
    #are tied (an unseeded random pick could differ between runs and between
    #the training and test cells)
    training_modes = q1features_trainvalid[col].mode(dropna=True)
    if len(training_modes) > 0:
      q1features_test[col] = q1features_test[col].fillna(training_modes.iloc[0])
  #Integer columns are cast to float so that they can be filled with a median
  if (q1features_test[col].dtype == 'Int64' or q1features_test[col].dtype == 'int64'):
    q1features_test[col] = q1features_test[col].astype('float64')
  if (q1features_test[col].dtype == 'float64'):
    q1features_test[col] = q1features_test[col].fillna(q1features_trainvalid[col].median())

In [None]:
#Imputing training set
#Categorical and datetime attributes are filled with the column mode, numeric
#attributes with the column median.
for col in q1features_trainvalid.columns.tolist():
  q1features_trainvalid[col] = q1features_trainvalid[col].replace([None], np.nan)
  if (q1features_trainvalid[col].dtype == 'category' or q1features_trainvalid[col].dtype =='datetime64[ns]'):
    #Series.mode ignores missing values and returns the modes in sorted order,
    #so the first entry is a reproducible fill value even when several values
    #are tied (an unseeded random pick could differ from run to run)
    column_modes = q1features_trainvalid[col].mode(dropna=True)
    if len(column_modes) > 0:
      q1features_trainvalid[col] = q1features_trainvalid[col].fillna(column_modes.iloc[0])
  #Integer columns are cast to float so that they can be filled with a median
  if (q1features_trainvalid[col].dtype == 'Int64' or q1features_trainvalid[col].dtype == 'int64'):
    q1features_trainvalid[col] = q1features_trainvalid[col].astype('float64')
  if (q1features_trainvalid[col].dtype == 'float64'):
    q1features_trainvalid[col] = q1features_trainvalid[col].fillna(q1features_trainvalid[col].median())

Numeric attributes were standardized; variable importance will be assessed using regression models, and standardization allows for easier interpretation of the model coefficients. Standardization was performed instead of normalization, as standardization is more resistant to outliers.

In [None]:
#Standardization of test set
#The test set is scaled with the training set's mean and standard deviation.
#Both statistics are computed once per column and applied to the whole column
#at once instead of being recomputed for every single row.
for col in q1features_test.columns.tolist():
  if (q1features_test[col].dtype == 'Int64' or q1features_test[col].dtype == 'int64' or q1features_test[col].dtype == 'float64'):
    training_mean = q1features_trainvalid[col].mean()
    #np.std uses ddof=0, i.e. the population standard deviation
    training_std = np.std(q1features_trainvalid[col])
    q1features_test[col] = (q1features_test[col] - training_mean)/training_std

In [None]:
#Standardization of training set
for col in q1features_trainvalid.columns.tolist():
  if (q1features_trainvalid[col].dtype == 'Int64' or q1features_trainvalid[col].dtype == 'int64' or q1features_trainvalid[col].dtype == 'float64'):
    #Freeze the statistics BEFORE modifying the column. Recomputing them inside
    #a row-by-row loop reads values that have already been overwritten, so each
    #row would be scaled with a different mean and standard deviation.
    column_mean = q1features_trainvalid[col].mean()
    #np.std uses ddof=0, i.e. the population standard deviation
    column_std = np.std(q1features_trainvalid[col])
    q1features_trainvalid[col] = (q1features_trainvalid[col] - column_mean)/column_std

In [None]:
#The SMOTE algorithm cannot handle datetime, so converting to number of
#days since January 1st, year one
#Each timestamp is mapped to its proleptic Gregorian ordinal in one pass and the
#result is stored as float64. This replaces row-by-row .loc writes of integers
#into a datetime column, which were slow and relied on a silent dtype upcast.
q1features_trainvalid['INTERVIEWDATE'] = q1features_trainvalid['INTERVIEWDATE'].map(lambda stamp: stamp.toordinal()).astype('float64')
q1features_test['INTERVIEWDATE'] = q1features_test['INTERVIEWDATE'].map(lambda stamp: stamp.toordinal()).astype('float64')

In [None]:
#Converting categorical variables to dummy variables
#Each categorical attribute is replaced by float indicator columns named
#"<attribute>_<category>"; the first category is dropped as the reference level
categorical_columns = [name for name in q1features_trainvalid.columns.tolist()
                       if q1features_trainvalid[name].dtype == 'category']
for name in categorical_columns:
  indicator_columns = pd.get_dummies(q1features_trainvalid[name], prefix=name, drop_first=True, dtype='float')
  remaining_columns = q1features_trainvalid.drop([name], axis=1)
  q1features_trainvalid = pd.concat([remaining_columns, indicator_columns], axis=1)

In [None]:
#Converting categorical variables to dummy variables (test set)
#Each categorical attribute is replaced by float indicator columns named
#"<attribute>_<category>"; the first category is dropped as the reference level
categorical_columns = [name for name in q1features_test.columns.tolist()
                       if q1features_test[name].dtype == 'category']
for name in categorical_columns:
  indicator_columns = pd.get_dummies(q1features_test[name], prefix=name, drop_first=True, dtype='float')
  remaining_columns = q1features_test.drop([name], axis=1)
  q1features_test = pd.concat([remaining_columns, indicator_columns], axis=1)

In [None]:
#Collapse the training target to a boolean and store it as 0.0/1.0 floats
q1target_trainvalid = q1target_trainvalid.astype('bool').astype('float')

In [None]:
#Collapse the test target to a boolean and store it as 0.0/1.0 floats
q1target_test = q1target_test.astype('bool').astype('float')

Multicollinearity can lead to model overfitting and can artificially hide the importance of explanatory variables. Multicollinearity (and collinearity) can be assessed using variance inflation factor values. A VIF value of over 10 indicates serious multicollinearity (Vittinghoff, E., Shiboski, S., Glidden, D., & McCulloch, C. (2004). Regression Methods in Biostatistics: Linear, Logistic, Survival and Repeated Measures Models. New York:Springer. https://doi.org/10.1007/b138825). Independent variables were iteratively removed from the model starting with the variable with the highest VIF value until all VIF values were below an acceptable threshold.

In [None]:
#Removing highly correlated variables
#Every column whose name starts with one of these prefixes is dropped; the
#prefixes are processed in the listed order, one at a time
correlated_prefixes = [
    'IDAY', 'QSTVER', 'STATE', '_AGE65YR', 'IMONTH', 'INTERVIEWDATE',
    '_AGE_G', '_AGE80', 'HTM4', 'WTKG3', 'USENOW3', 'EDUCA',
    'SLEPTIM1', 'PRIMINSR', 'FMONTH', 'SMOKE100',
]
for prefix in correlated_prefixes:
  unwanted = q1features_trainvalid.columns[q1features_trainvalid.columns.str.startswith(prefix)]
  q1features_trainvalid.drop(unwanted, axis=1, inplace=True)

In [None]:
#Removing highly correlated variables (test set)
#Every column whose name starts with one of these prefixes is dropped; the
#prefixes are processed in the listed order, one at a time
correlated_prefixes = [
    'IDAY', 'QSTVER', 'STATE', '_AGE65YR', 'IMONTH', 'INTERVIEWDATE',
    '_AGE_G', '_AGE80', 'HTM4', 'WTKG3', 'USENOW3', 'EDUCA',
    'SLEPTIM1', 'PRIMINSR', 'FMONTH', 'SMOKE100',
]
for prefix in correlated_prefixes:
  unwanted = q1features_test.columns[q1features_test.columns.str.startswith(prefix)]
  q1features_test.drop(unwanted, axis=1, inplace=True)

Feature selection can help reduce model overfitting. Feature selection can also result in dimensionality reduction, which can improve computational efficiency. Here, a decision tree model was used to evaluate feature importance; this model has a built-in method to compute Gini importance values for each attribute. Gini importance is also referred to as "mean decrease impurity" and is a measure of how a given attribute improves the purity of a node. Attributes with a Gini importance value of less than a robust threshold of 0.01 were removed from the model. Note that dummy variables generated using one-hot encoding will be grouped together; if one variable meets the threshold then all of the set of dummy variables will remain in the model. This is because the entire set of dummy variables is required to represent one categorical variable.

In [None]:
#Fit a depth-limited decision tree on the training data and read its
#Gini importances to decide which attributes are worth keeping
clf = DecisionTreeClassifier(random_state=8, max_depth=16)
clf.fit(q1features_trainvalid, q1target_trainvalid)
y_pred = clf.predict(q1features_test)

#Attributes whose importance is strictly above the threshold are selected
threshold = 0.01
importances = clf.feature_importances_
above_threshold = importances > threshold
selected_features = q1features_trainvalid.columns[above_threshold]
selected_features.tolist()

In [None]:
#The prefixes in the original filter were empty strings, and str.startswith('')
#is True for every name, so all columns were kept and the feature selection
#had no effect. The list is rebuilt here from the features chosen by the
#decision tree above (selected_features).
#Dummy variables are kept as a group: get_dummies names them
#"<attribute>_<category>", so the text before the last underscore is treated as
#the attribute name and every column sharing that "<attribute>_" stem is kept.
#NOTE(review): this assumes category labels contain no underscore - confirm
selected_names = set(selected_features)
selected_stems = tuple(name.rsplit('_', 1)[0] + '_' for name in selected_features)
filter_col = [col for col in q1features_trainvalid
              if col in selected_names or col.startswith(selected_stems)]

In [None]:
#Restrict both feature frames to the selected columns, in the same order
q1features_trainvalid = q1features_trainvalid.loc[:, filter_col]
q1features_test = q1features_test.loc[:, filter_col]

The dataset is imbalanced; a majority of respondents reported that they did not have COVID-19 symptoms that lasted longer than 3 months. Imbalanced data adversely affects model performance; if the model encounters few instances of the minority class then it will be unable to effectively learn from this class.

The SMOTE algorithm was used to oversample the minority class; this addresses the issue of imbalanced data.

In [None]:
#SMOTE Algorithm
#SMOTE is already imported in the first cell of the notebook, so it is not
#imported again here.
#Only the training data is oversampled; the test set keeps its original class balance.
smo = SMOTE(random_state = 2, k_neighbors=10)
#np.ravel flattens the target to a 1-D array for both pandas and NumPy inputs
#(Series.ravel is deprecated in recent pandas releases)
q1features_trainvalid_SM, q1target_trainvalid_SM = smo.fit_resample(q1features_trainvalid, np.ravel(q1target_trainvalid))

1. Running models using hyperparameters chosen to optimize F-2 Score

Logistic Regression

In [None]:
#Logistic regression fitted on the SMOTE-balanced training data.
#statsmodels does not add an intercept on its own, so a constant column is
#added explicitly to both design matrices. has_constant='add' forces the
#column to be added to both frames so that their columns always line up.
logRegTrainDesign = sm.add_constant(q1features_trainvalid_SM, has_constant='add')
logRegTestDesign = sm.add_constant(q1features_test, has_constant='add')
logReg = sm.Logit(np.ravel(q1target_trainvalid_SM), logRegTrainDesign).fit()
#Predicted probabilities are turned into class labels at a 0.5 cut-off
logRegPrediction = logReg.predict(logRegTestDesign)
logRegPrediction = np.where(logRegPrediction > 0.5, 1, 0)

#Test-set metrics; F2 weights recall more heavily than precision
f2_score = fbeta_score(q1target_test, logRegPrediction, beta=2)
accuracy =  metrics.accuracy_score(q1target_test, logRegPrediction)
recall =  metrics.recall_score(q1target_test, logRegPrediction)
precision = metrics.precision_score(q1target_test, logRegPrediction)

In [None]:
#Display the F2 score computed in the logistic regression cell above; the value below was recorded from an earlier run
f2_score
#0.5161352497384217

In [None]:
#Display the accuracy computed in the logistic regression cell above; the value below was recorded from an earlier run
accuracy
#0.470629428241885

In [None]:
#Display the recall computed in the logistic regression cell above; the value below was recorded from an earlier run
recall
#0.7043020852902498

In [None]:
#Display the precision computed in the logistic regression cell above; the value below was recorded from an earlier run
precision
#0.24950086516704378

In [None]:
#Confusion matrix of the logistic regression predictions on the test set
#(scikit-learn puts the true classes on the rows and the predicted classes on the columns)
logRegMatrix = metrics.confusion_matrix(y_true=q1target_test, y_pred=logRegPrediction)
logRegMatrix
#array([[ 7676, 11277],
#       [ 1574,  3749]])


Naive Bayes

In [None]:
#Gaussian Naive Bayes fitted on the SMOTE-balanced training data
model = GaussianNB()
nbModel = model.fit(q1features_trainvalid_SM, q1target_trainvalid_SM)
#A test row is labelled positive when its predicted probability
#(second column of predict_proba) reaches the 0.1 cut-off
nbPositiveProba = nbModel.predict_proba(q1features_test)[:,1]
nbPrediction = (nbPositiveProba >= 0.1).astype(bool)

#Test-set metrics; F2 weights recall more heavily than precision
f2_score = fbeta_score(y_true=q1target_test, y_pred=nbPrediction, beta=2)
accuracy = metrics.accuracy_score(y_true=q1target_test, y_pred=nbPrediction)
recall = metrics.recall_score(y_true=q1target_test, y_pred=nbPrediction)
precision = metrics.precision_score(y_true=q1target_test, y_pred=nbPrediction)

In [None]:
#Display the F2 score computed in the Naive Bayes cell above; the value below was recorded from an earlier run
f2_score
#0.5846800827865604

In [None]:
#Display the accuracy computed in the Naive Bayes cell above; the value below was recorded from an earlier run
accuracy
#0.22446037238424782

In [None]:
#Display the recall computed in the Naive Bayes cell above; the value below was recorded from an earlier run
recall
#0.9977456321623145

In [None]:
#Display the precision computed in the Naive Bayes cell above; the value below was recorded from an earlier run
precision
#0.22013595291386887

In [None]:
#Confusion matrix of the Naive Bayes predictions on the test set
#(scikit-learn puts the true classes on the rows and the predicted classes on the columns)
nbMatrix = metrics.confusion_matrix(y_true=q1target_test, y_pred=nbPrediction)
nbMatrix
#array([[  138, 18815],
#       [   12,  5311]])


Random Forest

In [None]:
#Shallow random forest (75 trees, depth 3) fitted on the SMOTE-balanced training data
rf = RandomForestClassifier(random_state=42, max_depth=3, n_estimators=75)
rf.fit(q1features_trainvalid_SM, q1target_trainvalid_SM)
#A test row is labelled positive when its predicted probability
#(second column of predict_proba) reaches the 0.4 cut-off
rfPositiveProba = rf.predict_proba(q1features_test)[:,1]
rfPrediction = (rfPositiveProba >= 0.4).astype(bool)

#Test-set metrics; F2 weights recall more heavily than precision
f2_score = fbeta_score(y_true=q1target_test, y_pred=rfPrediction, beta=2)
accuracy = metrics.accuracy_score(y_true=q1target_test, y_pred=rfPrediction)
recall = metrics.recall_score(y_true=q1target_test, y_pred=rfPrediction)
precision = metrics.precision_score(y_true=q1target_test, y_pred=rfPrediction)

In [None]:
#Display the F2 score computed in the Random Forest cell above; the value below was recorded from an earlier run
f2_score
#0.5861439415777842

In [None]:
#Display the accuracy computed in the Random Forest cell above; the value below was recorded from an earlier run
accuracy
#0.34507332344702585

In [None]:
#Display the recall computed in the Random Forest cell above; the value below was recorded from an earlier run
recall
#0.9167762539921097

In [None]:
#Display the precision computed in the Random Forest cell above; the value below was recorded from an earlier run
precision
#0.23996852871754523

In [None]:
#Confusion matrix of the Random Forest predictions on the test set
#(scikit-learn puts the true classes on the rows and the predicted classes on the columns)
rfMatrix = metrics.confusion_matrix(y_true=q1target_test, y_pred=rfPrediction)
rfMatrix
#array([[ 3497, 15456],
#       [  443,  4880]])


Light GBM

In [None]:
#LightGBM datasets; the test set is declared with the training data as its reference
train_data = lgb.Dataset(data=q1features_trainvalid_SM, label=q1target_trainvalid_SM)
test_data = lgb.Dataset(data=q1features_test, label=q1target_test, reference=train_data)

#Booster settings for a binary classifier evaluated with log loss
#NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
#effect when bagging_freq is non-zero, and bagging_freq is not set here - confirm
params = dict(
    num_leaves=31,
    reg_alpha=0.5,
    reg_lambda=0.5,
    min_data_in_leaf=30,
    objective="binary",
    metric="binary_logloss",
    learning_rate=0.05,
    force_row_wise=True,
    bagging_fraction=0.8,
    feature_fraction=0.8,
)
num_round=500
#The test set is passed as a validation set, so its metric is reported during training
bst = lgb.train(params=params, train_set=train_data, num_boost_round=num_round, valid_sets=[test_data])
#Predicted probabilities are turned into 0/1 labels at a 0.1 cut-off
y_pred = bst.predict(q1features_test)
above_cutoff = y_pred > 0.1
y_pred_binary = above_cutoff.astype(int)

#Test-set metrics; F2 weights recall more heavily than precision
f2_score = fbeta_score(y_true=q1target_test, y_pred=y_pred_binary, beta=2)
accuracy = metrics.accuracy_score(y_true=q1target_test, y_pred=y_pred_binary)
recall = metrics.recall_score(y_true=q1target_test, y_pred=y_pred_binary)
precision = metrics.precision_score(y_true=q1target_test, y_pred=y_pred_binary)

In [None]:
#Display the F2 score computed in the Light GBM cell above; the value below was recorded from an earlier run
f2_score
#0.5826131052120997

In [None]:
#Display the accuracy computed in the Light GBM cell above; the value below was recorded from an earlier run
accuracy
#0.22487230186192123

In [None]:
#Display the recall computed in the Light GBM cell above; the value below was recorded from an earlier run
recall
#0.9928611685139959

In [None]:
#Display the precision computed in the Light GBM cell above; the value below was recorded from an earlier run
precision
#0.2196226728723404

In [None]:
#Confusion matrix of the Light GBM predictions on the test set
#(scikit-learn puts the true classes on the rows and the predicted classes on the columns)
bstMatrix = metrics.confusion_matrix(y_true=q1target_test, y_pred=y_pred_binary)
bstMatrix
#array([[  174, 18779],
#       [   38,  5285]])


2. Running models using hyperparameters chosen to optimize accuracy

Logistic Regression

In [None]:
#Logistic regression fitted on the SMOTE-balanced training data.
#statsmodels does not add an intercept on its own, so a constant column is
#added explicitly to both design matrices. has_constant='add' forces the
#column to be added to both frames so that their columns always line up.
logRegTrainDesign = sm.add_constant(q1features_trainvalid_SM, has_constant='add')
logRegTestDesign = sm.add_constant(q1features_test, has_constant='add')
logReg = sm.Logit(np.ravel(q1target_trainvalid_SM), logRegTrainDesign).fit()
#Predicted probabilities are turned into class labels at a 0.5 cut-off
logRegPrediction = logReg.predict(logRegTestDesign)
logRegPrediction = np.where(logRegPrediction > 0.5, 1, 0)

#Test-set metrics for this configuration
f2_score = fbeta_score(q1target_test, logRegPrediction, beta=2)
accuracy =  metrics.accuracy_score(q1target_test, logRegPrediction)
recall =  metrics.recall_score(q1target_test, logRegPrediction)
precision = metrics.precision_score(q1target_test, logRegPrediction)

In [None]:
#Display the F2 score computed in the logistic regression cell above; the value below was recorded from an earlier run
f2_score
#0.5161352497384217

In [None]:
#Display the accuracy computed in the logistic regression cell above; the value below was recorded from an earlier run
accuracy
#0.470629428241885

In [None]:
#Display the recall computed in the logistic regression cell above; the value below was recorded from an earlier run
recall
#0.7043020852902498

In [None]:
#Display the precision computed in the logistic regression cell above; the value below was recorded from an earlier run
precision
#0.24950086516704378

In [None]:
#Confusion matrix of the logistic regression predictions on the test set
#(scikit-learn puts the true classes on the rows and the predicted classes on the columns)
logRegMatrix = metrics.confusion_matrix(y_true=q1target_test, y_pred=logRegPrediction)
logRegMatrix
#array([[ 7676, 11277],
#       [ 1574,  3749]])


Naive Bayes

In [None]:
#Gaussian Naive Bayes fitted on the SMOTE-balanced training data
model = GaussianNB()
nbModel = model.fit(q1features_trainvalid_SM, q1target_trainvalid_SM)
#A test row is labelled positive when its predicted probability
#(second column of predict_proba) reaches the 0.7 cut-off
nbPositiveProba = nbModel.predict_proba(q1features_test)[:,1]
nbPrediction = (nbPositiveProba >= 0.7).astype(bool)

#Test-set metrics for this configuration
f2_score = fbeta_score(y_true=q1target_test, y_pred=nbPrediction, beta=2)
accuracy = metrics.accuracy_score(y_true=q1target_test, y_pred=nbPrediction)
recall = metrics.recall_score(y_true=q1target_test, y_pred=nbPrediction)
precision = metrics.precision_score(y_true=q1target_test, y_pred=nbPrediction)

In [None]:
#Display the F2 score computed in the Naive Bayes cell above; the value below was recorded from an earlier run
f2_score
#0.33712568014915717

In [None]:
#Display the accuracy computed in the Naive Bayes cell above; the value below was recorded from an earlier run
accuracy
#0.7212061295106278

In [None]:
#Display the recall computed in the Naive Bayes cell above; the value below was recorded from an earlier run
recall
#0.33289498403156115

In [None]:
#Display the precision computed in the Naive Bayes cell above; the value below was recorded from an earlier run
precision
#0.3551813990779715

In [None]:
#Confusion matrix of the Naive Bayes predictions on the test set
#(scikit-learn puts the true classes on the rows and the predicted classes on the columns)
nbMatrix = metrics.confusion_matrix(y_true=q1target_test, y_pred=nbPrediction)
nbMatrix
#array([[15736,  3217],
#       [ 3551,  1772]])


Random Forest

In [None]:
#Deeper random forest (75 trees, depth 10) fitted on the SMOTE-balanced training data
rf = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=75)
rf.fit(q1features_trainvalid_SM, q1target_trainvalid_SM)
#A test row is labelled positive when its predicted probability
#(second column of predict_proba) reaches the 0.5 cut-off
rfPositiveProba = rf.predict_proba(q1features_test)[:,1]
rfPrediction = (rfPositiveProba >= 0.5).astype(bool)

#Test-set metrics for this configuration
f2_score = fbeta_score(y_true=q1target_test, y_pred=rfPrediction, beta=2)
accuracy = metrics.accuracy_score(y_true=q1target_test, y_pred=rfPrediction)
recall = metrics.recall_score(y_true=q1target_test, y_pred=rfPrediction)
precision = metrics.precision_score(y_true=q1target_test, y_pred=rfPrediction)

In [None]:
#Display the F2 score computed in the Random Forest cell above; the value below was recorded from an earlier run
f2_score
#0.1847996941246442

In [None]:
#Display the accuracy computed in the Random Forest cell above; the value below was recorded from an earlier run
accuracy
#0.7598451145163948

In [None]:
#Display the recall computed in the Random Forest cell above; the value below was recorded from an earlier run
recall
#0.16344166823219988

In [None]:
#Display the precision computed in the Random Forest cell above; the value below was recorded from an earlier run
precision
#0.3871829105473965

In [None]:
#Confusion matrix of the Random Forest predictions on the test set
#(scikit-learn puts the true classes on the rows and the predicted classes on the columns)
rfMatrix = metrics.confusion_matrix(y_true=q1target_test, y_pred=rfPrediction)
rfMatrix
#array([[17576,  1377],
#       [ 4453,   870]])


Light GBM

In [None]:
#LightGBM datasets; the test set is declared with the training data as its reference
train_data = lgb.Dataset(data=q1features_trainvalid_SM, label=q1target_trainvalid_SM)
test_data = lgb.Dataset(data=q1features_test, label=q1target_test, reference=train_data)

#Booster settings for a binary classifier evaluated with log loss
#NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
#effect when bagging_freq is non-zero, and bagging_freq is not set here - confirm
params = dict(
    num_leaves=31,
    reg_alpha=0.5,
    reg_lambda=0.5,
    min_data_in_leaf=5,
    objective="binary",
    metric="binary_logloss",
    learning_rate=0.05,
    force_row_wise=True,
    bagging_fraction=0.8,
    feature_fraction=0.8,
)
num_round=500
#The test set is passed as a validation set, so its metric is reported during training
bst = lgb.train(params=params, train_set=train_data, num_boost_round=num_round, valid_sets=[test_data])
#Predicted probabilities are turned into 0/1 labels at a 0.6 cut-off
y_pred = bst.predict(q1features_test)
above_cutoff = y_pred > 0.6
y_pred_binary = above_cutoff.astype(int)

#Test-set metrics for this configuration
f2_score = fbeta_score(y_true=q1target_test, y_pred=y_pred_binary, beta=2)
accuracy = metrics.accuracy_score(y_true=q1target_test, y_pred=y_pred_binary)
recall = metrics.recall_score(y_true=q1target_test, y_pred=y_pred_binary)
precision = metrics.precision_score(y_true=q1target_test, y_pred=y_pred_binary)

In [None]:
#Display the F2 score computed in the Light GBM cell above; the value below was recorded from an earlier run
f2_score
#0.14990736061984167

In [None]:
#Display the accuracy computed in the Light GBM cell above; the value below was recorded from an earlier run
accuracy
#0.7382188169385401

In [None]:
#Display the recall computed in the Light GBM cell above; the value below was recorded from an earlier run
recall
#0.1337591583693406

In [None]:
#Display the precision computed in the Light GBM cell above; the value below was recorded from an earlier run
precision
#0.2899022801302932

In [None]:
#Confusion matrix of the Light GBM predictions on the test set
#(scikit-learn puts the true classes on the rows and the predicted classes on the columns)
bstMatrix = metrics.confusion_matrix(y_true=q1target_test, y_pred=y_pred_binary)
bstMatrix
#array([[17209,  1744],
#       [ 4611,   712]])


Determining Feature Importance of Optimal Model

1. Random Forest with hyperparameters chosen to optimize F2-Score

In [None]:
#RandomForestClassifier is already imported in the first cell of the notebook,
#so it is not imported again here.
#Refit the forest with the F2-oriented settings (75 trees, depth 3) and rank
#the attributes by their Gini importance, largest first
rf = RandomForestClassifier(n_estimators=75, max_depth=3, random_state=42)
rf.fit(q1features_trainvalid_SM, q1target_trainvalid_SM)
importances = rf.feature_importances_
feature_names=rf.feature_names_in_
feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False)
print(feature_imp_df)

In [None]:
#  Feature  Gini Importance
#13    SEXVAR_True         0.236007
#9     DECIDE_True         0.166509
#12  MEDCOST1_True         0.148601
#7   ADDEPEV3_True         0.145333
#11  LCSCTSC1_True         0.111050
#5        PHYSHLTH         0.076114
#8    ASTHMA3_True         0.069548
#3        DROCDY4_         0.022499
#10  FLUSHOT7_True         0.019594
#0        CHILDREN         0.001614
#4           IYEAR         0.001038
#6           SEQNO         0.000850
#2        DISPCODE         0.000848
#1        CPDEMO1C         0.000395

2. Random Forest with hyperparameters chosen to optimize accuracy

In [None]:
#Refit the forest with the accuracy-oriented settings (75 trees, depth 10) and
#rank the attributes by their Gini importance, largest first
rf = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=75)
rf.fit(q1features_trainvalid_SM, q1target_trainvalid_SM)
feature_names=rf.feature_names_in_
importances = rf.feature_importances_
importance_table = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances})
feature_imp_df = importance_table.sort_values('Gini Importance', ascending=False)
print(feature_imp_df)

In [None]:
 #       Feature          Gini Importance
#13    SEXVAR_True         0.176802
#11  LCSCTSC1_True         0.115792
#12  MEDCOST1_True         0.115212
#7   ADDEPEV3_True         0.111068
#9     DECIDE_True         0.099609
#5        PHYSHLTH         0.075115
#8    ASTHMA3_True         0.074949
#3        DROCDY4_         0.058059
#10  FLUSHOT7_True         0.049431
#0        CHILDREN         0.027794
#4           IYEAR         0.025080
#2        DISPCODE         0.024058
#1        CPDEMO1C         0.023786
#6           SEQNO         0.023245