In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
import pyarrow.parquet as pq

## Reading data

In [2]:
dir_name = 'C:\Cloud\OneDrive - Emory University\Papers\PASC Diabetes Incidence'
outcome_df = pd.read_parquet(dir_name + '\working\models pdadm\pdadm104_ipw for cohort membership data.parquet' )
outcome_df.head()

Unnamed: 0,ID,female,nhwhite,nhblack,hispanic,nhother,age,matchid,index_date,site,...,lab_LOINC_11277_1_gtQ3,lab_LOINC_5821_4_gtQ3,lab_LOINC_13945_1_gtQ3,lab_LOINC_46421_4_gtQ3,lab_LOINC_6298_4_gtQ3,lab_LOINC_19123_9_gtQ3,lab_LOINC_10839_9_gtQ3,lab_LOINC_30934_4_gtQ3,lab_LOINC_1968_7_gtQ3,lab_LOINC_2157_6_gtQ3
0,12MAR202320220187400000002,1.0,1.0,0.0,0.0,0.0,33.0,,2021-01-27,Source1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12MAR202320220187400000003,1.0,1.0,0.0,0.0,0.0,72.0,,2018-08-20,Source5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12MAR202320220187400000004,0.0,1.0,0.0,0.0,0.0,84.0,,2020-05-21,Source7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12MAR202320220187400000005,1.0,1.0,0.0,0.0,0.0,27.0,,2021-03-01,Source1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12MAR202320220187400000007,1.0,0.0,0.0,0.0,1.0,18.0,,2021-03-08,Source2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# The below lines were commented out after testing a simple model
# outcome_df = outcome_df.sample(frac=0.1, random_state=1)
# sample_outcome_df.shape
outcome_df.shape

(252247, 1767)

In [4]:
# https://stackoverflow.com/questions/11587782/creating-dummy-variables-in-pandas-for-python
outcome_df = pd.get_dummies(outcome_df,prefix='',prefix_sep='_',
                                           columns=['site','calendar_month','payer_type_primary','payer_type_secondary'],drop_first=True)
outcome_df.shape
# https://www.kdnuggets.com/2020/07/easy-guide-data-preprocessing-python.html
# Also lists one-hot encoding as an option

(252247, 1792)

## Code for Random Forest Classifier
https://scikit-learn.org/stable/modules/grid_search.html

In [5]:
rf = RandomForestClassifier(random_state=1)
# n_estimators = number of trees in the forest
# random_state = controls both the randomness of the bootstrapping of the samples used when building trees (if bootstrap=True), 
# and the sampling of the features to consider when looking for the best split at each node (if max_features < n_features)
# min_samples_leaf = [10, 25] # Comment out when running direct PS estimation
# n_estimators = [1000, 2000] # Comment out when running direct PS estimation

min_samples_leaf = [10]
n_estimators = [2000]

# TRIAL 
# min_samples_leaf = [500, 1000]
# n_estimators = [5, 10] # Commented out after testing a simple model

random_grid = {'min_samples_leaf': min_samples_leaf,
               'n_estimators': n_estimators}
print(random_grid)


{'min_samples_leaf': [10], 'n_estimators': [2000]}


## Train-Test Split

In [6]:
y = outcome_df['COHORT']
X = outcome_df.drop(['COHORT','EXPOSED','UNEXPOSED','HISTORICAL','ID','matchid','index_date'], axis=1)

X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

## Creating different outcome variables from train-test split

In [7]:
y_train_EXPOSED = (y_train == 'exposed').astype(int)
y_train_UNEXPOSED = (y_train == 'unexposed').astype(int)
y_train_HISTORICAL = (y_train == 'historical').astype(int)

y_test_EXPOSED = (y_test == 'exposed').astype(int)
y_test_UNEXPOSED = (y_test == 'unexposed').astype(int)
y_test_HISTORICAL = (y_test == 'historical').astype(int)


In [8]:
import collections
# X_train.nhwhite.value_counts(dropna=False)
collections.Counter(y_train)


Counter({'unexposed': 132753, 'exposed': 36086, 'historical': 32958})

## Grid Search
https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer


### Exposed vs Other

In [10]:
gs_rf_EXPOSED = GridSearchCV(rf,
                      param_grid=random_grid,
                      # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring
                      # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
                      scoring = make_scorer(recall_score, average='weighted'),
                      cv=5)
gs_rf_EXPOSED.fit(X_train, y_train_EXPOSED)
gs_rf_EXPOSED.best_params_

{'min_samples_leaf': 10, 'n_estimators': 2000}

In [11]:
gs_rf_EXPOSED.cv_results_

{'mean_fit_time': array([2852.43603697]),
 'std_fit_time': array([30.06899278]),
 'mean_score_time': array([31.26862636]),
 'std_score_time': array([0.30069676]),
 'param_min_samples_leaf': masked_array(data=[10],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[2000],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'min_samples_leaf': 10, 'n_estimators': 2000}],
 'split0_test_score': array([0.83704163]),
 'split1_test_score': array([0.83731417]),
 'split2_test_score': array([0.83656681]),
 'split3_test_score': array([0.83716148]),
 'split4_test_score': array([0.83686414]),
 'mean_test_score': array([0.83698965]),
 'std_test_score': array([0.00025771]),
 'rank_test_score': array([1])}

In [12]:
gs_rf_EXPOSED.best_score_

0.836989646146843

In [13]:
gs_rf_EXPOSED.score(X_test, y_test_EXPOSED)

0.8443211100099108

In [14]:
# https://datatofish.com/numpy-array-to-pandas-dataframe/
y_pred_EXPOSED = gs_rf_EXPOSED.predict(X)
y_pred_proba_EXPOSED = gs_rf_EXPOSED.predict_proba(X)
pd.DataFrame(y_pred_proba_EXPOSED,columns=['other','exposed']).to_csv(dir_name + '\working\models pdadm\pdadm106_predicted probability for EXPOSED_min10_ntree2000.csv')
pd.crosstab(y,y_pred_EXPOSED)


col_0,0,1
COHORT,Unnamed: 1_level_1,Unnamed: 2_level_1
exposed,36427,8484
historical,41418,0
unexposed,165897,21


### Unexposed vs Other

In [15]:
gs_rf_UNEXPOSED = GridSearchCV(rf,
                      param_grid=random_grid,
                      # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring
                      # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
                      scoring = make_scorer(recall_score, average='weighted'),
                      cv=5)
gs_rf_UNEXPOSED.fit(X_train, y_train_UNEXPOSED)
gs_rf_UNEXPOSED.best_params_

{'min_samples_leaf': 10, 'n_estimators': 2000}

In [16]:
gs_rf_UNEXPOSED.cv_results_

{'mean_fit_time': array([2824.1160593]),
 'std_fit_time': array([12.32566337]),
 'mean_score_time': array([32.57838888]),
 'std_score_time': array([0.30571743]),
 'param_min_samples_leaf': masked_array(data=[10],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[2000],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'min_samples_leaf': 10, 'n_estimators': 2000}],
 'split0_test_score': array([0.85210605]),
 'split1_test_score': array([0.84995045]),
 'split2_test_score': array([0.85088828]),
 'split3_test_score': array([0.85348993]),
 'split4_test_score': array([0.85054139]),
 'mean_test_score': array([0.85139522]),
 'std_test_score': array([0.00126226]),
 'rank_test_score': array([1])}

In [17]:
gs_rf_UNEXPOSED.best_score_

0.85139521763923

In [18]:
gs_rf_UNEXPOSED.score(X_test, y_test_UNEXPOSED)

0.8577205153617443

In [19]:

y_pred_UNEXPOSED = gs_rf_UNEXPOSED.predict(X)
y_pred_proba_UNEXPOSED = gs_rf_UNEXPOSED.predict_proba(X)

pd.DataFrame(y_pred_proba_UNEXPOSED,columns=['other','unexposed']).to_csv(dir_name + '\working\models pdadm\pdadm106_predicted probability for UNEXPOSED_min10_ntree2000.csv')
pd.crosstab(y,y_pred_UNEXPOSED)



col_0,0,1
COHORT,Unnamed: 1_level_1,Unnamed: 2_level_1
exposed,19312,25599
historical,37838,3580
unexposed,406,165512


### Historical vs Other

In [20]:
gs_rf_HISTORICAL = GridSearchCV(rf,
                      param_grid=random_grid,
                      # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring
                      # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
                      scoring = make_scorer(recall_score, average='weighted'),
                      cv=5)
gs_rf_HISTORICAL.fit(X_train, y_train_HISTORICAL)
gs_rf_HISTORICAL.best_params_

{'min_samples_leaf': 10, 'n_estimators': 2000}

In [21]:
gs_rf_HISTORICAL.cv_results_

{'mean_fit_time': array([2221.20289564]),
 'std_fit_time': array([295.33555381]),
 'mean_score_time': array([27.60567522]),
 'std_score_time': array([6.98255112]),
 'param_min_samples_leaf': masked_array(data=[10],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[2000],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'min_samples_leaf': 10, 'n_estimators': 2000}],
 'split0_test_score': array([0.9622894]),
 'split1_test_score': array([0.95948959]),
 'split2_test_score': array([0.96280879]),
 'split3_test_score': array([0.96164424]),
 'split4_test_score': array([0.96159469]),
 'mean_test_score': array([0.96156534]),
 'std_test_score': array([0.00113015]),
 'rank_test_score': array([1])}

In [22]:
gs_rf_HISTORICAL.best_score_

0.9615653421591761

In [23]:
gs_rf_HISTORICAL.score(X_test, y_test_HISTORICAL)

0.9629732408325075

In [24]:
y_pred_HISTORICAL = gs_rf_HISTORICAL.predict(X)
y_pred_proba_HISTORICAL = gs_rf_HISTORICAL.predict_proba(X)
pd.DataFrame(y_pred_proba_HISTORICAL,columns=['other','historical']).to_csv(dir_name + '\working\models pdadm\pdadm106_predicted probability for HISTORICAL_min10_ntree2000.csv')
pd.crosstab(y,y_pred_HISTORICAL)

col_0,0,1
COHORT,Unnamed: 1_level_1,Unnamed: 2_level_1
exposed,44903,8
historical,8064,33354
unexposed,165917,1
