In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
import pyarrow.parquet as pq

## Reading data

In [11]:
dir_name = 'C:\Cloud\OneDrive - Emory University\Papers\PASC Diabetes Incidence'
outcome_df = pd.read_parquet(dir_name + '\working\sensitivity utilization\pdsu204_ipw for loss to followup data.parquet' )
outcome_df.head()

Unnamed: 0,ID,female,nhwhite,nhblack,hispanic,nhother,age,matchid,index_date,site,...,pre_J04_gtQ3,pre_H01_gtQ3,lab_LOINC_788_0_gtQ3,lab_LOINC_10834_0_gtQ3,lab_LOINC_6301_6_gtQ3,lab_LOINC_5902_2_gtQ3,lab_LOINC_75241_0_gtQ3,lab_LOINC_1988_5_gtQ3,lab_LOINC_13458_5_gtQ3,lab_LOINC_4537_7_gtQ3
0,12MAR202320220187400000002,1.0,1.0,0.0,0.0,0.0,33.0,,2021-01-27,Source1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12MAR202320220187400000003,1.0,1.0,0.0,0.0,0.0,72.0,,2018-08-20,Source5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12MAR202320220187400000004,0.0,1.0,0.0,0.0,0.0,84.0,,2020-05-21,Source7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12MAR202320220187400000005,1.0,1.0,0.0,0.0,0.0,27.0,,2021-03-01,Source1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12MAR202320220187400000007,1.0,0.0,0.0,0.0,1.0,18.0,,2021-03-08,Source2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# The below lines were commented out after testing a simple model
# outcome_df = outcome_df.sample(frac=0.1, random_state=1)
# sample_outcome_df.shape
outcome_df.shape

(252247, 1897)

In [13]:
# https://stackoverflow.com/questions/11587782/creating-dummy-variables-in-pandas-for-python
outcome_df = pd.get_dummies(outcome_df,prefix='',prefix_sep='_',
                                           columns=['site','calendar_month','payer_type_primary','payer_type_secondary'],drop_first=True)
outcome_df.shape
# https://www.kdnuggets.com/2020/07/easy-guide-data-preprocessing-python.html
# Also lists one-hot encoding as an option

(252247, 1922)

## Code for Random Forest Classifier
https://scikit-learn.org/stable/modules/grid_search.html

In [14]:
rf = RandomForestClassifier(random_state=1)
# n_estimators = number of trees in the forest
# random_state = controls both the randomness of the bootstrapping of the samples used when building trees (if bootstrap=True), 
# and the sampling of the features to consider when looking for the best split at each node (if max_features < n_features)
# min_samples_leaf = [10, 25] # Comment out when running direct PS estimation
# n_estimators = [1000, 2000] # Comment out when running direct PS estimation

min_samples_leaf = [10]
n_estimators = [2000]

# TRIAL 
# min_samples_leaf = [500, 1000]
# n_estimators = [5, 10] # Commented out after testing a simple model

random_grid = {'min_samples_leaf': min_samples_leaf,
               'n_estimators': n_estimators}
print(random_grid)


{'min_samples_leaf': [10], 'n_estimators': [2000]}


## Train-Test Split

In [15]:
y = outcome_df['in_dm_followup_ID']
X = outcome_df.drop(['EXPOSED','COHORT','ID','matchid','index_date'], axis=1)

X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [16]:
import collections
# X_train.nhwhite.value_counts(dropna=False)
collections.Counter(y_train)


Counter({1.0: 181506, 0.0: 20291})

## Grid Search
https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer


In [18]:
gs_rf_ltfu = GridSearchCV(rf,
                      param_grid=random_grid,
                      # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring
                      # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
                      scoring = make_scorer(recall_score, average='weighted'),
                      cv=5)
gs_rf_ltfu.fit(X_train, y_train)
gs_rf_ltfu.best_params_

{'min_samples_leaf': 10, 'n_estimators': 2000}

In [19]:
gs_rf_ltfu.cv_results_

{'mean_fit_time': array([1539.64184561]),
 'std_fit_time': array([30.15949458]),
 'mean_score_time': array([17.97296286]),
 'std_score_time': array([0.19979441]),
 'param_min_samples_leaf': masked_array(data=[10],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[2000],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'min_samples_leaf': 10, 'n_estimators': 2000}],
 'split0_test_score': array([0.95374133]),
 'split1_test_score': array([0.95396432]),
 'split2_test_score': array([0.95579672]),
 'split3_test_score': array([0.95418618]),
 'split4_test_score': array([0.95329418]),
 'mean_test_score': array([0.95419655]),
 'std_test_score': array([0.00085277]),
 'rank_test_score': array([1])}

In [20]:
gs_rf_ltfu.best_score_

0.9541965474583158

In [21]:
gs_rf_ltfu.score(X_test, y_test)

0.9589494549058474

In [22]:
# https://datatofish.com/numpy-array-to-pandas-dataframe/
y_pred = gs_rf_ltfu.predict(X)
y_pred_proba = gs_rf_ltfu.predict_proba(X)
pd.DataFrame(y_pred_proba,columns=['missing','available']).to_csv(dir_name + '\working\sensitivity utilization\pdsu206_predicted probability for loss to followup_min10_ntree2000.csv')
pd.crosstab(y,y_pred)


col_0,0.0,1.0
in_dm_followup_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,17276,7943
1.0,0,227028
