In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, recall_score, \
                            classification_report, roc_auc_score, precision_score, \
                            f1_score, matthews_corrcoef, average_precision_score, \
                            precision_recall_curve, auc, roc_curve
from collections import Counter
%config Completer.use_jedi = False

In [4]:
def calc_stats(y_test, y_pred, X_test, clf):
    probs = clf.predict_proba(X_test)
    prob1 = probs[:, 1]
    stats_s = pd.Series(dtype='float')
    stats_s['recall'] = recall_score(y_test, y_pred)
    stats_s['prec'] = precision_score(y_test, y_pred)
    stats_s['MCC'] = matthews_corrcoef(y_test, y_pred)
    precision, recall, _ = precision_recall_curve(y_test, prob1, pos_label=1)
    stats_s['PR_AUC'] = auc(recall, precision)
    stats_s['avg_prec'] = average_precision_score(y_test, prob1)
    stats_s['roc_auc'] = roc_auc_score(y_test, prob1)

    
#    print(f'Accuracy = {accuracy_score(y_test, y_pred)}')
#    print(f'Balanced Accuracy = {balanced_accuracy_score(y_test, y_pred)}')
#    print(f'Prec = {precision_score(y_test, y_pred)}')
#    print(f'Recall = {recall_score(y_test, y_pred)}')
#    print(f'ROC_AUC = {roc_auc_score(y_test, y_pred)}')
#    print(f'MCC = {matthews_corrcoef(y_test, y_pred)}')
#    print(f'PR_AUC = {auc(recall, precision)}')
#    print(f'\nClassification Report:\n {classification_report(y_test, y_pred)}')
    
    return stats_s


In [5]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
def sample_data(X, y, samp_type, samp_strat, seed=0):
    if samp_type == 'over':
        sampler = RandomOverSampler(sampling_strategy=samp_strat, random_state=seed)
    elif samp_type == 'under':
        sampler = RandomUnderSampler(sampling_strategy=samp_strat, random_state=seed)
    else:
        print("Invalid 'samp_type'")
        
    # fit and apply the transform
    X_res, y_res = sampler.fit_resample(X, y)
    # summarize class distribution
    #print(Counter(y_res))
    #print(X_res.shape)
    
    return X_res, y_res

# Bootstrapped  random statistics runs

In [6]:
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
def bootstrap_stat(X, y, clf, nsamples=100, under=False, samp_strat=1.0):
    stats_df = pd.DataFrame()
    for seed in range(nsamples):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=seed)
        #print(f'In bstrap(): y_train.shape = {y_train.shape}; X_train.shape = {X_train.shape}')
        #print(f'In bstrap(): np.bincount(y_train) = {np.bincount(y_train)}')

        if under:
            # Undersample the training data
            #print('Undersampling')
            X_res, y_res = sample_data(X_train, y_train, "under", samp_strat=samp_strat, seed=seed)
        else:
            #print('No Undersampling')
            X_res, y_res = X, y # Not subsampled - for use with class_weight='balanced'
            
#        print(f'In kfold_cv: train_y.shape = {train_y.shape}')
#        if sample_weights:
#            weights = class_weight.compute_sample_weight('balanced', y=y_res)
#            print(f'np.unique(weights): {np.unique(weights)}')
#            clf.fit(X_res, y_res, sample_weight=weights)
#        else:
#            clf.fit(X_res, y_res)
#            
        clf.fit(X_res, y_res)
        y_pred = clf.predict(X_test)

        stats_s = calc_stats(y_test, y_pred, X_test, clf)
        if stats_df.empty:
            stats_df = pd.DataFrame(stats_s)
            stats_df = stats_df.T
        else:
            stats_df = stats_df.append(stats_s, ignore_index=True)

    return stats_df

In [7]:
corr_df = pd.read_csv('../../data/csl/CramerTheil/Cramer_PI_Tl_coeff_Union50.csv', index_col=0, header=None, delimiter='\t')
corr_vars = list(corr_df.index.values)
len(corr_vars)

66

In [8]:
df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
X = X[corr_vars]
y = df['trans_loss'].values

In [14]:
%%time
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features="auto",
                                     class_weight="balanced",random_state=7)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=10, under=False)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(class_weight='balanced', max_depth=40,
                       max_leaf_nodes=100, min_samples_leaf=16,
                       n_estimators=128, random_state=7)
CPU times: user 3min 23s, sys: 725 ms, total: 3min 24s
Wall time: 3min 24s


Unnamed: 0,recall,prec,MCC,PR_AUC,avg_prec,roc_auc
0,0.907831,0.130194,0.258684,0.226128,0.22637,0.840374
1,0.900097,0.130052,0.256622,0.227031,0.227219,0.838427
2,0.903964,0.130532,0.258407,0.232151,0.232388,0.840836
3,0.900419,0.130311,0.257175,0.211227,0.211588,0.837711
4,0.896229,0.129211,0.254167,0.221175,0.221383,0.836777
5,0.892362,0.129133,0.253129,0.225918,0.226119,0.835614
6,0.893329,0.128607,0.252374,0.222527,0.222712,0.838187
7,0.902675,0.130638,0.258302,0.231739,0.232079,0.842191
8,0.892684,0.12984,0.254512,0.23102,0.231245,0.839097
9,0.900097,0.129992,0.25651,0.23512,0.235444,0.847133


In [17]:
print(f'Recall:    mean = {stats_df["recall"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["recall"], (2.5, 97.5)), 4)}')
print(f'Precision: mean = {stats_df["prec"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["prec"], (2.5, 97.5)), 4)}')
print(f'MCC:       mean = {stats_df["MCC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["MCC"], (2.5, 97.5)), 4)}')
print(f'PR_AUC:    mean = {stats_df["PR_AUC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["PR_AUC"], (2.5, 97.5)), 4)}')
print(f'ROC_AUC:   mean = {stats_df["roc_auc"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["roc_auc"], (2.5, 97.5)), 4)}')

Recall:    mean = 0.8990; 95% CI = [0.8924 0.9069]
Precision: mean = 0.1299; 95% CI = [0.1287 0.1306]
MCC:       mean = 0.2560; 95% CI = [0.2526 0.2586]
PR_AUC:    mean = 0.2264; 95% CI = [0.2137 0.2344]
ROC_AUC:   mean = 0.8396; 95% CI = [0.8359 0.8459]


# RF w/out Freq; max_features=auto (Union50, samp_strat=0.7)
### This mimics best original performer

In [19]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features="auto",
                                     class_weight=None)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=1000, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(max_depth=40, max_leaf_nodes=100, min_samples_leaf=16,
                       n_estimators=128)
CPU times: user 1h 3min 1s, sys: 50.7 s, total: 1h 3min 52s
Wall time: 1h 3min 52s


Unnamed: 0,recall,prec,MCC,PR_AUC,avg_prec,roc_auc
0,0.863358,0.138385,0.262956,0.209320,0.209548,0.831170
1,0.848534,0.139025,0.260598,0.207184,0.207387,0.828535
2,0.864325,0.139448,0.265010,0.212737,0.213097,0.831629
3,0.861747,0.139351,0.264240,0.194659,0.194936,0.828888
4,0.856590,0.137913,0.260562,0.199217,0.199440,0.826750
...,...,...,...,...,...,...
996,0.856268,0.137926,0.260509,0.206368,0.206625,0.830055
997,0.861102,0.137019,0.260061,0.195075,0.195312,0.827799
998,0.857557,0.136210,0.257829,0.204968,0.205224,0.828594
999,0.862391,0.140193,0.265831,0.205344,0.205664,0.832100


In [20]:
print(f'Recall:    mean = {stats_df["recall"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["recall"], (2.5, 97.5)), 4)}')
print(f'Precision: mean = {stats_df["prec"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["prec"], (2.5, 97.5)), 4)}')
print(f'MCC:       mean = {stats_df["MCC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["MCC"], (2.5, 97.5)), 4)}')
print(f'PR_AUC:    mean = {stats_df["PR_AUC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["PR_AUC"], (2.5, 97.5)), 4)}')
print(f'ROC_AUC:   mean = {stats_df["roc_auc"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["roc_auc"], (2.5, 97.5)), 4)}')

Recall:    mean = 0.8573; 95% CI = [0.8447 0.8692]
Precision: mean = 0.1379; 95% CI = [0.1358 0.1399]
MCC:       mean = 0.2607; 95% CI = [0.2554 0.2657]
PR_AUC:    mean = 0.2043; 95% CI = [0.1946 0.2147]
ROC_AUC:   mean = 0.8295; 95% CI = [0.8246 0.8342]


In [11]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features="auto",
                                     class_weight=None)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=100, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(max_depth=40, max_leaf_nodes=100, min_samples_leaf=16,
                       n_estimators=128)
CPU times: user 5min 50s, sys: 5.86 s, total: 5min 56s
Wall time: 5min 56s


Unnamed: 0,recall,prec,MCC,PR_AUC,avg_prec,roc_auc
0,0.859491,0.138015,0.261414,0.209595,0.209959,0.831377
1,0.845311,0.138901,0.259635,0.205329,0.205554,0.827086
2,0.864003,0.139396,0.264845,0.214118,0.214471,0.831755
3,0.862391,0.139513,0.264669,0.196097,0.196363,0.828866
4,0.857879,0.137985,0.260986,0.199433,0.199650,0.827501
...,...,...,...,...,...,...
96,0.845633,0.138207,0.258522,0.198901,0.199228,0.826272
97,0.867225,0.140537,0.267554,0.202212,0.202460,0.831901
98,0.852401,0.138729,0.260991,0.198272,0.198549,0.828937
99,0.863036,0.138462,0.263013,0.202544,0.202898,0.829276


In [18]:
print(f'Recall:    mean = {stats_df["recall"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["recall"], (2.5, 97.5)), 4)}')
print(f'Precision: mean = {stats_df["prec"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["prec"], (2.5, 97.5)), 4)}')
print(f'MCC:       mean = {stats_df["MCC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["MCC"], (2.5, 97.5)), 4)}')
print(f'PR_AUC:    mean = {stats_df["PR_AUC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["PR_AUC"], (2.5, 97.5)), 4)}')
print(f'ROC_AUC:   mean = {stats_df["roc_auc"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["roc_auc"], (2.5, 97.5)), 4)}')

Recall:    mean = 0.8990; 95% CI = [0.8924 0.9069]
Precision: mean = 0.1299; 95% CI = [0.1287 0.1306]
MCC:       mean = 0.2560; 95% CI = [0.2526 0.2586]
PR_AUC:    mean = 0.2264; 95% CI = [0.2137 0.2344]
ROC_AUC:   mean = 0.8396; 95% CI = [0.8359 0.8459]


In [12]:
print(np.percentile(stats_df['recall'], (2.5, 97.5)))
print(np.percentile(stats_df['prec'], (2.5, 97.5)))
print(np.percentile(stats_df['MCC'], (2.5, 97.5)))
print(np.percentile(stats_df['PR_AUC'], (2.5, 97.5)))
print(np.percentile(stats_df['avg_prec'], (2.5, 97.5)))
print(np.percentile(stats_df['roc_auc'], (2.5, 97.5)))

[0.84418305 0.86948115]
[0.13580784 0.13981806]
[0.25541192 0.26558688]
[0.1954854  0.21448585]
[0.19575671 0.21475082]
[0.82532605 0.83463933]


In [31]:
print(f'PR_AUC CI: = {np.percentile(stats_df["PR_AUC"], (2.5, 97.5))}')
print(f'ROC CI: = {np.percentile(stats_df["ROC"], (2.5, 97.5))}')
print(f'MCC CI: = {np.percentile(stats_df["MCC"], (2.5, 97.5))}')

PR_AUC CI: = [0.19474928 0.21440643]
ROC CI: = [0.76511869 0.7751545 ]
MCC CI: = [0.25586553 0.26541317]


# Trying RF w/ Freq & max_features=None (Union50, no undersampling)

In [38]:
df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
X = X[corr_vars]
y = df['trans_loss'].values

In [39]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features=None,
                                     class_weight="balanced")
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=10, under=False)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(class_weight='balanced', max_depth=40, max_features=None,
                       max_leaf_nodes=100, min_samples_leaf=16,
                       n_estimators=128)
CPU times: user 15min 33s, sys: 216 ms, total: 15min 33s
Wall time: 15min 33s


Unnamed: 0,rec,ROC,MCC,PR_AUC
0,0.895585,0.781421,0.268452,0.215758
1,0.888817,0.780569,0.268469,0.212218
2,0.898808,0.78447,0.271768,0.213937
3,0.892362,0.781847,0.269505,0.202572
4,0.890751,0.779613,0.266951,0.217452
5,0.887528,0.778963,0.266649,0.215846
6,0.890106,0.779386,0.266768,0.210024
7,0.89204,0.782352,0.270199,0.21715
8,0.88785,0.780743,0.268849,0.212723
9,0.895907,0.783562,0.271095,0.216249


In [None]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features=None,
                                     class_weight="balanced")
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=2, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(class_weight='balanced', max_depth=40, max_features=None,
                       max_leaf_nodes=100, min_samples_leaf=16,
                       n_estimators=128)


# Trying RF w/ Freq & max_features=None (Union50 features, under=0.7)

In [32]:
df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
X = X[corr_vars]
y = df['trans_loss'].values

In [37]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features=None,
                                     class_weight=None)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=5, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(max_depth=40, max_features=None, max_leaf_nodes=100,
                       min_samples_leaf=16, n_estimators=128)
CPU times: user 1min 46s, sys: 540 ms, total: 1min 47s
Wall time: 1min 47s


Unnamed: 0,rec,ROC,MCC,PR_AUC
0,0.860458,0.772492,0.262959,0.206475
1,0.844989,0.768508,0.260566,0.204989
2,0.858202,0.771745,0.26239,0.212229
3,0.841766,0.769715,0.262804,0.197719
4,0.836932,0.765403,0.257966,0.202361
mean,0.848469,0.769573,0.261337,0.204755


# Trying RF w/ Freq & max_features=None (ALL features, under=0.7)

In [34]:
df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
y = df['trans_loss'].values

In [35]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features=None,
                                     class_weight=None)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=2, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(max_depth=40, max_features=None, max_leaf_nodes=100,
                       min_samples_leaf=16, n_estimators=128)
CPU times: user 41.1 s, sys: 231 ms, total: 41.3 s
Wall time: 41.3 s


Unnamed: 0,rec,ROC,MCC,PR_AUC
0,0.863036,0.773019,0.263186,0.209435
1,0.845633,0.76884,0.260885,0.206431
mean,0.854335,0.77093,0.262035,0.207933


# Trying RF w/ Freq & max_features=auto (ALL features, under=0.7)

In [36]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features="auto",
                                     class_weight=None)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=2, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(max_depth=40, max_leaf_nodes=100, min_samples_leaf=16,
                       n_estimators=128)
CPU times: user 8.73 s, sys: 208 ms, total: 8.94 s
Wall time: 8.94 s


Unnamed: 0,rec,ROC,MCC,PR_AUC
0,0.850467,0.768953,0.260138,0.199961
1,0.826297,0.762552,0.256202,0.199826
mean,0.838382,0.765752,0.25817,0.199894
