In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, recall_score, \
                            classification_report, roc_auc_score, precision_score, \
                            f1_score, matthews_corrcoef, average_precision_score, \
                            precision_recall_curve, auc, roc_curve
from collections import Counter
%config Completer.use_jedi = False

In [2]:
def calc_stats(y_test, y_pred, X_test, clf):
    probs = clf.predict_proba(X_test)
    prob1 = probs[:, 1]
    stats_s = pd.Series(dtype='float')
    stats_s['recall'] = recall_score(y_test, y_pred)
    stats_s['prec'] = precision_score(y_test, y_pred)
    stats_s['MCC'] = matthews_corrcoef(y_test, y_pred)
    precision, recall, _ = precision_recall_curve(y_test, prob1, pos_label=1)
    stats_s['PR_AUC'] = auc(recall, precision)
    stats_s['avg_prec'] = average_precision_score(y_test, prob1)
    stats_s['roc_auc'] = roc_auc_score(y_test, prob1)
    
    return stats_s


In [3]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
def sample_data(X, y, samp_type, samp_strat, seed=0):
    if samp_type == 'over':
        sampler = RandomOverSampler(sampling_strategy=samp_strat, random_state=seed)
    elif samp_type == 'under':
        sampler = RandomUnderSampler(sampling_strategy=samp_strat, random_state=seed)
    else:
        print("Invalid 'samp_type'")
        
    # fit and apply the transform
    X_res, y_res = sampler.fit_resample(X, y)
    # summarize class distribution
    #print(Counter(y_res))
    #print(X_res.shape)
    
    return X_res, y_res

# Bootstrapped  random statistics runs

In [4]:
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
def bootstrap_stat(X, y, clf, nsamples=100, under=False, samp_strat=1.0):
    stats_df = pd.DataFrame()
    for seed in range(nsamples):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=seed)
        #print(f'In bstrap(): y_train.shape = {y_train.shape}; X_train.shape = {X_train.shape}')
        #print(f'In bstrap(): np.bincount(y_train) = {np.bincount(y_train)}')

        if under:
            # Undersample the training data
            #print('Undersampling')
            X_res, y_res = sample_data(X_train, y_train, "under", samp_strat=samp_strat, seed=seed)
        else:
            #print('No Undersampling')
            X_res, y_res = X, y # Not subsampled - for use with class_weight='balanced'
            
#        print(f'In kfold_cv: train_y.shape = {train_y.shape}')
#        if sample_weights:
#            weights = class_weight.compute_sample_weight('balanced', y=y_res)
#            print(f'np.unique(weights): {np.unique(weights)}')
#            clf.fit(X_res, y_res, sample_weight=weights)
#        else:
#            clf.fit(X_res, y_res)
#            
        clf.fit(X_res, y_res)
        y_pred = clf.predict(X_test)

        stats_s = calc_stats(y_test, y_pred, X_test, clf)
        if stats_df.empty:
            stats_df = pd.DataFrame(stats_s)
            stats_df = stats_df.T
        else:
            stats_df = stats_df.append(stats_s, ignore_index=True)

    return stats_df

In [5]:
corr_df = pd.read_csv('../../data/csl/CramerTheil/Cramer_PI_Tl_coeff_Union50.csv', index_col=0, header=None, delimiter='\t')
corr_vars = list(corr_df.index.values)
len(corr_vars)

66

In [6]:
df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
X = X[corr_vars]
y = df['trans_loss'].values

In [7]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, min_samples_leaf=50,
                                     min_samples_split=900,max_leaf_nodes=None,max_features=12,subsample=0.9,
                                     learning_rate=0.1,random_state=7)
print(clf.get_params())
stats_df = bootstrap_stat(X, y, clf, nsamples=100, under=True)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 11, 'max_features': 12, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 50, 'min_samples_split': 900, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 70, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 7, 'subsample': 0.9, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
CPU times: user 3min 20s, sys: 5.2 s, total: 3min 25s
Wall time: 3min 25s


Unnamed: 0,recall,prec,MCC,PR_AUC,avg_prec,roc_auc
0,0.877538,0.135311,0.260889,0.209422,0.209679,0.835075
1,0.878827,0.134117,0.259066,0.211624,0.211989,0.832683
2,0.892684,0.136138,0.265895,0.218659,0.218952,0.838557
3,0.883661,0.135743,0.263081,0.197194,0.197513,0.830856
4,0.876893,0.134643,0.259554,0.205716,0.206028,0.831923
...,...,...,...,...,...,...
96,0.868514,0.134106,0.256657,0.208588,0.208958,0.830800
97,0.887850,0.136231,0.264925,0.207154,0.207383,0.834856
98,0.873348,0.134012,0.257609,0.211869,0.212237,0.832695
99,0.883983,0.135130,0.262070,0.210727,0.211007,0.833594


In [8]:
print(f'Recall:    mean = {stats_df["recall"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["recall"], (2.5, 97.5)), 4)}')
print(f'Precision: mean = {stats_df["prec"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["prec"], (2.5, 97.5)), 4)}')
print(f'MCC:       mean = {stats_df["MCC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["MCC"], (2.5, 97.5)), 4)}')
print(f'PR_AUC:    mean = {stats_df["PR_AUC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["PR_AUC"], (2.5, 97.5)), 4)}')
print(f'ROC_AUC:   mean = {stats_df["roc_auc"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["roc_auc"], (2.5, 97.5)), 4)}')

Recall:    mean = 0.8791; 95% CI = [0.865  0.8901]
Precision: mean = 0.1347; 95% CI = [0.1322 0.1367]
MCC:       mean = 0.2603; 95% CI = [0.2547 0.2649]
PR_AUC:    mean = 0.2101; 95% CI = [0.2017 0.2185]
ROC_AUC:   mean = 0.8331; 95% CI = [0.8289 0.838 ]


In [9]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, min_samples_leaf=50,
                                     min_samples_split=900,max_leaf_nodes=None,max_features=12,subsample=0.9,
                                     learning_rate=0.1,random_state=7)
print(clf.get_params())
stats_df = bootstrap_stat(X, y, clf, nsamples=1000, under=True)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 11, 'max_features': 12, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 50, 'min_samples_split': 900, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 70, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 7, 'subsample': 0.9, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
CPU times: user 32min 21s, sys: 33.1 s, total: 32min 54s
Wall time: 32min 53s


Unnamed: 0,recall,prec,MCC,PR_AUC,avg_prec,roc_auc
0,0.877538,0.135311,0.260889,0.209422,0.209679,0.835075
1,0.878827,0.134117,0.259066,0.211624,0.211989,0.832683
2,0.892684,0.136138,0.265895,0.218659,0.218952,0.838557
3,0.883661,0.135743,0.263081,0.197194,0.197513,0.830856
4,0.876893,0.134643,0.259554,0.205716,0.206028,0.831923
...,...,...,...,...,...,...
996,0.884950,0.134292,0.260803,0.215623,0.215885,0.834144
997,0.879149,0.131877,0.255111,0.205804,0.206022,0.830366
998,0.877216,0.133157,0.256974,0.215063,0.215297,0.831413
999,0.881727,0.135479,0.262163,0.219947,0.220211,0.835798


In [10]:
type(stats_df)
stats_df.loc['mean','roc_auc']

0.8327524415060947

In [11]:
print(f'Recall:    mean = {stats_df["recall"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["recall"], (2.5, 97.5)), 4)}')
print(f'Precision: mean = {stats_df["prec"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["prec"], (2.5, 97.5)), 4)}')
print(f'MCC:       mean = {stats_df["MCC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["MCC"], (2.5, 97.5)), 4)}')
print(f'PR_AUC:    mean = {stats_df["PR_AUC"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["PR_AUC"], (2.5, 97.5)), 4)}')
print(f'ROC_AUC:   mean = {stats_df["roc_auc"]["mean"]:.4f}; 95% CI = {np.around(np.percentile(stats_df["roc_auc"], (2.5, 97.5)), 4)}')

Recall:    mean = 0.8786; 95% CI = [0.8659 0.8911]
Precision: mean = 0.1346; 95% CI = [0.1323 0.1366]
MCC:       mean = 0.2599; 95% CI = [0.2545 0.2648]
PR_AUC:    mean = 0.2103; 95% CI = [0.2012 0.2197]
ROC_AUC:   mean = 0.8328; 95% CI = [0.828  0.8378]


In [12]:
print(np.percentile(stats_df['recall'], (2.5, 97.5)))
print(np.percentile(stats_df['prec'], (2.5, 97.5)))
print(np.percentile(stats_df['MCC'], (2.5, 97.5)))
print(np.percentile(stats_df['PR_AUC'], (2.5, 97.5)))
print(np.percentile(stats_df['avg_prec'], (2.5, 97.5)))
print(np.percentile(stats_df['roc_auc'], (2.5, 97.5)))

[0.86593619 0.89107316]
[0.13232569 0.13655021]
[0.25450128 0.2647815 ]
[0.20119317 0.21966511]
[0.2014788  0.21993051]
[0.8279541  0.83775522]


# RF w/out Freq; max_features=auto (Union50, samp_strat=0.7)
### This mimics best original performer

In [13]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features="auto",
                                     class_weight=None)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=100, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(max_depth=40, max_leaf_nodes=100, min_samples_leaf=16,
                       n_estimators=128)
CPU times: user 5min 29s, sys: 4.83 s, total: 5min 33s
Wall time: 5min 33s


Unnamed: 0,recall,prec,MCC,PR_AUC,avg_prec,roc_auc
0,0.862069,0.137894,0.261805,0.209968,0.210329,0.831599
1,0.849178,0.138335,0.259565,0.205813,0.206024,0.827801
2,0.864647,0.139522,0.265212,0.213718,0.214087,0.831747
3,0.862069,0.139570,0.264691,0.193705,0.193973,0.828223
4,0.857557,0.137783,0.260562,0.197350,0.197557,0.826541
...,...,...,...,...,...,...
96,0.847567,0.138268,0.259076,0.204451,0.204649,0.826667
97,0.870448,0.139883,0.267193,0.201113,0.201312,0.831498
98,0.851434,0.138499,0.260371,0.199620,0.199903,0.828942
99,0.862069,0.138236,0.262396,0.199917,0.200237,0.829283


In [14]:
print(np.percentile(stats_df['recall'], (2.5, 97.5)))
print(np.percentile(stats_df['prec'], (2.5, 97.5)))
print(np.percentile(stats_df['MCC'], (2.5, 97.5)))
print(np.percentile(stats_df['PR_AUC'], (2.5, 97.5)))
print(np.percentile(stats_df['avg_prec'], (2.5, 97.5)))
print(np.percentile(stats_df['roc_auc'], (2.5, 97.5)))

[0.84466645 0.87044795]
[0.13589273 0.13973803]
[0.25553646 0.26556014]
[0.19435715 0.21349837]
[0.19462125 0.21377546]
[0.82515394 0.8342248 ]


In [15]:
print(f'PR_AUC CI: = {np.percentile(stats_df["PR_AUC"], (2.5, 97.5))}')
print(f'ROC CI: = {np.percentile(stats_df["ROC"], (2.5, 97.5))}')
print(f'MCC CI: = {np.percentile(stats_df["MCC"], (2.5, 97.5))}')

PR_AUC CI: = [0.19435715 0.21349837]


KeyError: 'ROC'

# Trying RF w/ Freq & max_features=None (Union50, no undersampling)

In [None]:
df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
X = X[corr_vars]
y = df['trans_loss'].values

In [None]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features=None,
                                     class_weight="balanced")
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=10, under=False)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

In [None]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features=None,
                                     class_weight="balanced")
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=2, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

# Trying RF w/ Freq & max_features=None (Union50 features, under=0.7)

In [None]:
df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
X = X[corr_vars]
y = df['trans_loss'].values

In [None]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features=None,
                                     class_weight=None)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=5, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

# Trying RF w/ Freq & max_features=None (ALL features, under=0.7)

In [None]:
df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
y = df['trans_loss'].values

In [None]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features=None,
                                     class_weight=None)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=2, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

# Trying RF w/ Freq & max_features=auto (ALL features, under=0.7)

In [None]:
%%time
## Random Undersampling
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features="auto",
                                     class_weight=None)
print(clf)
stats_df = bootstrap_stat(X, y, clf, nsamples=2, under=True, samp_strat=0.7)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df