In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, recall_score, \
                            classification_report, roc_auc_score, precision_score, \
                            f1_score, matthews_corrcoef, average_precision_score, \
                            precision_recall_curve, auc
from collections import Counter
%config Completer.use_jedi = False

In [2]:
df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
y = df['trans_loss'].values

In [3]:
print(X.shape)
print(y.shape)
print(Counter(y))

(185413, 192)
(185413,)
Counter({0: 175069, 1: 10344})


In [4]:
def calc_stats(y_test, y_pred, X_test, clf):
#    print(confusion_matrix(y_test, y_pred))
##    print(confusion_matrix(y_test, y_pred, normalize='all'))
    f = lambda x: (x*10000).astype(int)
    cm = confusion_matrix(y_test, y_pred, normalize='all').ravel()
    ntn, nfp, nfn, ntp = f(cm)
    stats_s = pd.Series(data = [ntn, nfp, nfn, ntp], index = ['ntn', 'nfp', 'nfn', 'ntp']) 
    stats_s = pd.Series(data = [ntp, nfn, nfp, ntn], index = ['ntp', 'nfn', 'nfp', 'ntn']) 
    acc = accuracy_score(y_test, y_pred)
    stats_s['acc'] = acc
    bacc = balanced_accuracy_score(y_test, y_pred)
    stats_s['bacc'] = bacc
    prec = precision_score(y_test, y_pred)
    stats_s['prec'] = prec
    rec = recall_score(y_test, y_pred)
    stats_s['rec'] = rec
    ROC = roc_auc_score(y_test, y_pred)
    stats_s['ROC'] = ROC
    MCC = matthews_corrcoef(y_test, y_pred)
    stats_s['MCC'] = MCC
    probs = clf.predict_proba(X_test)
    probs = probs[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, probs, pos_label=1)
    pr_auc = auc(recall, precision)
    stats_s['PR_AUC'] = pr_auc

    
#    print(f'Accuracy = {accuracy_score(y_test, y_pred)}')
#    print(f'Balanced Accuracy = {balanced_accuracy_score(y_test, y_pred)}')
#    print(f'Prec = {precision_score(y_test, y_pred)}')
#    print(f'Recall = {recall_score(y_test, y_pred)}')
#    print(f'ROC_AUC = {roc_auc_score(y_test, y_pred)}')
#    print(f'MCC = {matthews_corrcoef(y_test, y_pred)}')
#    print(f'PR_AUC = {auc(recall, precision)}')
#    print(f'\nClassification Report:\n {classification_report(y_test, y_pred)}')
    
    return stats_s


In [5]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
def sample_data(X, y, samp_type, samp_strat):
    if samp_type == 'over':
        sampler = RandomOverSampler(sampling_strategy=samp_strat)
    elif samp_type == 'under':
        sampler = RandomUnderSampler(sampling_strategy=samp_strat)
    else:
        print("Invalid 'samp_type'")
        
    # fit and apply the transform
    X_res, y_res = sampler.fit_resample(X, y)
    # summarize class distribution
    #print(Counter(y_res))
    #print(X_res.shape)
    
    return X_res, y_res

# Stratified K-fold Cross Validation
### Somewhat manual implementation to allow for undersampling, sample weights and stats

In [6]:
from sklearn.utils import class_weight
def kfold_cv(X, y, clf, splits=5, sample_weights=None, under=False):
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=7)
    stats_df = pd.DataFrame()
    for train_ix, test_ix in kfold.split(X, y):
        # select rows
        train_X, test_X = X.iloc[train_ix], X.iloc[test_ix]
        train_y, test_y = y[train_ix], y[test_ix]
        # summarize train and test composition
        train_0, train_1 = len(train_y[train_y==0]), len(train_y[train_y==1])
        test_0, test_1 = len(test_y[test_y==0]), len(test_y[test_y==1])
        print(' >Train: 0=%d, 1=%d, Test: 0=%d, 1=%d ' % (train_0, train_1, test_0, test_1))

        if under:
            # Undersample the training data
            print('Undersampling')
            X_res, y_res = sample_data(train_X, train_y, "under", 1.0)
        else:
            print('No Undersampling')
            X_res, y_res = X, y # Not subsampled - for use with class_weight='balanced'
            
        print(f'In kfold_cv: train_X.shape = {train_X.shape}')
        print(f'In kfold_cv: train_y.shape = {train_y.shape}')
        if sample_weights is not None:
            weights = class_weight.compute_sample_weight('balanced', y=y_res)
            print(f'np.unique(weights): {np.unique(weights)}')
            clf.fit(X_res, y_res, sample_weight=weights)
        else:
            clf.fit(X_res, y_res)
            
        y_pred = clf.predict(test_X)

        stats_s = calc_stats(test_y, y_pred, test_X, clf)
        if stats_df.empty:
            stats_df = pd.DataFrame(stats_s)
            stats_df = stats_df.T
        else:
            stats_df = stats_df.append(stats_s, ignore_index=True)

    return stats_df

In [7]:
%%time
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=80, min_samples_leaf=1,
                                     min_samples_split=4,max_leaf_nodes=100,max_features="auto",
                                     class_weight="balanced",random_state=7)
print(clf)
stats_df = kfold_cv(X, y, clf, under=False)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(class_weight='balanced', max_depth=80,
                       max_leaf_nodes=100, min_samples_split=4,
                       n_estimators=128, random_state=7)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8276, Test: 0=35014, 1=2068 
No Undersampling
In kfold_cv: train_X.shape = (148331, 192)
In kfold_cv: train_y.shape = (148331,)
 >Train: 0=140056, 1=8275, Test: 0=35013, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148331, 192)
In kfold_cv: train_y.shape = (148331,)
CPU times: user 1min 47s, sys: 1.3 s, total: 1min 48s
Wall time: 1min 48

Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,504.0,53.0,3427.0,6014.0,0.651889,0.770183,0.128207,0.903335,0.770183,0.253922,0.215213
1,502.0,55.0,3399.0,6042.0,0.654505,0.769976,0.128707,0.899952,0.769976,0.254074,0.211417
2,506.0,51.0,3412.0,6029.0,0.653615,0.772916,0.129181,0.907202,0.772916,0.25664,0.223803
3,501.0,55.0,3450.0,5991.0,0.649372,0.767239,0.126979,0.899903,0.767239,0.250866,0.219869
4,503.0,54.0,3425.0,6016.0,0.651987,0.76978,0.12814,0.902368,0.76978,0.25357,0.22655
mean,503.2,53.6,3422.6,6018.4,0.652274,0.770019,0.128243,0.902552,0.770019,0.253815,0.21937


In [8]:
%%time
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=80, min_samples_leaf=1,
                                     min_samples_split=4,max_leaf_nodes=100,max_features="auto",
                                     class_weight="balanced",random_state=7)
print(clf)
stats_df = kfold_cv(X, y, clf, under=False)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

RandomForestClassifier(class_weight='balanced', max_depth=80,
                       max_leaf_nodes=100, min_samples_split=4,
                       n_estimators=128, random_state=7)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8276, Test: 0=35014, 1=2068 
No Undersampling
In kfold_cv: train_X.shape = (148331, 192)
In kfold_cv: train_y.shape = (148331,)
 >Train: 0=140056, 1=8275, Test: 0=35013, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148331, 192)
In kfold_cv: train_y.shape = (148331,)
CPU times: user 1min 47s, sys: 1.21 s, total: 1min 48s
Wall time: 1min 4

Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,504.0,53.0,3427.0,6014.0,0.651889,0.770183,0.128207,0.903335,0.770183,0.253922,0.215213
1,502.0,55.0,3399.0,6042.0,0.654505,0.769976,0.128707,0.899952,0.769976,0.254074,0.211417
2,506.0,51.0,3412.0,6029.0,0.653615,0.772916,0.129181,0.907202,0.772916,0.25664,0.223803
3,501.0,55.0,3450.0,5991.0,0.649372,0.767239,0.126979,0.899903,0.767239,0.250866,0.219869
4,503.0,54.0,3425.0,6016.0,0.651987,0.76978,0.12814,0.902368,0.76978,0.25357,0.22655
mean,503.2,53.6,3422.6,6018.4,0.652274,0.770019,0.128243,0.902552,0.770019,0.253815,0.21937


In [9]:
%%time
stats_df = kfold_cv(X, y, clf, under=True)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8276, Test: 0=35014, 1=2068 
Undersampling
In kfold_cv: train_X.shape = (148331, 192)
In kfold_cv: train_y.shape = (148331,)
 >Train: 0=140056, 1=8275, Test: 0=35013, 1=2069 
Undersampling
In kfold_cv: train_X.shape = (148331, 192)
In kfold_cv: train_y.shape = (148331,)
CPU times: user 15.2 s, sys: 1e+03 ms, total: 16.2 s
Wall time: 16.2 s


Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,502.0,55.0,3470.0,5971.0,0.647359,0.76642,0.126442,0.900435,0.76642,0.249924,0.197902
1,498.0,59.0,3398.0,6044.0,0.654235,0.76665,0.127898,0.893185,0.76665,0.251,0.195873
2,509.0,48.0,3564.0,5877.0,0.638648,0.767492,0.124975,0.912518,0.767492,0.249907,0.200739
3,502.0,55.0,3496.0,5945.0,0.644787,0.765039,0.125573,0.900387,0.765039,0.248308,0.199402
4,504.0,53.0,3472.0,5970.0,0.647457,0.768291,0.126882,0.904302,0.768291,0.251646,0.206164
mean,503.0,54.0,3480.0,5961.4,0.646497,0.766778,0.126354,0.902165,0.766778,0.250157,0.200016


In [10]:
df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
y = df['trans_loss'].values

In [11]:
%%time
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=80, min_samples_leaf=1,
                                     min_samples_split=4,max_leaf_nodes=100,max_features="auto",
                                     class_weight="balanced",random_state=7)
print(clf)
stats_df = kfold_cv(X, y, clf, under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df

RandomForestClassifier(class_weight='balanced', max_depth=80,
                       max_leaf_nodes=100, min_samples_split=4,
                       n_estimators=128, random_state=7)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8275, Test: 0=35014, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148330, 192)
In kfold_cv: train_y.shape = (148330,)
 >Train: 0=140055, 1=8276, Test: 0=35014, 1=2068 
No Undersampling
In kfold_cv: train_X.shape = (148331, 192)
In kfold_cv: train_y.shape = (148331,)
 >Train: 0=140056, 1=8275, Test: 0=35013, 1=2069 
No Undersampling
In kfold_cv: train_X.shape = (148331, 192)
In kfold_cv: train_y.shape = (148331,)
CPU times: user 1min 43s, sys: 819 ms, total: 1min 44s
Wall time: 1min 4

Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,504.0,53.0,3428.0,6013.0,0.651754,0.770112,0.128163,0.903335,0.770112,0.25384,0.209082
1,503.0,54.0,3405.0,6036.0,0.653992,0.770615,0.128743,0.901885,0.770615,0.254589,0.208664
2,507.0,50.0,3413.0,6028.0,0.653588,0.773811,0.129376,0.909135,0.773811,0.257454,0.220956
3,502.0,55.0,3458.0,5984.0,0.648697,0.767565,0.126915,0.901354,0.767565,0.25108,0.21644
4,504.0,53.0,3428.0,6013.0,0.651745,0.770334,0.128214,0.903818,0.770334,0.254044,0.219508
mean,504.0,53.0,3426.4,6014.8,0.651955,0.770487,0.128282,0.903905,0.770487,0.254201,0.21493


In [12]:
df = pd.read_csv('../../data/csl/CSL_he_PI_Freq.csv', index_col=0)
X = df.drop('high_EBL', axis=1, inplace=False)
y = df['high_EBL'].values

In [13]:
%%time
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=80, min_samples_leaf=1,
                                     min_samples_split=4,max_leaf_nodes=100,max_features="auto",
                                     class_weight="balanced",random_state=7)
print(clf)
stats_df = kfold_cv(X, y, clf, under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df

RandomForestClassifier(class_weight='balanced', max_depth=80,
                       max_leaf_nodes=100, min_samples_split=4,
                       n_estimators=128, random_state=7)
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
 >Train: 0=104582, 1=4608, Test: 0=26145, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109190, 192)
In kfold_cv: train_y.shape = (109190,)
 >Train: 0=104582, 1=4608, Test: 0=26145, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109190, 192)
In kfold_cv: train_y.shape = (109190,)
 >Train: 0=104582, 1=4608, Test: 0=26145, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109190, 192)
In kfold_cv: train_y.shape = (109190,)
CPU times: user 1min 20s, sys: 598 ms, total: 1min 21s
Wall time: 1min 2

Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,380.0,41.0,2209.0,7368.0,0.774892,0.835188,0.146838,0.901042,0.835188,0.307667,0.239348
1,384.0,37.0,2238.0,7338.0,0.772364,0.838847,0.146607,0.911458,0.838847,0.309713,0.233432
2,378.0,43.0,2289.0,7288.0,0.766678,0.828825,0.141837,0.896701,0.828825,0.298946,0.236861
3,374.0,47.0,2275.0,7302.0,0.76774,0.825231,0.141416,0.888021,0.825231,0.296316,0.234779
4,378.0,43.0,2213.0,7364.0,0.774261,0.832784,0.145986,0.896701,0.832784,0.305364,0.253537
mean,378.8,42.2,2244.8,7332.0,0.771187,0.832175,0.144537,0.898785,0.832175,0.303601,0.239592


# Rerun RF w/ sample weights 

In [14]:
from sklearn.utils import class_weight
weights = class_weight.compute_sample_weight(class_weight="balanced", y=y)
print(np.unique(weights))

[ 0.52203064 11.84782986]


In [15]:
%%time
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=80, min_samples_leaf=1,
                                     min_samples_split=4,max_leaf_nodes=100,max_features="auto",
                                     random_state=7)
print(clf)
stats_df = kfold_cv(X, y, clf, sample_weights=weights, under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df

RandomForestClassifier(max_depth=80, max_leaf_nodes=100, min_samples_split=4,
                       n_estimators=128, random_state=7)
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique(weights): [ 0.52203064 11.84782986]
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique(weights): [ 0.52203064 11.84782986]
 >Train: 0=104582, 1=4608, Test: 0=26145, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109190, 192)
In kfold_cv: train_y.shape = (109190,)
np.unique(weights): [ 0.52203064 11.84782986]
 >Train: 0=104582, 1=4608, Test: 0=26145, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109190, 192)
In kfold_cv: train_y.shape = (109190,)
np.unique(weights): [ 0.52203064 11.84782986]
 >Train: 0=104582, 1=4608, Test: 0=26145, 1=1152 
No Undersampling
In kfold_cv: train

Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,380.0,41.0,2209.0,7368.0,0.774892,0.835188,0.146838,0.901042,0.835188,0.307667,0.239348
1,384.0,37.0,2238.0,7338.0,0.772364,0.838847,0.146607,0.911458,0.838847,0.309713,0.233432
2,378.0,43.0,2289.0,7288.0,0.766678,0.828825,0.141837,0.896701,0.828825,0.298946,0.236861
3,374.0,47.0,2275.0,7302.0,0.76774,0.825231,0.141416,0.888021,0.825231,0.296316,0.234779
4,378.0,43.0,2213.0,7364.0,0.774261,0.832784,0.145986,0.896701,0.832784,0.305364,0.253537
mean,378.8,42.2,2244.8,7332.0,0.771187,0.832175,0.144537,0.898785,0.832175,0.303601,0.239592


In [16]:
df = pd.read_csv('../../data/csl/CSL_he_PI_Freq.csv', index_col=0)
X = df.drop('high_EBL', axis=1, inplace=False)
y = df['high_EBL'].values

In [17]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=120, max_depth=9, min_samples_split=600, min_samples_leaf=60, 
                                 subsample=0.8, learning_rate=0.1, max_features=None, random_state=7)
print(clf)
stats_df = kfold_cv(X, y, clf, under=True)
stats_df.loc['mean'] = stats_df.mean()
stats_df

GradientBoostingClassifier(max_depth=9, min_samples_leaf=60,
                           min_samples_split=600, n_estimators=120,
                           random_state=7, subsample=0.8)
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
 >Train: 0=104582, 1=4608, Test: 0=26145, 1=1152 
Undersampling
In kfold_cv: train_X.shape = (109190, 192)
In kfold_cv: train_y.shape = (109190,)
 >Train: 0=104582, 1=4608, Test: 0=26145, 1=1152 
Undersampling
In kfold_cv: train_X.shape = (109190, 192)
In kfold_cv: train_y.shape = (109190,)
 >Train: 0=104582, 1=4608, Test: 0=26145, 1=1152 
Undersampling
In kfold_cv: train_X.shape = (109190, 192)
In kfold_cv: train_y.shape = (109190,)
CPU times: user 36.5 s, sys: 518 ms, total: 37 s
Wall time: 37 s


Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,368.0,53.0,1884.0,7693.0,0.80614,0.837808,0.163415,0.872396,0.837808,0.32513,0.246345
1,369.0,52.0,1918.0,7659.0,0.802879,0.837351,0.161409,0.875,0.837351,0.322937,0.230202
2,365.0,56.0,2025.0,7552.0,0.791882,0.827876,0.153057,0.867188,0.827876,0.30909,0.235722
3,369.0,52.0,1998.0,7579.0,0.794923,0.833612,0.156095,0.875868,0.833612,0.315548,0.243163
4,364.0,57.0,1980.0,7597.0,0.796241,0.828492,0.155469,0.863715,0.828492,0.311776,0.247869
mean,367.0,54.0,1961.0,7616.0,0.798413,0.833028,0.157889,0.870833,0.833028,0.316896,0.24066


In [18]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import class_weight
weights = class_weight.compute_sample_weight(class_weight="balanced", y=y)
print(np.unique(weights))

[ 0.52203064 11.84782986]
CPU times: user 19.8 ms, sys: 2.01 ms, total: 21.8 ms
Wall time: 18.4 ms


In [19]:
print(np.bincount(y))
print(np.unique(weights))

[130727   5760]
[ 0.52203064 11.84782986]


In [20]:
print(np.bincount(y))
print(np.unique(weights))

[130727   5760]
[ 0.52203064 11.84782986]


In [21]:
df = pd.read_csv('../../data/csl/CSL_he_PI_Freq.csv', index_col=0)
X = df.drop('high_EBL', axis=1, inplace=False)
y = df['high_EBL'].values

In [22]:
%%time
clf = GradientBoostingClassifier(n_estimators=120, max_depth=9, min_samples_split=600, min_samples_leaf=60, 
                                 subsample=0.8, learning_rate=0.1, max_features=None, random_state=7)
print(clf)
params = clf.get_params()
for item in params.items():
    print(item)
stats_df = kfold_cv(X, y, clf, sample_weights=weights, under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df

GradientBoostingClassifier(max_depth=9, min_samples_leaf=60,
                           min_samples_split=600, n_estimators=120,
                           random_state=7, subsample=0.8)
('ccp_alpha', 0.0)
('criterion', 'friedman_mse')
('init', None)
('learning_rate', 0.1)
('loss', 'deviance')
('max_depth', 9)
('max_features', None)
('max_leaf_nodes', None)
('min_impurity_decrease', 0.0)
('min_samples_leaf', 60)
('min_samples_split', 600)
('min_weight_fraction_leaf', 0.0)
('n_estimators', 120)
('n_iter_no_change', None)
('random_state', 7)
('subsample', 0.8)
('tol', 0.0001)
('validation_fraction', 0.1)
('verbose', 0)
('warm_start', False)
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique(weights): [ 0.52203064 11.84782986]
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique

Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,392.0,29.0,1774.0,7803.0,0.819657,0.872663,0.181234,0.930556,0.872663,0.363717,0.397316
1,394.0,27.0,1801.0,7776.0,0.817056,0.872965,0.179513,0.934028,0.872965,0.362275,0.38063
2,397.0,24.0,1827.0,7750.0,0.814778,0.875095,0.178524,0.940972,0.875095,0.362662,0.405329
3,392.0,29.0,1830.0,7747.0,0.814045,0.869734,0.176664,0.930556,0.869734,0.357563,0.403887
4,393.0,28.0,1774.0,7803.0,0.819724,0.873943,0.181619,0.93316,0.873943,0.364878,0.404202
mean,393.6,27.4,1801.2,7775.8,0.817052,0.87288,0.179511,0.933854,0.87288,0.362219,0.398273


In [23]:
%%time
clf = GradientBoostingClassifier(n_estimators=120, max_depth=9, min_samples_split=600, min_samples_leaf=60, 
                                 subsample=0.8, learning_rate=0.1, max_features=None, random_state=7)
print(clf)
params = clf.get_params()
for item in params.items():
    print(item)
stats_df = kfold_cv(X, y, clf, sample_weights=weights, under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df

GradientBoostingClassifier(max_depth=9, min_samples_leaf=60,
                           min_samples_split=600, n_estimators=120,
                           random_state=7, subsample=0.8)
('ccp_alpha', 0.0)
('criterion', 'friedman_mse')
('init', None)
('learning_rate', 0.1)
('loss', 'deviance')
('max_depth', 9)
('max_features', None)
('max_leaf_nodes', None)
('min_impurity_decrease', 0.0)
('min_samples_leaf', 60)
('min_samples_split', 600)
('min_weight_fraction_leaf', 0.0)
('n_estimators', 120)
('n_iter_no_change', None)
('random_state', 7)
('subsample', 0.8)
('tol', 0.0001)
('validation_fraction', 0.1)
('verbose', 0)
('warm_start', False)
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique(weights): [ 0.52203064 11.84782986]
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique

Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,392.0,29.0,1774.0,7803.0,0.819657,0.872663,0.181234,0.930556,0.872663,0.363717,0.397316
1,394.0,27.0,1801.0,7776.0,0.817056,0.872965,0.179513,0.934028,0.872965,0.362275,0.38063
2,397.0,24.0,1827.0,7750.0,0.814778,0.875095,0.178524,0.940972,0.875095,0.362662,0.405329
3,392.0,29.0,1830.0,7747.0,0.814045,0.869734,0.176664,0.930556,0.869734,0.357563,0.403887
4,393.0,28.0,1774.0,7803.0,0.819724,0.873943,0.181619,0.93316,0.873943,0.364878,0.404202
mean,393.6,27.4,1801.2,7775.8,0.817052,0.87288,0.179511,0.933854,0.87288,0.362219,0.398273


In [24]:
%%time
clf = GradientBoostingClassifier(n_estimators=120, max_depth=9, min_samples_split=600, min_samples_leaf=60, 
                                 subsample=0.8, learning_rate=0.1, max_features=None, random_state=7)
print(clf)
params = clf.get_params()
for item in params.items():
    print(item)
stats_df = kfold_cv(X, y, clf, sample_weights=weights, under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df

GradientBoostingClassifier(max_depth=9, min_samples_leaf=60,
                           min_samples_split=600, n_estimators=120,
                           random_state=7, subsample=0.8)
('ccp_alpha', 0.0)
('criterion', 'friedman_mse')
('init', None)
('learning_rate', 0.1)
('loss', 'deviance')
('max_depth', 9)
('max_features', None)
('max_leaf_nodes', None)
('min_impurity_decrease', 0.0)
('min_samples_leaf', 60)
('min_samples_split', 600)
('min_weight_fraction_leaf', 0.0)
('n_estimators', 120)
('n_iter_no_change', None)
('random_state', 7)
('subsample', 0.8)
('tol', 0.0001)
('validation_fraction', 0.1)
('verbose', 0)
('warm_start', False)
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique(weights): [ 0.52203064 11.84782986]
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique

Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,392.0,29.0,1774.0,7803.0,0.819657,0.872663,0.181234,0.930556,0.872663,0.363717,0.397316
1,394.0,27.0,1801.0,7776.0,0.817056,0.872965,0.179513,0.934028,0.872965,0.362275,0.38063
2,397.0,24.0,1827.0,7750.0,0.814778,0.875095,0.178524,0.940972,0.875095,0.362662,0.405329
3,392.0,29.0,1830.0,7747.0,0.814045,0.869734,0.176664,0.930556,0.869734,0.357563,0.403887
4,393.0,28.0,1774.0,7803.0,0.819724,0.873943,0.181619,0.93316,0.873943,0.364878,0.404202
mean,393.6,27.4,1801.2,7775.8,0.817052,0.87288,0.179511,0.933854,0.87288,0.362219,0.398273


In [25]:
%%time
clf = GradientBoostingClassifier(n_estimators=120, max_depth=9, min_samples_split=600, min_samples_leaf=60, 
                                 subsample=0.8, learning_rate=0.1, max_features=None, random_state=7)
print(clf)
params = clf.get_params()
params = clf.get_params()
for item in params.items():
    print(item)
stats_df = kfold_cv(X, y, clf, sample_weights=True, under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df

GradientBoostingClassifier(max_depth=9, min_samples_leaf=60,
                           min_samples_split=600, n_estimators=120,
                           random_state=7, subsample=0.8)
('ccp_alpha', 0.0)
('criterion', 'friedman_mse')
('init', None)
('learning_rate', 0.1)
('loss', 'deviance')
('max_depth', 9)
('max_features', None)
('max_leaf_nodes', None)
('min_impurity_decrease', 0.0)
('min_samples_leaf', 60)
('min_samples_split', 600)
('min_weight_fraction_leaf', 0.0)
('n_estimators', 120)
('n_iter_no_change', None)
('random_state', 7)
('subsample', 0.8)
('tol', 0.0001)
('validation_fraction', 0.1)
('verbose', 0)
('warm_start', False)
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique(weights): [ 0.52203064 11.84782986]
 >Train: 0=104581, 1=4608, Test: 0=26146, 1=1152 
No Undersampling
In kfold_cv: train_X.shape = (109189, 192)
In kfold_cv: train_y.shape = (109189,)
np.unique

Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,392.0,29.0,1774.0,7803.0,0.819657,0.872663,0.181234,0.930556,0.872663,0.363717,0.397316
1,394.0,27.0,1801.0,7776.0,0.817056,0.872965,0.179513,0.934028,0.872965,0.362275,0.38063
2,397.0,24.0,1827.0,7750.0,0.814778,0.875095,0.178524,0.940972,0.875095,0.362662,0.405329
3,392.0,29.0,1830.0,7747.0,0.814045,0.869734,0.176664,0.930556,0.869734,0.357563,0.403887
4,393.0,28.0,1774.0,7803.0,0.819724,0.873943,0.181619,0.93316,0.873943,0.364878,0.404202
mean,393.6,27.4,1801.2,7775.8,0.817052,0.87288,0.179511,0.933854,0.87288,0.362219,0.398273


In [26]:
df = pd.read_csv('../../data/csl/CSL_he_PI_Freq.csv', index_col=0)
X = df.drop('high_EBL', axis=1, inplace=False)
y = df['high_EBL'].values

In [27]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
clf = GradientBoostingClassifier(n_estimators=120, max_depth=9, min_samples_split=600, min_samples_leaf=60, 
                                 subsample=0.8, learning_rate=0.1, max_features=None, random_state=7)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
weights = class_weight.compute_sample_weight(class_weight="balanced", y=y_train)
print(np.unique(weights))

clf.fit(X_train, y_train, sample_weight=weights)
y_pred = clf.predict(X_test)
stats = calc_stats(y_test, y_pred, X_test, clf)
stats

[ 0.52203086 11.84771825]
CPU times: user 1min 53s, sys: 149 ms, total: 1min 53s
Wall time: 1min 53s


ntp        359.000000
nfn         62.000000
nfp       1807.000000
ntn       7770.000000
acc          0.812978
bacc         0.831282
prec         0.165803
rec          0.851273
ROC          0.831282
MCC          0.323336
PR_AUC       0.247899
dtype: float64

In [28]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
clf = GradientBoostingClassifier(n_estimators=120, max_depth=9, min_samples_split=600, min_samples_leaf=60, 
                                 subsample=0.8, learning_rate=0.1, max_features=None, random_state=7)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
weights = class_weight.compute_sample_weight(class_weight="balanced", y=y_train)
print(np.unique(weights))

clf.fit(X_train, y_train, sample_weight=weights)
y_pred = clf.predict(X_test)
stats = calc_stats(y_test, y_pred, X_test, clf)
stats

[ 0.52203086 11.84771825]
CPU times: user 1min 53s, sys: 147 ms, total: 1min 53s
Wall time: 1min 53s


ntp        359.000000
nfn         62.000000
nfp       1807.000000
ntn       7770.000000
acc          0.812978
bacc         0.831282
prec         0.165803
rec          0.851273
ROC          0.831282
MCC          0.323336
PR_AUC       0.247899
dtype: float64

In [29]:
df = pd.read_csv('../../data/csl/CSL_he_PI_Freq.csv', index_col=0)
X = df.drop('high_EBL', axis=1, inplace=False)
y = df['high_EBL'].values

In [30]:
%%time
clf = GradientBoostingClassifier(n_estimators=120, max_depth=9, min_samples_split=600, min_samples_leaf=60, 
                                 subsample=0.8, learning_rate=0.1, max_features=None, random_state=7)
stats_df = kfold_cv(X, y, clf, splits=2, sample_weights=True, under=False)
stats_df

 >Train: 0=65363, 1=2880, Test: 0=65364, 1=2880 
No Undersampling
In kfold_cv: train_X.shape = (68243, 192)
In kfold_cv: train_y.shape = (68243,)
np.unique(weights): [ 0.52203064 11.84782986]
 >Train: 0=65364, 1=2880, Test: 0=65363, 1=2880 
No Undersampling
In kfold_cv: train_X.shape = (68244, 192)
In kfold_cv: train_y.shape = (68244,)
np.unique(weights): [ 0.52203064 11.84782986]
CPU times: user 5min 18s, sys: 335 ms, total: 5min 18s
Wall time: 5min 18s


Unnamed: 0,ntp,nfn,nfp,ntn,acc,bacc,prec,rec,ROC,MCC,PR_AUC
0,394.0,27.0,1797.0,7780.0,0.817493,0.873857,0.180044,0.935417,0.873857,0.363334,0.390631
1,393.0,28.0,1805.0,7772.0,0.816611,0.871903,0.17894,0.932292,0.871903,0.361071,0.405177
