In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, recall_score, \
                            classification_report, roc_auc_score, precision_score, \
                            f1_score, matthews_corrcoef, average_precision_score, \
                            precision_recall_curve, auc, roc_curve
from collections import Counter
%config Completer.use_jedi = False

In [2]:
def calc_stats(y_test, y_pred, X_test, clf):
    """Collect classification metrics for one fitted classifier on one test set.

    Parameters
    ----------
    y_test : true labels of the test set.
    y_pred : hard label predictions for the test set.
    X_test : test features; only used to obtain ``clf.predict_proba(X_test)``.
    clf : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    pd.Series (float) indexed by 'recall', 'prec', 'MCC', 'PR_AUC',
    'avg_prec' and 'roc_auc', in that order.
    """
    # Column 1 of predict_proba is used as the score of the positive class
    # (assumes a binary problem whose positive label is 1 -- TODO confirm).
    pos_scores = clf.predict_proba(X_test)[:, 1]

    # Threshold-dependent metrics, computed from the hard predictions.
    metrics = {
        'recall': recall_score(y_test, y_pred),
        'prec': precision_score(y_test, y_pred),
        'MCC': matthews_corrcoef(y_test, y_pred),
    }

    # Threshold-free metrics, computed from the positive-class scores.
    pr_precision, pr_recall, _thresholds = precision_recall_curve(y_test, pos_scores, pos_label=1)
    metrics['PR_AUC'] = auc(pr_recall, pr_precision)
    metrics['avg_prec'] = average_precision_score(y_test, pos_scores)
    metrics['roc_auc'] = roc_auc_score(y_test, pos_scores)

    return pd.Series(metrics, dtype='float')


In [3]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
def sample_data(X, y, samp_type, samp_strat, seed=0):
    """Randomly over- or under-sample ``(X, y)``.

    Parameters
    ----------
    X, y : features and labels, handed to the sampler's ``fit_resample``.
    samp_type : str
        ``'over'`` selects ``RandomOverSampler``, ``'under'`` selects
        ``RandomUnderSampler``.
    samp_strat : forwarded unchanged as the sampler's ``sampling_strategy``.
    seed : int, default 0
        Forwarded as the sampler's ``random_state``.

    Returns
    -------
    (X_res, y_res) : the resampled features and labels returned by
    ``fit_resample``.

    Raises
    ------
    ValueError
        If ``samp_type`` is neither ``'over'`` nor ``'under'``.
    """
    # Fail fast on a bad mode: previously this only printed a message and then
    # crashed on the unbound ``sampler`` variable a few lines later.
    if samp_type not in ('over', 'under'):
        raise ValueError(
            f"Invalid 'samp_type': {samp_type!r} (expected 'over' or 'under')"
        )

    sampler_cls = RandomOverSampler if samp_type == 'over' else RandomUnderSampler
    sampler = sampler_cls(sampling_strategy=samp_strat, random_state=seed)

    # Fit the sampler and apply the resampling in one step.
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res

# Bootstrapped random statistics runs

In [4]:
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
def bootstrap_stat(X, y, clf, nsamples=100, under=False, samp_strat=1.0):
    """Score ``clf`` over repeated stratified random train/test splits.

    For every ``seed`` in ``range(nsamples)`` the data is split 70/30
    (stratified on ``y``, ``random_state=seed``), ``clf`` is re-fitted on the
    training part and evaluated on the held-out part with ``calc_stats``.

    Parameters
    ----------
    X, y : features and labels, passed to ``train_test_split``.
    clf : estimator with ``fit``, ``predict`` and ``predict_proba``.
        It is re-fitted in place on every iteration, so on return it is left
        fitted on the last split.
    nsamples : int, default 100
        Number of random splits (seeds ``0 .. nsamples - 1``).
    under : bool, default False
        If True, the training split is randomly undersampled with
        ``sample_data(..., "under", ...)`` before fitting. If False, the
        training split is used as is (e.g. together with
        ``class_weight='balanced'`` on the estimator).
    samp_strat : default 1.0
        Forwarded to ``sample_data`` as ``samp_strat``; only used when
        ``under`` is True.

    Returns
    -------
    pd.DataFrame with one row per split (index ``0 .. nsamples - 1``) and one
    column per metric returned by ``calc_stats``; empty when ``nsamples`` is 0.
    """
    rows = []
    for seed in range(nsamples):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, stratify=y, random_state=seed
        )

        if under:
            # Undersample the training data only; the test split stays untouched.
            X_fit, y_fit = sample_data(X_train, y_train, "under", samp_strat=samp_strat, seed=seed)
        else:
            # No resampling. Fit on the training split only: fitting on the
            # full (X, y) would include the test rows and leak them into the
            # model, inflating every metric.
            X_fit, y_fit = X_train, y_train

        clf.fit(X_fit, y_fit)
        y_pred = clf.predict(X_test)
        rows.append(calc_stats(y_test, y_pred, X_test, clf))

    # Build the frame once instead of appending row by row:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    return pd.DataFrame(rows).reset_index(drop=True)

In [5]:
# Tab-separated file without a header row; the first column becomes the index.
corr_df = pd.read_csv(
    '../../data/csl/CramerTheil/Cramer_PI_Tl_coeff_Union50.csv',
    sep='\t',
    header=None,
    index_col=0,
)
# The index labels are kept as a plain list (usable for column selection below).
corr_vars = [var_name for var_name in corr_df.index.values]
len(corr_vars)

66

In [8]:
# Load the data set; the first CSV column is used as the row index.
df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)

# Target: the 'trans_loss' column as an array. Features: every other column.
y = df['trans_loss'].values
X = df.drop(columns='trans_loss')
# Optional feature selection (currently disabled):
#X = X[corr_vars]

In [12]:
%%time
clf = LogisticRegression(C=0.1,solver="liblinear",class_weight="balanced",random_state=7)
print(clf.get_params())
stats_df = bootstrap_stat(X, y, clf, nsamples=1000, under=True, samp_strat=0.5)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

{'C': 0.1, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 7, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
CPU times: user 1h 3min 3s, sys: 5min 8s, total: 1h 8min 12s
Wall time: 55min 11s


Unnamed: 0,recall,prec,MCC,PR_AUC,avg_prec,roc_auc
0,0.840799,0.130274,0.243443,0.177571,0.177815,0.813972
1,0.822430,0.128824,0.236686,0.179516,0.179790,0.813363
2,0.832743,0.128762,0.238883,0.184744,0.185011,0.815107
3,0.821463,0.128123,0.235196,0.174318,0.174669,0.812075
4,0.830164,0.129656,0.239927,0.182402,0.182660,0.813307
...,...,...,...,...,...,...
996,0.838543,0.128430,0.239580,0.176689,0.176959,0.813107
997,0.829520,0.127996,0.236763,0.173261,0.173530,0.810570
998,0.828553,0.127265,0.235205,0.180076,0.180424,0.814209
999,0.830809,0.130883,0.242282,0.178981,0.179245,0.815282


In [13]:
# Summarise every metric over the per-split rows only. stats_df also carries
# the 'mean' row appended earlier; that row is an aggregate, not a sample, so
# it must not enter the percentile-based confidence interval.
runs_df = stats_df.drop(index='mean', errors='ignore')

for label, col in (
    ('Recall', 'recall'),
    ('Precision', 'prec'),
    ('MCC', 'MCC'),
    ('PR_AUC', 'PR_AUC'),
    ('ROC_AUC', 'roc_auc'),
):
    tag = f'{label}:'
    # 95% interval = 2.5th and 97.5th percentiles of the per-split values.
    ci = np.around(np.percentile(runs_df[col], (2.5, 97.5)), 4)
    print(f'{tag:<10} mean = {runs_df[col].mean():.4f}; 95% CI = {ci}')

Recall:    mean = 0.8295; 95% CI = [0.8137 0.844 ]
Precision: mean = 0.1287; 95% CI = [0.1267 0.1307]
MCC:       mean = 0.2380; 95% CI = [0.2318 0.244 ]
PR_AUC:    mean = 0.1774; 95% CI = [0.1692 0.1855]
ROC_AUC:   mean = 0.8129; 95% CI = [0.8078 0.8178]
