In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, recall_score, \
                            classification_report, roc_auc_score, precision_score, \
                            f1_score, matthews_corrcoef, average_precision_score, \
                            precision_recall_curve, auc, roc_curve
from collections import Counter
%config Completer.use_jedi = False

In [2]:
def calc_stats(y_test, y_pred, X_test, clf):
    """Collect classification metrics for one fitted classifier on one test set.

    Parameters
    ----------
    y_test : true labels of the test rows.
    y_pred : hard predictions for the same rows.
    X_test : test features, passed to ``clf.predict_proba``.
    clf : fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the score for label 1.

    Returns
    -------
    pandas.Series (float) with entries, in order: 'recall', 'prec', 'MCC',
    'PR_AUC', 'avg_prec', 'roc_auc'.
    """
    # Score of label 1 for every test row.
    pos_scores = clf.predict_proba(X_test)[:, 1]

    # Metrics that only need the hard predictions.
    metrics = {
        'recall': recall_score(y_test, y_pred),
        'prec': precision_score(y_test, y_pred),
        'MCC': matthews_corrcoef(y_test, y_pred),
    }

    # Metrics derived from the scores; PR_AUC integrates precision over recall.
    curve_precision, curve_recall, _ = precision_recall_curve(y_test, pos_scores, pos_label=1)
    metrics.update(
        PR_AUC=auc(curve_recall, curve_precision),
        avg_prec=average_precision_score(y_test, pos_scores),
        roc_auc=roc_auc_score(y_test, pos_scores),
    )

    return pd.Series(metrics, dtype='float')


In [3]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
def sample_data(X, y, samp_type, samp_strat, seed=0):
    """Randomly over- or under-sample ``(X, y)``.

    Parameters
    ----------
    X, y : features and labels passed to the sampler's ``fit_resample``.
    samp_type : ``'over'`` for ``RandomOverSampler`` or ``'under'`` for
        ``RandomUnderSampler``.
    samp_strat : forwarded as the sampler's ``sampling_strategy``.
    seed : forwarded as the sampler's ``random_state``.

    Returns
    -------
    (X_res, y_res) : the resampled features and labels.

    Raises
    ------
    ValueError
        If ``samp_type`` is neither ``'over'`` nor ``'under'``.
    """
    if samp_type == 'over':
        sampler = RandomOverSampler(sampling_strategy=samp_strat, random_state=seed)
    elif samp_type == 'under':
        sampler = RandomUnderSampler(sampling_strategy=samp_strat, random_state=seed)
    else:
        # Fail here with a clear message; previously this branch only printed
        # and the function then crashed on the undefined ``sampler``.
        raise ValueError(
            f"Invalid 'samp_type': {samp_type!r} (expected 'over' or 'under')"
        )

    # Fit the sampler and apply the resampling in one step.
    X_res, y_res = sampler.fit_resample(X, y)

    return X_res, y_res

# Bootstrapped random statistics runs

In [4]:
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
def bootstrap_stat(X, y, clf, nsamples=100, under=False, samp_strat=1.0):
    """Refit ``clf`` on repeated stratified train/test splits and collect results.

    For each ``seed`` in ``range(nsamples)`` the data is split 70/30 (stratified
    on ``y``, ``random_state=seed``), ``clf`` is fitted on the training part and
    evaluated on the test part with ``calc_stats``.

    Parameters
    ----------
    X : pandas.DataFrame of features (its column labels index the coefficient table).
    y : labels aligned with the rows of ``X``.
    clf : estimator exposing ``fit``, ``predict``, ``predict_proba`` and ``coef_``.
    nsamples : number of splits to run.
    under : if True, the training part is randomly undersampled with
        ``sample_data(..., "under", ...)`` before fitting.
    samp_strat : sampling strategy forwarded to ``sample_data`` when ``under`` is True.

    Returns
    -------
    stats_df : DataFrame with one row per split (index 0..nsamples-1) and one
        column per metric returned by ``calc_stats``.
    feat_imps_df : DataFrame indexed by feature name with one column per seed
        holding ``clf.coef_[0]`` of that fit.
    Both are empty DataFrames when ``nsamples`` is 0.
    """
    stat_rows = []
    coefs_by_seed = {}
    feature_names = None

    for seed in range(nsamples):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, stratify=y, random_state=seed
        )

        if under:
            # Undersample the training part only; the test part stays untouched.
            X_res, y_res = sample_data(X_train, y_train, "under", samp_strat=samp_strat, seed=seed)
        else:
            # No resampling (e.g. for use with class_weight='balanced').
            # Fit on the training split only: fitting on the full (X, y) would
            # include the held-out rows and inflate every test metric.
            X_res, y_res = X_train, y_train

        clf.fit(X_res, y_res)
        y_pred = clf.predict(X_test)

        # Collect plain records and build the frames once after the loop;
        # DataFrame.append no longer exists in pandas >= 2.0.
        stat_rows.append(calc_stats(y_test, y_pred, X_test, clf).to_dict())
        feature_names = X_test.columns.values
        # Copy so a later refit cannot alter the stored coefficients.
        coefs_by_seed[seed] = np.array(clf.coef_[0])

    stats_df = pd.DataFrame(stat_rows)
    if coefs_by_seed:
        feat_imps_df = pd.DataFrame(coefs_by_seed, index=feature_names)
    else:
        feat_imps_df = pd.DataFrame()

    return stats_df, feat_imps_df

In [5]:
# Read the tab-separated, header-less table; its first column becomes the index.
corr_df = pd.read_csv(
    '../../data/csl/CramerTheil/Cramer_PI_Tl_coeff_Union50.csv',
    delimiter='\t',
    header=None,
    index_col=0,
)
# The index labels are the variable names used to select columns later on.
corr_vars = list(corr_df.index.values)
len(corr_vars)

66

In [6]:
# Read the modelling table; the first CSV column becomes the index.
df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)
# Features: every column except the target, restricted to the variables in corr_vars.
X = df.drop(columns='trans_loss')[corr_vars]
# Target column as its underlying values array.
y = df['trans_loss'].values

In [7]:
%%time
clf = LogisticRegression(C=0.1,solver="liblinear",class_weight="balanced",random_state=7)
print(clf.get_params())
#stats_df, feats_df = bootstrap_stat(X, y, clf, nsamples=1000, under=True, samp_strat=0.5)
stats_df, feats_df = bootstrap_stat(X, y, clf, nsamples=50, under=True, samp_strat=0.5)
stats_df.loc['mean'] = stats_df.mean()
#stats_df.loc['mean',:]
stats_df

{'C': 0.1, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 7, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
CPU times: user 1min 53s, sys: 5.31 s, total: 1min 58s
Wall time: 1min 35s


Unnamed: 0,recall,prec,MCC,PR_AUC,avg_prec,roc_auc
0,0.845311,0.130083,0.244119,0.169836,0.170072,0.81155
1,0.819207,0.127617,0.23377,0.171363,0.171612,0.810652
2,0.830487,0.128273,0.237485,0.178551,0.178818,0.813188
3,0.817274,0.127277,0.23272,0.168996,0.169278,0.808985
4,0.826297,0.12788,0.235831,0.176214,0.176485,0.811198
5,0.815662,0.125996,0.230006,0.168512,0.168806,0.805689
6,0.825653,0.128537,0.236885,0.172054,0.17236,0.809526
7,0.831131,0.129021,0.238993,0.173292,0.173609,0.81167
8,0.824364,0.128536,0.236596,0.165767,0.166067,0.809967
9,0.842411,0.127805,0.239301,0.176418,0.176761,0.814476


In [8]:
# Summarise each metric over the individual runs only. stats_df may carry an
# extra row labelled 'mean'; leaving it in would count the average as one more
# observation and distort the percentile interval.
run_stats = stats_df.drop(index='mean', errors='ignore')

for label, col in [
    ('Recall', 'recall'),
    ('Precision', 'prec'),
    ('MCC', 'MCC'),
    ('PR_AUC', 'PR_AUC'),
    ('ROC_AUC', 'roc_auc'),
]:
    tag = f'{label}:'
    # 2.5th and 97.5th percentiles of the per-run values, rounded to 4 decimals.
    ci = np.around(np.percentile(run_stats[col], (2.5, 97.5)), 4)
    print(f'{tag:<11}mean = {run_stats[col].mean():.4f}; 95% CI = {ci}')

Recall:    mean = 0.8298; 95% CI = [0.8154 0.8449]
Precision: mean = 0.1281; 95% CI = [0.1264 0.1299]
MCC:       mean = 0.2370; 95% CI = [0.2312 0.2427]
PR_AUC:    mean = 0.1700; 95% CI = [0.1624 0.1785]
ROC_AUC:   mean = 0.8103; 95% CI = [0.8044 0.815 ]


In [9]:
# Average each feature's coefficient over the per-seed columns only. Dropping
# any existing 'mean'/'abs_mean' columns first keeps this cell idempotent:
# re-running it no longer folds the previous summary columns into the average.
coef_runs = feats_df.drop(columns=['mean', 'abs_mean'], errors='ignore')
feats_df['mean'] = coef_runs.mean(axis=1)
feats_df['abs_mean'] = feats_df['mean'].abs()

# Rank features by the magnitude of their mean coefficient, largest first.
feats_df = feats_df.sort_values(by='abs_mean', ascending=False)

# Show every row of the table instead of a truncated view.
pd.options.display.max_rows = None
feats_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,mean,abs_mean
Hxanemia,0.891687,0.852615,0.813895,0.795446,0.785724,0.805464,0.876435,0.924341,0.923375,0.895991,...,0.794785,0.816725,0.843772,0.8293191,0.861927,0.747978,0.804375,0.80841,0.829411,0.829411
intra_previa,0.799887,0.792721,0.765277,0.954101,0.731605,0.937639,0.831099,0.947194,0.735085,0.828669,...,0.831342,0.824585,0.994973,0.7691001,0.97883,0.873479,0.793858,0.876267,0.815934,0.815934
intra_abruptio,0.790865,0.786087,0.777423,0.902611,0.828908,0.839274,0.829202,0.912016,0.714708,0.852594,...,0.821102,0.815576,0.695864,0.7061599,0.789934,0.759022,0.64766,0.917383,0.779163,0.779163
Delmode,0.725635,0.788386,0.770523,0.751956,0.767476,0.742949,0.800318,0.787123,0.801241,0.803371,...,0.778087,0.794487,0.78567,0.7749426,0.776297,0.76692,0.760061,0.77078,0.774275,0.774275
HosEpiNurse,-0.785926,-0.742542,-0.704588,-0.732075,-0.651784,-0.81185,-0.797166,-0.755822,-0.674217,-0.669263,...,-0.733851,-0.757875,-0.624166,-0.7324746,-0.76461,-0.668679,-0.680832,-0.617981,-0.722546,0.722546
Hxcsection,-0.578998,-0.688554,-0.470372,-0.654861,-0.565572,-0.588801,-0.639336,-0.696105,-0.564039,-0.71424,...,-0.512622,-0.640299,-0.518824,-0.5705929,-0.672732,-0.520972,-0.500062,-0.639389,-0.582876,0.582876
Lac_None,-0.537602,-0.54572,-0.56626,-0.571891,-0.52483,-0.641567,-0.495953,-0.55916,-0.467617,-0.607341,...,-0.640439,-0.57244,-0.527285,-0.5499808,-0.501647,-0.518549,-0.569531,-0.536147,-0.549744,0.549744
Intrafever,0.527122,0.63495,0.476122,0.56155,0.56045,0.544263,0.491835,0.549758,0.496909,0.605249,...,0.436866,0.480851,0.578139,0.5728778,0.54983,0.532588,0.457773,0.501943,0.515876,0.515876
prelaborCD,0.503355,0.404607,0.456433,0.4843,0.514133,0.547025,0.44518,0.430116,0.609995,0.42443,...,0.476525,0.352962,0.471958,0.5259253,0.565541,0.471005,0.430464,0.422978,0.47832,0.47832
Anteprevia,0.402769,0.355391,0.608491,0.286347,0.530363,0.318539,0.454313,0.311607,0.33175,0.459022,...,0.327168,0.325969,0.286144,0.377368,0.314394,0.411533,0.304649,0.290942,0.38498,0.38498


In [23]:
# First 20 rows of the two summary columns (feats_df is already sorted by abs_mean).
feats_df[['mean', 'abs_mean']].head(20)

Unnamed: 0,mean,abs_mean
Hxanemia,0.829411,0.829411
intra_previa,0.815934,0.815934
intra_abruptio,0.779163,0.779163
Delmode,0.774275,0.774275
HosEpiNurse,-0.722546,0.722546
Hxcsection,-0.582876,0.582876
Lac_None,-0.549744,0.549744
Intrafever,0.515876,0.515876
prelaborCD,0.47832,0.47832
Anteprevia,0.38498,0.38498
