In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, recall_score, \
                            classification_report, roc_auc_score, precision_score, \
                            f1_score, matthews_corrcoef, average_precision_score, \
                            precision_recall_curve, auc, roc_curve
from collections import Counter
%config Completer.use_jedi = False

In [2]:
def calc_stats(y_test, y_pred, X_test, clf):
    """Gather label-based and score-based evaluation metrics for a fitted classifier.

    Parameters
    ----------
    y_test : true labels of the held-out samples.
    y_pred : predicted labels for the same samples.
    X_test : held-out features; passed to ``clf.predict_proba``.
    clf    : fitted estimator exposing ``predict_proba``.

    Returns
    -------
    pandas.Series (float) indexed by 'recall', 'prec', 'MCC', 'PR_AUC',
    'avg_prec' and 'roc_auc'.

    Notes
    -----
    Column 1 of ``predict_proba`` is used as the positive-class score
    (assumes a binary problem whose positive label is 1 -- confirm for new data).
    """
    pos_scores = clf.predict_proba(X_test)[:, 1]

    # Metrics computed from hard label predictions.
    metrics = {
        'recall': recall_score(y_test, y_pred),
        'prec': precision_score(y_test, y_pred),
        'MCC': matthews_corrcoef(y_test, y_pred),
    }

    # Metrics computed from the positive-class scores.
    curve_prec, curve_rec, _thresholds = precision_recall_curve(y_test, pos_scores, pos_label=1)
    metrics.update(
        PR_AUC=auc(curve_rec, curve_prec),
        avg_prec=average_precision_score(y_test, pos_scores),
        roc_auc=roc_auc_score(y_test, pos_scores),
    )

    return pd.Series(metrics, dtype='float')

In [3]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
def sample_data(X, y, samp_type, samp_strat, seed=0):
    """Randomly over- or under-sample a data set to rebalance its classes.

    Parameters
    ----------
    X, y       : features and labels handed to the sampler's ``fit_resample``.
    samp_type  : ``'over'`` to use ``RandomOverSampler`` or ``'under'`` to use
                 ``RandomUnderSampler``.
    samp_strat : forwarded as the sampler's ``sampling_strategy``.
    seed       : forwarded as the sampler's ``random_state`` (default 0).

    Returns
    -------
    (X_res, y_res) : the resampled features and labels.

    Raises
    ------
    ValueError : if ``samp_type`` is neither ``'over'`` nor ``'under'``.
    """
    # Validate up front: previously an unknown value only printed a message and
    # then failed further down on the unassigned `sampler` variable.
    if samp_type not in ('over', 'under'):
        raise ValueError(
            f"Invalid 'samp_type': {samp_type!r} (expected 'over' or 'under')"
        )

    if samp_type == 'over':
        sampler = RandomOverSampler(sampling_strategy=samp_strat, random_state=seed)
    else:
        sampler = RandomUnderSampler(sampling_strategy=samp_strat, random_state=seed)

    # Fit the sampler and apply the resampling in one step.
    X_res, y_res = sampler.fit_resample(X, y)

    return X_res, y_res

# Bootstrapped random statistics runs

In [4]:
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
# NOTE: everything below is a disabled, in-notebook draft of `bootstrap_stat`.
# A later cell imports `bootstrap_stat` from the `mwb_bootstrap` module instead.
# This draft returns two values (stats_df, feat_imps_df), while one later call
# site unpacks three, so the draft is presumably not a drop-in replacement for
# the imported version -- verify before re-enabling it.
# It also relies on `DataFrame.append`, which was removed in pandas 2.0, and on
# `clf.feature_importances_`, so it only suits estimators exposing that attribute.
# def bootstrap_stat(X, y, clf, nsamples=100, test_size=0.3, sample_weights=False, under=False, samp_strat=1.0):
#     stats_df = pd.DataFrame()
#     feat_imps_df = pd.DataFrame()
#     for seed in range(nsamples):
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=seed)
#         #print(f'In bstrap(): y_train.shape = {y_train.shape}; X_train.shape = {X_train.shape}')
#         #print(f'In bstrap(): np.bincount(y_train) = {np.bincount(y_train)}')

#         if under:
#             # Undersample the training data
#             #print('Undersampling')
#             X_res, y_res = sample_data(X_train, y_train, "under", samp_strat=samp_strat, seed=seed)
#         else:
#             #print('No Undersampling')
#             X_res, y_res = X_train, y_train # Not subsampled; use with class_weight='balanced' or sample_weights
            
#         if sample_weights:
#             weights = class_weight.compute_sample_weight('balanced', y=y_res)
#             #print(f'np.unique(weights): {np.unique(weights)}')
#             clf.fit(X_res, y_res, sample_weight=weights)
#         else:
#             clf.fit(X_res, y_res)
            
#         y_pred = clf.predict(X_test)

#         stats_s = calc_stats(y_test, y_pred, X_test, clf)
#         if stats_df.empty:
#             stats_df = pd.DataFrame(stats_s)
#             stats_df = stats_df.T
#         else:
#             stats_df = stats_df.append(stats_s, ignore_index=True)
            
#         if feat_imps_df.empty:
#             feat_imps_df = pd.DataFrame(data=clf.feature_importances_, index=X_test.columns.values, columns=[seed])
#         else:
#             temp_df = pd.DataFrame(data=clf.feature_importances_, index=X_test.columns.values, columns=[seed])
#             feat_imps_df = feat_imps_df.merge(temp_df, left_index=True, right_index=True, how="left")
        
#     return stats_df, feat_imps_df

In [4]:
# Variables to keep: the index of the coefficient table (tab-separated, no
# header row, variable names in the first column).
corr_df = pd.read_csv('../../data/csl/CramerTheil/Cramer_PI_Tl_coeff_Union50.csv', index_col=0, header=None, delimiter='\t')
#corr_df = pd.read_csv('../../data/csl/CramerTheil/Cramer_PI_noDelmode.csv', index_col=0, header=None, delimiter='\t')
corr_vars = list(corr_df.index.values)
# A bare `len(corr_vars)` in the middle of a cell is evaluated and thrown away;
# print it so the number of selected variables is actually shown.
print(f"{len(corr_vars)} variables selected from the coefficient table")

df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)
#df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)

# Features: only the selected variables (target column removed first).
X = df.drop(columns='trans_loss')[corr_vars]
# Target: the 'trans_loss' column as a plain array.
y = df['trans_loss'].values

In [6]:
%%time
from mwb_bootstrap import bootstrap_stat
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, min_samples_leaf=50,
                                     min_samples_split=900,max_leaf_nodes=None,max_features=12,subsample=0.9,
                                     learning_rate=0.1,random_state=7)
#clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, min_samples_leaf=50,
#                                     min_samples_split=900,max_leaf_nodes=None,max_features=None,subsample=0.9,
#                                     learning_rate=0.1,random_state=7)
print(clf.get_params())
#stats_df = bootstrap_stat(X, y, clf, nsamples=100, under=True)
stats_df, feats_df, _ = bootstrap_stat(X, y, clf, nsamples=25, under=True)
#stats_df, feats_df = bootstrap_stat(X, y, clf, sample_weights=True, nsamples=20, under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df.loc['mean',:]

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 11, 'max_features': 12, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 50, 'min_samples_split': 900, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 70, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 7, 'subsample': 0.9, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
CPU times: user 55.4 s, sys: 1.22 s, total: 56.6 s
Wall time: 56.7 s


recall      0.876532
prec        0.134697
MCC         0.259562
PR_AUC      0.209446
avg_prec    0.209740
roc_auc     0.832368
Name: mean, dtype: float64

In [8]:
# Average each feature's importance over the bootstrap runs. A 'mean' column
# left behind by an earlier execution of this cell is excluded, so re-running
# the cell never folds the previous average back into the new one.
run_columns = [col for col in feats_df.columns if col != 'mean']
feats_df['mean'] = feats_df[run_columns].mean(axis=1)

# Rank features by mean importance (highest first) and show the top 30.
feats_df = feats_df.sort_values(by='mean', ascending=False)
feats_df.head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,mean
Delmode,0.210295,0.24737,0.238563,0.240012,0.270936,0.221043,0.259313,0.248807,0.253288,0.226934,...,0.240074,0.221149,0.235556,0.238446,0.280697,0.241586,0.216838,0.247658,0.286112,0.238933
Inoxy_incrdose,0.065815,0.060375,0.09757,0.109158,0.083446,0.066472,0.075514,0.087013,0.044786,0.113235,...,0.06287,0.042603,0.098086,0.053054,0.10781,0.05329,0.051879,0.047302,0.094097,0.072889
Intratocolytix,0.060062,0.067275,0.076105,0.038851,0.067885,0.064964,0.06199,0.071126,0.080447,0.051242,...,0.069849,0.069356,0.071938,0.082581,0.063003,0.071966,0.053165,0.073419,0.031802,0.064847
DMControl,0.053555,0.032987,0.021429,0.024046,0.054583,0.041525,0.041666,0.029243,0.042917,0.024481,...,0.05583,0.039801,0.035989,0.026373,0.017163,0.04072,0.048403,0.026119,0.032194,0.03644
Education,0.033805,0.026341,0.023657,0.031921,0.019228,0.022004,0.029712,0.040462,0.056557,0.038946,...,0.020289,0.061116,0.0237,0.058001,0.019093,0.047316,0.045948,0.046173,0.032786,0.035939
GAmethod,0.036443,0.021462,0.03067,0.023587,0.04909,0.02651,0.011244,0.027227,0.048416,0.029601,...,0.038715,0.045823,0.036965,0.043439,0.028247,0.043026,0.042478,0.031054,0.026903,0.034168
Insurance,0.037202,0.02786,0.030513,0.032305,0.022781,0.031738,0.043211,0.019599,0.036822,0.016613,...,0.027123,0.033815,0.024887,0.02505,0.028749,0.02693,0.03217,0.040846,0.020097,0.028446
ROMmeth,0.031776,0.030824,0.0285,0.033941,0.017896,0.027663,0.025308,0.022317,0.028048,0.029647,...,0.01846,0.023505,0.018605,0.020397,0.016615,0.032602,0.028873,0.023537,0.018213,0.027148
TrialLabor,0.021259,0.032573,0.029964,0.028945,0.020362,0.032666,0.026718,0.030881,0.018018,0.023388,...,0.018278,0.029251,0.022489,0.029536,0.016263,0.030835,0.029456,0.030021,0.020464,0.026629
Hxanemia,0.022023,0.027212,0.018918,0.018849,0.020425,0.028447,0.022862,0.029468,0.024357,0.034217,...,0.023175,0.021922,0.020906,0.026984,0.024241,0.024931,0.027195,0.018724,0.020251,0.02415


In [10]:
# Variables to keep for the "Pre" data set: the index of the coefficient table
# (tab-separated, no header row, variable names in the first column).
corr_df = pd.read_csv('../../data/csl/CramerTheil/Cramer_Pre_Tl_coeff_Union50.csv', index_col=0, header=None, delimiter='\t')
#corr_df = pd.read_csv('../../data/csl/CramerTheil/Cramer_PI_noDelmode.csv', index_col=0, header=None, delimiter='\t')
corr_vars = list(corr_df.index.values)
# A bare `len(corr_vars)` in the middle of a cell is evaluated and thrown away;
# print it so the number of selected variables is actually shown.
print(f"{len(corr_vars)} variables selected from the coefficient table")

df = pd.read_csv('../../data/csl/CSL_tl_Pre.csv', index_col=0)
#df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)
#df = pd.read_csv('../../data/csl/CSL_tl_PI_Freq.csv', index_col=0)

# Features: only the selected variables (target column removed first).
# NOTE: this rebinds X, y, df and corr_df from the earlier data-load cell.
X = df.drop(columns='trans_loss')[corr_vars]
# Target: the 'trans_loss' column as a plain array.
y = df['trans_loss'].values

In [11]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, min_samples_leaf=50,
                                     min_samples_split=900,max_leaf_nodes=None,max_features=12,subsample=0.9,
                                     learning_rate=0.1,random_state=7)
#clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, min_samples_leaf=50,
#                                     min_samples_split=900,max_leaf_nodes=None,max_features=None,subsample=0.9,
#                                     learning_rate=0.1,random_state=7)
print(clf.get_params())
#stats_df = bootstrap_stat(X, y, clf, nsamples=100, under=True)
stats_df, feats_df = bootstrap_stat(X, y, clf, nsamples=25, under=True)
#stats_df, feats_df = bootstrap_stat(X, y, clf, sample_weights=True, nsamples=20, under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df.loc['mean',:]

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 11, 'max_features': 12, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 50, 'min_samples_split': 900, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 70, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 7, 'subsample': 0.9, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
CPU times: user 42.4 s, sys: 641 ms, total: 43 s
Wall time: 43 s


recall      0.793323
prec        0.119189
MCC         0.212299
PR_AUC      0.172064
avg_prec    0.172343
roc_auc     0.784541
Name: mean, dtype: float64

In [12]:
# Average each feature's importance over the bootstrap runs. A 'mean' column
# left behind by an earlier execution of this cell is excluded, so re-running
# the cell never folds the previous average back into the new one.
run_columns = [col for col in feats_df.columns if col != 'mean']
feats_df['mean'] = feats_df[run_columns].mean(axis=1)

# Rank features by mean importance (highest first) and show the top 30.
feats_df = feats_df.sort_values(by='mean', ascending=False)
feats_df.head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,mean
HxnumCS,0.144438,0.11513,0.140736,0.128202,0.116515,0.131698,0.118182,0.124227,0.136038,0.133076,...,0.117554,0.118867,0.148099,0.119524,0.131439,0.11844,0.129595,0.12929,0.116968,0.128191
DMControl,0.125042,0.107653,0.103942,0.130679,0.119324,0.126415,0.11572,0.107632,0.131351,0.134597,...,0.127274,0.127296,0.109678,0.121699,0.11715,0.107422,0.124563,0.115088,0.120292,0.120303
prelaborCD,0.067423,0.087077,0.0799,0.093225,0.088969,0.093897,0.07831,0.083945,0.083431,0.083099,...,0.073467,0.074255,0.113419,0.083552,0.077906,0.087991,0.078412,0.082201,0.090533,0.085979
Hxanemia,0.067046,0.068595,0.058807,0.062237,0.054331,0.069888,0.063974,0.063962,0.071209,0.077653,...,0.070108,0.071376,0.055276,0.068422,0.074181,0.061394,0.0677,0.068863,0.06243,0.067088
Insurance,0.060529,0.059656,0.070413,0.078728,0.066129,0.062573,0.077509,0.051483,0.05059,0.053468,...,0.062558,0.058432,0.050307,0.065298,0.066693,0.063441,0.068996,0.068237,0.068611,0.062976
Education,0.049262,0.053093,0.042419,0.049154,0.048616,0.054972,0.053804,0.045803,0.058301,0.058819,...,0.043925,0.038333,0.046698,0.044227,0.058135,0.059249,0.043288,0.049971,0.047959,0.049717
Antefetdistress,0.047632,0.044712,0.046799,0.044964,0.047622,0.044849,0.03867,0.044961,0.049322,0.053025,...,0.054418,0.0463,0.050658,0.040868,0.054324,0.047956,0.055392,0.049469,0.03085,0.046927
Hostype,0.053884,0.043313,0.046983,0.029952,0.043828,0.039886,0.048192,0.047755,0.03399,0.019001,...,0.052833,0.047666,0.034587,0.046601,0.041094,0.046966,0.031296,0.044349,0.051476,0.040027
HospElectCS,0.026834,0.039626,0.031235,0.035736,0.03264,0.038108,0.035463,0.044117,0.031204,0.046316,...,0.034431,0.04125,0.034367,0.044504,0.032631,0.039233,0.044128,0.02612,0.041345,0.036438
momrace_new,0.03735,0.0329,0.041038,0.028054,0.041135,0.029168,0.036072,0.037309,0.033497,0.038388,...,0.03524,0.040718,0.0341,0.033412,0.036743,0.034666,0.029047,0.040519,0.039586,0.035549
