In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config Completer.use_jedi = False
# Import from the public IPython.display module: pulling `display` out of
# IPython.core.display is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Inject a CSS rule that forces elements with class "container" to full width,
# so wide tables and figures are not clipped by the default page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Main data table; the first CSV column becomes the row index.
df = pd.read_csv('../../data/csl/CSL_tytl_PI.csv', index_col=0)

# Feature-selection file: tab-separated, no header row, first column used as
# the index. Its index values are used below as the column names to keep.
# Alternative feature files tried previously:
#   '../../data/csl/Features/PI_Tl_Del_noSiteCorr.csv'
#   '../../data/csl/Features/PI_Tl_Del_noSite_abCont_u50.csv'
corr_df = pd.read_csv(
    '../../data/csl/Features/PI_Tl_Del_noSite_abCont_ALL.csv',
    sep='\t',
    header=None,
    index_col=0,
)
corr_vars = corr_df.index.tolist()

In [3]:
%%time
from mwb_bootstrap import bootstrap_stat
from sklearn.ensemble import GradientBoostingClassifier

site_stats = pd.DataFrame()
site_feats = pd.DataFrame()

# Loop through Sites
for site in [41, 44, 48, 49, 51]:
    site_df = df[df['Sitenum'] == site]
    X = site_df.drop('trans_loss', axis=1, inplace=False)
    X = X[corr_vars]
    print('Delmode' in X.columns)
    y = site_df['trans_loss'].values
    
    clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, 
                          min_samples_leaf=50, min_samples_split=900,max_leaf_nodes=None,
                          max_features=12,subsample=0.9, learning_rate=0.1,random_state=7)
    stats_df, feats_df, X_train = bootstrap_stat(X, y, clf, test_size=0.25, 
                                                 sample_weights=True, nsamples=20, under=False)
    stats_df.loc['mean'] = stats_df.mean()
    site_stats[site] = stats_df.loc['mean',:]
    
    feats_df['mean'] = feats_df.mean(axis=1)
    if site_feats.empty:
        feats_df.sort_values(by='mean', inplace=True, ascending=False)
        site_feats[site] = feats_df['mean']
    else:
        this_site = pd.DataFrame(feats_df['mean'])
        this_site.columns = [site]
        site_feats = site_feats.merge(this_site, left_index=True, 
                                      right_index=True, how='left')
    
site_stats

True
True
True
True
True
CPU times: user 2min, sys: 652 ms, total: 2min 1s
Wall time: 2min 1s


Unnamed: 0,41,44,48,49,51
recall,0.464565,0.82064,0.527108,0.855628,0.344444
prec,0.12301,0.173404,0.053642,0.15987,0.033059
MCC,0.061325,0.286283,0.126302,0.301014,0.080885
PR_AUC,0.136898,0.265911,0.069899,0.225272,0.038532
roc_auc,0.585798,0.827181,0.809923,0.863327,0.74925


In [None]:
# Grouped bar chart of the per-site mean scores (one group per site).
# The figure and axes are created explicitly and handed to pandas: calling
# plt.figure() and then DataFrame.plot(figsize=...) made pandas open a second
# figure, which left the first one empty in the cell output.
fig, ax = plt.subplots(figsize=(12, 5))
site_stats.T.plot.bar(ax=ax, title="Prediction stats by Sitenum")
ax.set_ylabel("Score")
ax.set_xlabel("Sitenum")
plt.show()

In [None]:
# Line chart of the per-site mean scores (one line per metric).
# The figure and axes are created explicitly and handed to pandas: calling
# plt.figure() and then DataFrame.plot(figsize=...) made pandas open a second
# figure, which left the first one empty in the cell output.
fig, ax = plt.subplots(figsize=(12, 5))
site_stats.T.plot(ax=ax, title="Prediction stats by Sitenum")
ax.set_ylabel("Score")
ax.set_xlabel("Sitenum")
# The site ids are numeric, so the x axis is numeric too; pin the ticks to the
# actual site ids instead of matplotlib's automatic tick positions.
ax.set_xticks(list(site_stats.columns))
plt.show()

In [4]:
# Display the per-site feature table built in the site loop: one row per
# feature, one column per site, each value being that site's 'mean' column of
# feats_df. Row order follows the first site's descending sort.
site_feats

Unnamed: 0,41,44,48,49,51
momrace_new,0.137966,0.007117,0.020001,0.007269,0.019370
BESTGA,0.048442,0.037570,0.022729,0.015840,0.055182
AdmSBP,0.047393,0.017985,0.038033,0.011660,0.009746
Anteanemia,0.046732,0.006431,0.000648,0.010511,0.000000
AdmDBP,0.044998,0.015823,0.036440,0.013644,0.009534
...,...,...,...,...,...
hypertyro,0.000000,0.000000,0.000000,0.000000,0.000000
Hyp_clon,0.000000,0.000000,0.000000,0.000000,0.000000
Hyp_calc,0.000000,0.000000,0.000000,0.000387,0.000000
Hyp_alpha,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
# Plot the first `num_labels` rows of site_feats, one line per site.
num_labels = 25

# Slice once so that tick positions and tick labels always match the rows that
# are actually drawn, even when site_feats has fewer than num_labels rows
# (the previous range(num_labels) ticks would then not match the label count).
top_feats = site_feats.iloc[0:num_labels, :]
x_labels = top_feats.index.values
print(x_labels)

# Create the axes explicitly and pass them to pandas; plt.figure() followed by
# DataFrame.plot(figsize=...) left an empty extra figure in the output.
fig, ax = plt.subplots(figsize=(12, 5))
top_feats.plot(ax=ax, title="Top features by Site")
ax.set_ylabel("Raw coefficient")
ax.set_xlabel("Variable")

# One tick per plotted feature, labelled with the feature name and rotated so
# the names do not overlap.
ax.set_xticks(range(len(x_labels)))
ax.set_xticklabels(x_labels, rotation=90, fontsize=12)
plt.show()

In [5]:
# Twelve largest values in the site-41 column, in descending order.
site_feats[41].sort_values(ascending=False).head(12)

momrace_new    0.137966
BESTGA         0.048442
AdmSBP         0.047393
Anteanemia     0.046732
AdmDBP         0.044998
new_BMI        0.038756
new_age        0.035536
Admcontract    0.032006
Dilat_lst      0.021779
AdmBishop      0.021283
Delmode        0.020065
Education      0.019643
Name: 41, dtype: float64

In [6]:
# Twelve largest values in the site-44 column, in descending order.
site_feats[44].sort_values(ascending=False).head(12)

Delmode       0.260834
CS_FTP        0.047750
TrialLabor    0.042860
BESTGA        0.037570
spontlabor    0.035958
Dilat_lst     0.029896
new_age       0.023859
prelaborCD    0.023353
Admreason     0.023117
Admefface     0.022465
new_BMI       0.018756
uscar         0.018518
Name: 44, dtype: float64

In [7]:
# Twelve largest values in the site-49 column, in descending order.
site_feats[49].sort_values(ascending=False).head(12)

Delmode        0.201261
Delfetalpos    0.101549
Hxanemia       0.092565
TrialLabor     0.051426
CS_FTP         0.042513
Lac_Min        0.031643
Lac_None       0.030244
Admefface      0.024739
prelaborCD     0.024000
Intrafever     0.019185
new_age        0.016524
BESTGA         0.015840
Name: 49, dtype: float64

In [8]:
# Row labels of the current feats_df, via its 'mean' column.
# NOTE(review): feats_df here is whatever the last executed cell left behind
# (presumably the final iteration of the site loop) - confirm execution order.
feats_df['mean'].to_frame().index

Index(['Accrete', 'Activeherpes', 'AdmBishop', 'Admcervpos', 'Admconsistency',
       'Admcontract', 'AdmDBP', 'Admefface', 'Admpresent', 'Admreason',
       ...
       'spontlabor', 'TD_nos', 'ThreatenedPB', 'threatpb9', 'TrialLabor',
       'UnspecHBP', 'Urupture', 'uscar', 'version9', 'vertex'],
      dtype='object', length=192)

In [9]:
# Rows belonging to site 49 only.
s49_df = df.loc[df['Sitenum'] == 49]

# Features: the corr_vars columns (target column dropped first); target: trans_loss.
X = s49_df.drop(columns='trans_loss')[corr_vars]
y = s49_df['trans_loss'].values
X

Unnamed: 0_level_0,Accrete,Activeherpes,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,Admreason,...,spontlabor,TD_nos,ThreatenedPB,threatpb9,TrialLabor,UnspecHBP,Urupture,uscar,version9,vertex
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49-00001,0,0,8,1,3,1.0,73,60.0,1,3,...,0,0,0,0,1,0,0,0,0,1
49-00002,0,0,8,3,3,0.0,67,80.0,1,3,...,0,0,0,0,1,0,0,0,0,1
49-00003,0,0,8,8,8,2.0,83,100.0,1,4,...,1,0,0,0,1,0,0,0,0,1
49-00004,0,0,8,3,3,3.0,73,50.0,1,3,...,0,0,0,0,1,0,0,0,0,1
49-00005,0,0,8,1,3,0.0,61,90.0,1,3,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49-26354,0,0,8,2,3,2.0,80,50.0,1,8,...,1,1,0,0,1,0,0,0,0,1
49-26356,0,0,8,3,2,1.0,76,40.0,1,8,...,0,0,0,0,1,0,0,0,0,1
49-26358,0,1,8,1,2,1.0,94,50.0,1,8,...,0,0,0,0,1,0,0,0,0,1
49-26359,0,0,8,3,3,2.0,81,50.0,1,3,...,0,0,0,0,1,0,0,0,0,1


In [10]:
%%time
from mwb_bootstrap import bootstrap_stat
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, min_samples_leaf=50,
                                     min_samples_split=900,max_leaf_nodes=None,max_features=12,subsample=0.9,
                                     learning_rate=0.1,random_state=7)
print(clf.get_params())
#stats_df, feats_df, X_train = bootstrap_stat(X, y, clf, nsamples=25, under=True)
stats_df, feats_df, X_train = bootstrap_stat(X, y, clf, test_size=0.2, sample_weights=True, nsamples=10, 
                                             under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df.loc['mean',:]

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 11, 'max_features': 12, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 50, 'min_samples_split': 900, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 70, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 7, 'subsample': 0.9, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
CPU times: user 12.3 s, sys: 11.5 ms, total: 12.3 s
Wall time: 12.3 s


recall     0.859162
prec       0.160498
MCC        0.302821
PR_AUC     0.225342
roc_auc    0.863063
Name: mean, dtype: float64

In [11]:
# Add the row-wise mean as a 'mean' column, order the rows by it (largest
# first, in place) and show the first 30 rows.
feats_df['mean'] = feats_df.mean(axis='columns')
feats_df.sort_values('mean', ascending=False, inplace=True)
feats_df.iloc[:30]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean
Delmode,0.261933,0.219739,0.205082,0.189348,0.195243,0.099768,0.176637,0.174806,0.178844,0.23057,0.193197
Delfetalpos,0.092729,0.099041,0.109978,0.112755,0.096893,0.103539,0.101186,0.103491,0.115396,0.107086,0.104209
Hxanemia,0.087453,0.092596,0.092911,0.095448,0.102298,0.090048,0.102538,0.081468,0.114163,0.09325,0.095217
TrialLabor,0.041112,0.035601,0.042506,0.045333,0.050548,0.088774,0.060221,0.052446,0.031135,0.033513,0.048119
CS_FTP,0.030934,0.043239,0.043639,0.043052,0.037355,0.05052,0.047307,0.038314,0.03814,0.031697,0.04042
Lac_Min,0.04051,0.039521,0.041433,0.01936,0.046055,0.02719,0.023331,0.04109,0.048759,0.022791,0.035004
Lac_None,0.02037,0.025216,0.023611,0.037539,0.015827,0.027636,0.035599,0.049837,0.040398,0.034568,0.03106
Admefface,0.015916,0.021326,0.026713,0.025183,0.024268,0.02618,0.024023,0.023947,0.023271,0.039833,0.025066
prelaborCD,0.010361,0.030373,0.006531,0.037214,0.008223,0.023862,0.025234,0.02484,0.022127,0.024662,0.021343
Intrafever,0.013012,0.017306,0.024175,0.0165,0.017149,0.024898,0.019001,0.017138,0.012261,0.019599,0.018104


In [12]:
# Rows belonging to site 44 only.
s44_df = df.loc[df['Sitenum'] == 44]

# Features: the corr_vars columns (target column dropped first); target: trans_loss.
X = s44_df.drop(columns='trans_loss')[corr_vars]
y = s44_df['trans_loss'].values
X

Unnamed: 0_level_0,Accrete,Activeherpes,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,Admreason,...,spontlabor,TD_nos,ThreatenedPB,threatpb9,TrialLabor,UnspecHBP,Urupture,uscar,version9,vertex
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44-00001,0,0,8,8,8,99.0,74,0.0,77,5,...,0,0,0,0,1,0,0,0,0,1
44-00003,0,0,8,8,8,99.0,74,100.0,77,8,...,1,0,1,0,1,0,0,0,0,0
44-00004,0,0,8,8,8,0.0,74,20.0,77,3,...,0,0,0,0,1,0,0,0,0,1
44-00005,0,0,8,8,8,99.0,74,60.0,77,4,...,1,0,0,0,1,0,0,0,0,1
44-00006,0,0,8,8,8,99.0,74,50.0,77,3,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44-19938,0,0,8,8,8,3.3,65,40.0,77,6,...,0,0,1,0,1,0,0,0,0,1
44-19939,0,0,8,8,8,88.0,74,0.0,77,3,...,0,0,0,0,1,0,0,0,0,1
44-19940,0,0,8,8,8,88.0,96,0.0,77,4,...,0,0,0,0,0,0,0,1,0,1
44-19941,0,0,8,8,8,99.0,74,90.0,77,4,...,1,0,1,0,1,0,0,0,0,1


In [13]:
%%time
from mwb_bootstrap import bootstrap_stat
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, min_samples_leaf=50,
                                     min_samples_split=900,max_leaf_nodes=None,max_features=12,subsample=0.9,
                                     learning_rate=0.1,random_state=7)
print(clf.get_params())
#stats_df, feats_df, X_train = bootstrap_stat(X, y, clf, nsamples=25, under=True)
stats_df, feats_df, X_train = bootstrap_stat(X, y, clf, test_size=0.2, sample_weights=True, nsamples=10, 
                                             under=False)
stats_df.loc['mean'] = stats_df.mean()
stats_df.loc['mean',:]

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 11, 'max_features': 12, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 50, 'min_samples_split': 900, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 70, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 7, 'subsample': 0.9, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
CPU times: user 7.68 s, sys: 19.5 ms, total: 7.7 s
Wall time: 7.69 s


recall     0.817733
prec       0.173756
MCC        0.286086
PR_AUC     0.267469
roc_auc    0.828282
Name: mean, dtype: float64

In [14]:
# Add the row-wise mean as a 'mean' column, order the rows by it (largest
# first, in place) and show the first 30 rows.
feats_df['mean'] = feats_df.mean(axis='columns')
feats_df.sort_values('mean', ascending=False, inplace=True)
feats_df.iloc[:30]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean
Delmode,0.291295,0.286626,0.299019,0.270717,0.266025,0.237767,0.205587,0.28132,0.24131,0.27128,0.265095
CS_FTP,0.033714,0.047718,0.042223,0.037653,0.033971,0.055558,0.071624,0.033961,0.051936,0.054859,0.046322
TrialLabor,0.044856,0.041979,0.036731,0.067441,0.046096,0.055508,0.051463,0.030899,0.037906,0.043846,0.045673
BESTGA,0.029333,0.040252,0.045051,0.03925,0.037621,0.036555,0.041642,0.040263,0.033345,0.030919,0.037423
spontlabor,0.022408,0.031119,0.041269,0.019746,0.019853,0.054772,0.035552,0.0193,0.022857,0.037168,0.030404
Dilat_lst,0.034917,0.0201,0.027853,0.034642,0.027209,0.028834,0.035206,0.029024,0.023804,0.030522,0.029211
new_age,0.022742,0.031445,0.022334,0.026,0.022061,0.01637,0.028144,0.029393,0.022989,0.025212,0.024669
Admreason,0.018526,0.026929,0.018742,0.021917,0.026484,0.024168,0.025997,0.025224,0.027445,0.029675,0.024511
Admefface,0.014591,0.020012,0.026908,0.014565,0.026337,0.029517,0.01677,0.023751,0.049986,0.020427,0.024286
uscar,0.028106,0.031571,0.008942,0.028742,0.032018,0.018762,0.016419,0.028018,0.015952,0.011098,0.021963


In [15]:
# Display the stats table left by the most recently executed bootstrap_stat
# cell (columns: recall, prec, MCC, PR_AUC, roc_auc, per the output below).
stats_df

Unnamed: 0,recall,prec,MCC,PR_AUC,roc_auc
0,0.81686,0.177961,0.292162,0.262361,0.832943
1,0.811047,0.171587,0.28104,0.270989,0.826202
2,0.793605,0.17224,0.277545,0.245235,0.821215
3,0.825581,0.172855,0.28674,0.27893,0.823745
4,0.851744,0.175659,0.297866,0.280456,0.835805
5,0.796512,0.168408,0.272428,0.250493,0.819005
6,0.802326,0.17207,0.279531,0.262986,0.822938
7,0.811047,0.173724,0.284292,0.246665,0.829921
8,0.811047,0.174266,0.285112,0.285009,0.825616
9,0.857558,0.178788,0.304142,0.29156,0.845429
