# 5. Prediction on New Data

This notebook assumes that the new data is stored in the folder "01.Data/New Data/" as an Excel file named "NewData.xlsx", in a worksheet named "NewData".

Imports:

In [130]:
import os
import pandas as pd
import pickle
import numpy as np
import xgboost as xgb
import sklearn.metrics as skmetric

Directories:

In [64]:
def get_parent_dir(directory):
    """Return the parent directory of ``directory``.

    This is the "head" part of the path as split by ``os.path``; a trailing
    separator therefore only strips the separator itself.
    """
    head, _tail = os.path.split(directory)
    return head

# Project root: the parent of the directory the notebook is run from.
current_dirs_parent = get_parent_dir(os.getcwd())
# Data and model folders, built by plain string concatenation; every path ends with "/"
# so that file names can be appended directly.
dataraw_dir=current_dirs_parent+"/01.Data/Raw/"
preproc_dir=current_dirs_parent+"/01.Data/Preprocessing/"
models_a_dir=current_dirs_parent+"/03.Models/Set A/"
models_b_dir=current_dirs_parent+"/03.Models/Set B/"
datanew_dir=current_dirs_parent+"/01.Data/New Data/"

Read data:

In [22]:
# Read the "NewData" worksheet. The date helpers in this notebook use the `.dt` accessor
# on the "time" column, so it is assumed to be parsed as datetimes — TODO confirm for other inputs.
newdata=pd.read_excel(datanew_dir+"NewData.xlsx", sheet_name='NewData')

In [23]:
newdata.head()

Unnamed: 0,time,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
0,1999-05-26 09:30:00,-0.031064,14.012333,14.305617,350.172129,125.340586,320.88325,-3.533673,-3.323706,-3.743641,...,3566.251414,3642.906687,12.191752,2053.524851,269.336649,607.634592,-3.644594,97.163676,-3.343541,0
1,1999-05-26 09:32:00,-0.085997,14.098046,14.075781,349.319956,125.355128,321.09492,-3.533673,-3.323706,-3.743641,...,3566.251414,3642.911082,12.191752,2038.584177,269.582255,608.275462,-3.644631,97.297251,-3.343535,0
2,1999-05-26 09:34:00,-0.094949,14.188851,14.384268,348.635142,125.369685,321.30662,-3.533673,-3.323706,-3.743641,...,3566.251414,3642.915476,12.191752,2047.170603,269.432688,606.383311,-3.644669,98.01639,-3.34353,0
3,1999-05-26 09:36:00,-0.05504,14.141621,14.379366,347.950358,125.384226,322.58253,-3.533673,-3.323706,-3.743641,...,3566.251414,3642.919871,12.191752,2034.14033,269.283152,607.634592,-3.644707,97.774645,-3.343525,0
4,1999-05-26 09:38:00,-0.0744,14.15638,14.107552,347.265543,125.398783,321.980265,-3.533673,-3.323706,-3.723804,...,3566.251414,3642.924021,12.191752,2035.51179,269.133585,607.02418,-3.644745,97.175883,-3.343519,0


Transformations:

In [24]:
def event_id(df,yname="y"):
    """Add an ``EventID`` column that numbers the segments delimited by ``yname``.

    The label column is shifted one row up (the last row is filled with 0),
    cumulatively summed and offset by 1, so the ID increases on the row
    *before* each non-zero label.

    The frame is modified in place and the same object is returned.
    """
    next_label = df[yname].shift(periods=-1, fill_value=0)
    df["EventID"] = next_label.cumsum() + 1
    return df

def add_cycles(df):
    """Add a ``Cycle`` column: the 1-based row counter within each ``EventID`` group.

    The frame is modified in place and the same object is returned.
    """
    df["Cycle"] = 1
    ones_by_event = df.groupby("EventID")["Cycle"]
    df["Cycle"] = ones_by_event.cumsum()
    return df
    

def lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    """Add lagged differences of the ``x<i>`` columns.

    For every lag ``n`` in ``n_list`` and every variable number ``i`` in
    ``i_list`` a column ``x<i>_df_l<n>`` holding ``x<i>.diff(periods=n)`` is added.

    df: pandas DataFrame containing the ``x<i>`` columns; modified in place.
    i_list: variable numbers (columns are assumed to be named ``x1``, ``x2``, ...).
    n_list: lag periods.

    Returns the same DataFrame object.
    """
    for lag in n_list:
        for var in i_list:
            source = "x{}".format(var)
            df["{}_df_l{}".format(source, lag)] = df[source].diff(periods=lag)
    return df

def lagdif2_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    """Add second-order lagged differences of the ``x<i>`` columns.

    For every lag ``n`` in ``n_list`` and every variable number ``i`` in
    ``i_list`` a column ``x<i>_df2_l<n>`` is added, holding the ``n``-period
    difference of the ``n``-period difference of ``x<i>``.

    df: pandas DataFrame containing the ``x<i>`` columns; modified in place.
    i_list: variable numbers (columns are assumed to be named ``x1``, ``x2``, ...).
    n_list: lag periods.

    Returns the same DataFrame object.
    """
    for lag in n_list:
        for var in i_list:
            source = "x{}".format(var)
            first_order = df[source].diff(periods=lag)
            df["{}_df2_l{}".format(source, lag)] = first_order.diff(periods=lag)
    return df
            

def perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    """Add lagged percentage changes of the ``x<i>`` columns.

    For every lag ``n`` in ``n_list`` and every variable number ``i`` in
    ``i_list`` a column ``x<i>_pdf_l<n>`` holding ``x<i>.pct_change(periods=n)``
    is added.

    df: pandas DataFrame containing the ``x<i>`` columns; modified in place.
    i_list: variable numbers (columns are assumed to be named ``x1``, ``x2``, ...).
    n_list: lag periods.

    Returns the same DataFrame object.
    """
    for lag in n_list:
        for var in i_list:
            source = "x{}".format(var)
            df["{}_pdf_l{}".format(source, lag)] = df[source].pct_change(periods=lag)
    return df

def abs_perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    """Add absolute lagged percentage changes of the ``x<i>`` columns.

    For every lag ``n`` in ``n_list`` and every variable number ``i`` in
    ``i_list`` a column ``x<i>_apdf_l<n>`` holding the absolute value of
    ``x<i>.pct_change(periods=n)`` is added.

    df: pandas DataFrame containing the ``x<i>`` columns; modified in place.
    i_list: variable numbers (columns are assumed to be named ``x1``, ``x2``, ...).
    n_list: lag periods.

    Returns the same DataFrame object.
    """
    for lag in n_list:
        for var in i_list:
            source = "x{}".format(var)
            relative_change = df[source].pct_change(periods=lag)
            df["{}_apdf_l{}".format(source, lag)] = relative_change.abs()
    return df

def EMW_perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3],alpha_list=[0.25,0.5,0.75]):
    """Add exponentially weighted means of the lagged percentage changes.

    For every smoothing factor ``a`` in ``alpha_list``, lag ``n`` in ``n_list``
    and variable number ``i`` in ``i_list`` a column ``x<i>_emwpdf_a<a>_l<n>``
    is added, holding ``x<i>.pct_change(periods=n).ewm(alpha=a).mean()``.

    df: pandas DataFrame containing the ``x<i>`` columns; modified in place.
    i_list: variable numbers (columns are assumed to be named ``x1``, ``x2``, ...).
    n_list: lag periods.
    alpha_list: smoothing parameters passed to ``ewm``.

    Returns the same DataFrame object.
    """
    for alpha in alpha_list:
        for lag in n_list:
            for var in i_list:
                source = "x{}".format(var)
                target = "{}_emwpdf_a{}_l{}".format(source, alpha, lag)
                relative_change = df[source].pct_change(periods=lag)
                df[target] = relative_change.ewm(alpha=alpha).mean()
    return df

def log_perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    """Add the log of (1 + lagged percentage change) for the ``x<i>`` columns.

    For every lag ``n`` in ``n_list`` and every variable number ``i`` in
    ``i_list`` a column ``x<i>_logpdf_l<n>`` holding
    ``log(x<i>.pct_change(periods=n) + 1)`` is added.

    df: pandas DataFrame containing the ``x<i>`` columns; modified in place.
    i_list: variable numbers (columns are assumed to be named ``x1``, ``x2``, ...).
    n_list: lag periods.

    Returns the same DataFrame object.
    """
    for lag in n_list:
        for var in i_list:
            source = "x{}".format(var)
            growth_ratio = df[source].pct_change(periods=lag) + 1
            df["{}_logpdf_l{}".format(source, lag)] = np.log(growth_ratio)
    return df

def EMW_log_perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3],alpha_list=[0.25,0.5,0.75]):
    """Add exponentially weighted means of the log percentage changes.

    For every smoothing factor ``a`` in ``alpha_list``, lag ``n`` in ``n_list``
    and variable number ``i`` in ``i_list`` a column ``x<i>_emwpdf_a<a>_l<n>``
    is added, holding
    ``log(x<i>.pct_change(periods=n) + 1).ewm(alpha=a).mean()``.

    df: pandas DataFrame containing the ``x<i>`` columns; modified in place.
    i_list: variable numbers (columns are assumed to be named ``x1``, ``x2``, ...).
    n_list: lag periods.
    alpha_list: smoothing parameters passed to ``ewm``.

    Returns the same DataFrame object.

    NOTE(review): the column suffix ``_emwpdf_a`` is the same one
    ``EMW_perc_lagdif_xi_n`` uses, so calling both on one frame with the same
    arguments overwrites the first result. The name is kept unchanged here for
    backward compatibility — confirm whether a distinct suffix is wanted.
    """
    for a in alpha_list:
        for n in n_list:
            for i in i_list:
                source = "x"+str(i)
                # The previous version ended the assignment with a bare `_` and put this
                # expression on the following line, so the computed value was discarded.
                log_change = np.log(df[source].pct_change(periods=n)+1)
                df[source+"_emwpdf_a"+str(a)+"_l"+str(n)] = log_change.ewm(alpha=a).mean()

    return df

Date functions:

In [25]:
#Extracting:
##xday
def add_day(df):
    """Add ``xday``, the day of the month taken from the ``time`` column.

    The frame is modified in place and the same object is returned.
    """
    timestamps = df["time"]
    df["xday"] = timestamps.dt.day
    return df

##xhour
def add_hour(df):
    """Add ``xhour``, the hour of the day taken from the ``time`` column.

    The frame is modified in place and the same object is returned.
    """
    timestamps = df["time"]
    df["xhour"] = timestamps.dt.hour
    return df

##xminute
def add_minute(df):
    """Add ``xminute``, the minute of the hour taken from the ``time`` column.

    The frame is modified in place and the same object is returned.
    """
    timestamps = df["time"]
    df["xminute"] = timestamps.dt.minute
    return df

##Month
def add_month(df):
    """Add ``xmonth``, the month number taken from the ``time`` column.

    The frame is modified in place and the same object is returned.
    """
    timestamps = df["time"]
    df["xmonth"] = timestamps.dt.month
    return df

def date_dummies(df,varlist=["xday","xmonth","xhour","xminute"]):
    """Append one-hot indicator columns for each column named in ``varlist``.

    Indicator columns are produced by ``pd.get_dummies`` with the source column
    name as prefix and are appended to the right of the frame, one source
    column after another. The source columns themselves are kept.

    Unlike the other helpers, this one builds a new frame through ``pd.concat``
    and does not add columns to ``df`` itself.
    """
    widened = df
    for column in varlist:
        indicators = pd.get_dummies(widened[column], prefix=column)
        widened = pd.concat([widened, indicators], axis=1)
    return widened

def time_diff(df):
    """Add ``xtimedif``: minutes elapsed since the previous row, minus 2.

    A value of 0 therefore means the row follows its predecessor by exactly
    two minutes; the first row has no predecessor and gets NaN.

    The frame is modified in place and the same object is returned.
    """
    one_minute = np.timedelta64(1, 'm')
    minutes_since_previous = df["time"].diff(periods=1) / one_minute
    df["xtimedif"] = minutes_since_previous - 2
    return df

def add_skip(df):
    """Add ``xskip``: 1 where ``xtimedif`` is strictly positive, otherwise 0.

    Rows whose ``xtimedif`` is NaN compare as not positive and get 0.

    The frame is modified in place and the same object is returned.
    """
    gap_is_positive = df["xtimedif"] > 0
    # Assigned as a plain list, i.e. by position rather than by index label.
    df["xskip"] = gap_is_positive.astype(int).tolist()
    return df


def add_skipnumber(df):
    """Add ``xskipid``: the running count of ``xskip`` flags, starting at 1.

    The ID increases by one on every row whose ``xskip`` is 1.

    The frame is modified in place and the same object is returned.
    """
    skips_so_far = df["xskip"].cumsum()
    df["xskipid"] = skips_so_far + 1
    return df

def add_sinceskip(df):
    """Add ``xsinceskip``: the 1-based row counter within each ``xskipid`` group.

    The counter is a number of rows, not an elapsed time.

    The frame is modified in place and the same object is returned.
    """
    df["xsinceskip"] = 1
    ones_by_skip = df.groupby("xskipid")["xsinceskip"]
    df["xsinceskip"] = ones_by_skip.cumsum()
    return df

Pipeline:

In [26]:
def preprocessing_pipeline(df):
    """Build the full feature frame from the raw data, working on a copy of ``df``.

    Steps, in column order:
      1. lagged differences, percentage changes and absolute percentage
         changes of x1..x61 for lags 0..3;
      2. exponentially weighted percentage changes of x1..x60 for lags 0..3
         and alpha in (0.1, 0.5, 0.95);
      3. second-order lagged differences of x1..x61 for lags 0..3;
      4. calendar fields (minute, hour, day, month) from ``time``;
      5. time-gap features (``xtimedif``, ``xskip``, ``xskipid``, ``xsinceskip``).

    The log-based helpers (``log_perc_lagdif_xi_n`` and
    ``EMW_log_perc_lagdif_xi_n``) are not applied.

    NOTE(review): lag 0 compares every value with itself, and the EMW step
    covers one variable fewer than the other steps. Both are kept as they are
    because the scaler and models loaded later presumably expect exactly this
    column set and order — verify against the training notebook before changing.
    """
    vars_all = range(1, 62)   # x1 .. x61
    vars_emw = range(1, 61)   # x1 .. x60
    lags = range(4)           # 0, 1, 2, 3

    out = df.copy()
    out = lagdif_xi_n(df=out, i_list=vars_all, n_list=lags)
    out = perc_lagdif_xi_n(df=out, i_list=vars_all, n_list=lags)
    out = abs_perc_lagdif_xi_n(df=out, i_list=vars_all, n_list=lags)
    out = EMW_perc_lagdif_xi_n(df=out, i_list=vars_emw, n_list=lags, alpha_list=[0.1,0.5,0.95])
    out = lagdif2_xi_n(df=out, i_list=vars_all, n_list=lags)

    # Calendar fields, then the gap features; each gap step reads the column
    # produced by the step before it, so the order matters.
    for add_calendar_field in (add_minute, add_hour, add_day, add_month):
        out = add_calendar_field(out)
    for add_gap_feature in (time_diff, add_skip, add_skipnumber, add_sinceskip):
        out = add_gap_feature(out)

    return out

In [29]:
# Drop any previous result first — presumably so a re-run of this cell does not hold two
# copies of the wide feature frame at once (assumption; confirm).
newdata_preproc=None
# The pipeline works on a copy, so `newdata` itself is left unchanged.
newdata_preproc=preprocessing_pipeline(newdata)

In [44]:
newdata_preproc.head()

Unnamed: 0,time,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x60_df2_l3,x61_df2_l3,xminute,xhour,xday,xmonth,xtimedif,xskip,xskipid,xsinceskip
0,1999-05-26 09:30:00,-0.031064,14.012333,14.305617,350.172129,125.340586,320.88325,-3.533673,-3.323706,-3.743641,...,,,30,9,26,5,,0,1,1
1,1999-05-26 09:32:00,-0.085997,14.098046,14.075781,349.319956,125.355128,321.09492,-3.533673,-3.323706,-3.743641,...,,,32,9,26,5,0.0,0,1,2
2,1999-05-26 09:34:00,-0.094949,14.188851,14.384268,348.635142,125.369685,321.30662,-3.533673,-3.323706,-3.743641,...,,,34,9,26,5,0.0,0,1,3
3,1999-05-26 09:36:00,-0.05504,14.141621,14.379366,347.950358,125.384226,322.58253,-3.533673,-3.323706,-3.743641,...,,,36,9,26,5,0.0,0,1,4
4,1999-05-26 09:38:00,-0.0744,14.15638,14.107552,347.265543,125.398783,321.980265,-3.533673,-3.323706,-3.723804,...,,,38,9,26,5,0.0,0,1,5


In [49]:
# Keep only the feature columns: every column whose name starts with "x".
# Other columns, such as "time", are left out.
selcols=[col for col in newdata_preproc.columns if col.startswith("x")]
# Differencing leaves NaN at the start of each series, and a percentage change against a
# zero baseline yields +inf or -inf. All three are mapped to 0. The previous version
# replaced only +inf, so a -inf value would have been passed on to the scaler unchanged.
X_newdata=newdata_preproc[selcols].replace([np.inf,-np.inf],np.nan).fillna(0)

In [50]:
X_newdata.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x60_df2_l3,x61_df2_l3,xminute,xhour,xday,xmonth,xtimedif,xskip,xskipid,xsinceskip
0,-0.031064,14.012333,14.305617,350.172129,125.340586,320.88325,-3.533673,-3.323706,-3.743641,-3.713732,...,0.0,0.0,30,9,26,5,0.0,0,1,1
1,-0.085997,14.098046,14.075781,349.319956,125.355128,321.09492,-3.533673,-3.323706,-3.743641,-3.713732,...,0.0,0.0,32,9,26,5,0.0,0,1,2
2,-0.094949,14.188851,14.384268,348.635142,125.369685,321.30662,-3.533673,-3.323706,-3.743641,-3.723804,...,0.0,0.0,34,9,26,5,0.0,0,1,3
3,-0.05504,14.141621,14.379366,347.950358,125.384226,322.58253,-3.533673,-3.323706,-3.743641,-3.723804,...,0.0,0.0,36,9,26,5,0.0,0,1,4
4,-0.0744,14.15638,14.107552,347.265543,125.398783,321.980265,-3.533673,-3.323706,-3.723804,-3.723804,...,0.0,0.0,38,9,26,5,0.0,0,1,5


Scaling this preprocessed data:

In [45]:
#Load scaler
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# The `with` block closes the file handle once the object is loaded.
with open(models_b_dir+"clust_scaler.pickle.dat", "rb") as scaler_file:
    scaler=pickle.load(scaler_file)

In [51]:
# Apply the loaded scaler (transform only; nothing is re-fitted here). The columns of
# X_newdata presumably have to match, in order, the columns the scaler was fitted on — confirm.
X_newdata_sc=scaler.transform(X_newdata)

Load all fitted clustering, outlier-detection and dimensionality-reduction models:

In [55]:
def _load_pickle(path):
    """Load one pickled object from ``path`` and close the file afterwards.

    NOTE: unpickling can execute arbitrary code; only use on files produced by this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)

clf=_load_pickle(models_b_dir+"clust_svdd.pickle.dat")
db=_load_pickle(models_b_dir+"clust_db.pickle.dat")
isoforest=_load_pickle(models_b_dir+"clust_isoforest.pickle.dat")
pca=_load_pickle(models_b_dir+"clust_pca.pickle.dat")
fica=_load_pickle(models_b_dir+"clust_fica.pickle.dat")
svd=_load_pickle(models_b_dir+"clust_svd.pickle.dat")
lof=_load_pickle(models_b_dir+"clust_lof.pickle.dat")
tsnes=_load_pickle(models_b_dir+"clust_tsnes.pickle.dat")

Cluster the new data:

In [56]:
# Label predictions from the loaded models, each wrapped in a DataFrame.
svdd_test=pd.DataFrame(clf.predict(X_newdata_sc))
# NOTE(review): fit_predict fits on the new data, so the labels presumably do not come
# from the fit stored in the pickle — confirm this matches how the training features were built.
dbscan_test=pd.DataFrame(db.fit_predict(X_newdata_sc))
isoforest_test=pd.DataFrame(isoforest.predict(X_newdata_sc))

# Projections of the new data onto the loaded decompositions.
test_pca_df=pd.DataFrame(pca.transform(X_newdata_sc))
# NOTE(review): test_fica_df is not part of the column names or the concatenation built
# in the following cells — confirm that leaving it out is intended.
test_fica_df=pd.DataFrame(fica.transform(X_newdata_sc))
test_svd_df=pd.DataFrame(svd.transform(X_newdata_sc))
# NOTE(review): fit_transform likewise fits on the new data, so this embedding is
# presumably not comparable with one computed on the training data — confirm.
test_tsne_df=pd.DataFrame(tsnes.fit_transform(X_newdata_sc))

In [117]:
# Labels from the loaded LOF model; calling predict on unseen data presumably requires the
# model to have been fitted in novelty-detection mode — confirm.
test_lof_df=pd.DataFrame(lof.predict(X_newdata_sc))

Join in a new dataframe:

In [118]:
# (prefix, frame) pairs, listed in the order in which the frames are concatenated.
# Each output column is named prefix + the frame's own column label (0, 1, 2, ...).
_prefixed_frames=[("x_svdd_",svdd_test),
                  ("x_db_",dbscan_test),
                  ("x_iso_",isoforest_test),
                  ("x_pca_",test_pca_df),
                  ("x_svd_",test_svd_df),
                  ("x_lof_",test_lof_df),
                  ("x_tsne_",test_tsne_df)]
c_colnames=[prefix+str(label) for prefix,frame in _prefixed_frames for label in frame.columns]

In [119]:
# Side-by-side concatenation; the frame order here must stay identical to the order used
# when building c_colnames, because the names are assigned purely by position.
clust_test_df=pd.concat([svdd_test,dbscan_test,isoforest_test,test_pca_df,test_svd_df,test_lof_df,test_tsne_df],axis=1)
clust_test_df.columns=c_colnames

In [120]:
clust_test_df.head()

Unnamed: 0,x_svdd_0,x_db_0,x_iso_0,x_pca_0,x_pca_1,x_pca_2,x_pca_3,x_pca_4,x_svd_0,x_svd_1,x_svd_2,x_svd_3,x_svd_4,x_lof_0,x_tsne_0,x_tsne_1,x_tsne_2
0,-1,-1,1,-0.3365,-0.671571,0.019272,-0.163695,-0.163861,-0.3365,-0.671576,0.019194,-0.163956,-0.160219,1,-15.162313,17.338415,-42.20219
1,-1,-1,1,-0.306868,-0.980071,-0.135501,-0.124544,0.163919,-0.306869,-0.980122,-0.136004,-0.126947,0.178375,-1,18.707413,-12.675645,-41.663372
2,-1,-1,1,-0.196795,1.877152,0.386392,0.922178,0.387118,-0.196795,1.877194,0.386761,0.923362,0.384856,-1,-31.314941,-54.680695,-16.887177
3,-1,-1,1,-0.184501,0.757122,-0.006129,0.615808,-0.053376,-0.184503,0.757072,-0.006509,0.613089,-0.034247,-1,-9.762266,-43.100525,-49.808998
4,-1,-1,1,-0.259596,1.070508,0.602782,0.781721,1.016329,-0.259597,1.07051,0.602917,0.781266,1.019,-1,-36.905991,-56.882664,-19.742243


Load all tree-based (XGBoost) models:

In [65]:
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# Each `with` block closes its file handle once the model is loaded.
# The first model comes from the Set B folder, the other two from Set A.
with open(models_b_dir+"xgboost_y_blead2.pickle.dat", "rb") as model_file:
    model2_blead2=pickle.load(model_file)
with open(models_a_dir+"xgboost_y_lead1.pickle.dat", "rb") as model_file:
    model2_lead1=pickle.load(model_file)
with open(models_a_dir+"xgboost_y.pickle.dat", "rb") as model_file:
    model2=pickle.load(model_file)

In [68]:
# Wrap the scaled feature matrix for xgboost. No feature names are passed, so columns are
# presumably matched to the trained models by position — confirm.
dtest_b=xgb.DMatrix(X_newdata_sc)

In [69]:
# All three models score the same scaled matrix.
# NOTE(review): model2 and model2_lead1 are loaded from the Set A folder while the scaler
# comes from Set B — confirm they were trained on identically scaled features.
y_lead2_pred_test_b_own = model2_blead2.predict(dtest_b)
y_pred_test_b = model2.predict(dtest_b)
y_lead1_pred_test_b = model2_lead1.predict(dtest_b)

In [77]:
# Raw outputs of the three xgboost models, one column each.
# NOTE(review): the columns are named "*_prob", but the displayed xgb_yblead2_prob values
# include negatives, so that output at least is presumably a raw score, not a probability.
xgb_test_b_own=pd.DataFrame({"xgb_y_prob" : y_pred_test_b,
                                 "xgb_ylead1_prob" : y_lead1_pred_test_b,
                                 #"xgb_ylead2_prob" : y_lead2_pred_test_b,
                                 "xgb_yblead2_prob" : y_lead2_pred_test_b_own})

In [78]:
xgb_test_b_own.head()

Unnamed: 0,xgb_y_prob,xgb_ylead1_prob,xgb_yblead2_prob
0,0.591168,0.612815,0.142413
1,0.64395,0.612815,0.163147
2,0.590034,0.655965,-0.227935
3,0.591831,0.621997,-0.558809
4,0.588151,0.623684,-0.227935


In [79]:
def logisticf(x):
    """Return ``1 / (1 + exp(x))``, element-wise for array-like input.

    NOTE(review): this is the standard logistic function evaluated at ``-x``
    (equivalently ``1 - sigmoid(x)``), so larger inputs map to smaller outputs.
    It is deliberately left as is: the features derived from it feed a
    previously trained model that was presumably built with the same
    transformation — verify against the training notebook before changing the sign.
    """
    return 1.0 / (1.0 + np.exp(x))

logisticf(-0.167528)

0.5417843204057448

In [80]:
# Transform every xgboost output with logisticf and tag the columns with "_prob".
# The source columns already end in "_prob", so the results end in "_prob_prob".
xgb_test_b_prob=logisticf(xgb_test_b_own).add_suffix("_prob")

In [81]:
xgb_test_b_prob.head()

Unnamed: 0,xgb_y_prob_prob,xgb_ylead1_prob_prob,xgb_yblead2_prob_prob
0,0.356367,0.351417,0.464457
1,0.344354,0.351417,0.459304
2,0.356627,0.341647,0.556738
3,0.356215,0.349328,0.636177
4,0.357059,0.348944,0.556738


Add exponentially weighted moving averages of the XGB features:

In [83]:
def EMW_prob_lagdif_xi_n(df,n_list=[1,2,3],alpha_list=[0.25,0.5,0.75]):
    """Return a copy of ``df`` extended with exponentially weighted means.

    For every smoothing factor ``a`` in ``alpha_list`` and every column ``c``
    that ``df`` has on entry, a column ``<c>_emwpdf_a<a>`` holding
    ``c.ewm(alpha=a).mean()`` is appended. Columns added along the way are not
    smoothed again.

    df: pandas DataFrame; it is copied and not modified.
    n_list: accepted for signature compatibility but not used.
    alpha_list: smoothing parameters passed to ``ewm``.
    """
    smoothed = df.copy()
    # Captured before any column is added, so only the original columns are smoothed.
    base_columns = smoothed.columns
    for alpha in alpha_list:
        for column in base_columns:
            target = str(column) + "_emwpdf_a" + str(alpha)
            smoothed[target] = smoothed[column].ewm(alpha=alpha).mean()
    return smoothed

In [85]:
# Smooth both the raw xgboost outputs and their logisticf-transformed versions with the
# same three alpha values; each call returns the input columns plus the smoothed ones.
xgb_test_b_emw=EMW_prob_lagdif_xi_n(xgb_test_b_own,alpha_list=[0.1,0.5,0.95])
xgb_test_b_prob_emw=EMW_prob_lagdif_xi_n(xgb_test_b_prob,alpha_list=[0.1,0.5,0.95])

In [86]:
xgb_test_b_emw.head()

Unnamed: 0,xgb_y_prob,xgb_ylead1_prob,xgb_yblead2_prob,xgb_y_prob_emwpdf_a0.1,xgb_ylead1_prob_emwpdf_a0.1,xgb_yblead2_prob_emwpdf_a0.1,xgb_y_prob_emwpdf_a0.5,xgb_ylead1_prob_emwpdf_a0.5,xgb_yblead2_prob_emwpdf_a0.5,xgb_y_prob_emwpdf_a0.95,xgb_ylead1_prob_emwpdf_a0.95,xgb_yblead2_prob_emwpdf_a0.95
0,0.591168,0.612815,0.142413,0.591168,0.612815,0.142413,0.591168,0.612815,0.142413,0.591168,0.612815,0.142413
1,0.64395,0.612815,0.163147,0.618948,0.612815,0.153325,0.626356,0.612815,0.156235,0.641437,0.612815,0.162159
2,0.590034,0.655965,-0.227935,0.608279,0.628738,0.012639,0.605601,0.637472,-0.063291,0.592598,0.653813,-0.208477
3,0.591831,0.621997,-0.558809,0.603496,0.626777,-0.153528,0.598257,0.629219,-0.327567,0.591869,0.623587,-0.541295
4,0.588151,0.623684,-0.227935,0.599749,0.626022,-0.171698,0.593041,0.626362,-0.276144,0.588337,0.623679,-0.243603


In [94]:
xgb_test_b_prob_emw.head()

Unnamed: 0,xgb_y_prob_prob,xgb_ylead1_prob_prob,xgb_yblead2_prob_prob,xgb_y_prob_prob_emwpdf_a0.1,xgb_ylead1_prob_prob_emwpdf_a0.1,xgb_yblead2_prob_prob_emwpdf_a0.1,xgb_y_prob_prob_emwpdf_a0.5,xgb_ylead1_prob_prob_emwpdf_a0.5,xgb_yblead2_prob_prob_emwpdf_a0.5,xgb_y_prob_prob_emwpdf_a0.95,xgb_ylead1_prob_prob_emwpdf_a0.95,xgb_yblead2_prob_prob_emwpdf_a0.95
0,0.356367,0.351417,0.464457,0.356367,0.351417,0.464457,0.356367,0.351417,0.464457,0.356367,0.351417,0.464457
1,0.344354,0.351417,0.459304,0.350044,0.351417,0.461745,0.348358,0.351417,0.461021,0.344926,0.351417,0.459549
2,0.356627,0.341647,0.556738,0.352473,0.347812,0.496798,0.353083,0.345834,0.515717,0.356043,0.342134,0.55189
3,0.356215,0.349328,0.636177,0.353561,0.348253,0.537327,0.354754,0.347697,0.579962,0.356206,0.348968,0.631963
4,0.357059,0.348944,0.556738,0.354416,0.348421,0.542067,0.355944,0.348341,0.567976,0.357017,0.348945,0.5605


Concatenating all data before Logistic Regression:

In [95]:
# Put the scaled values back into a DataFrame that carries the original feature names.
X_test_sc_df=pd.DataFrame(X_newdata_sc,columns=X_newdata.columns)

In [121]:
X_test_sc_df.shape

(1840, 1765)

In [122]:
clust_test_df.shape

(1840, 17)

In [123]:
xgb_test_b_emw.shape

(1840, 12)

In [124]:
X_test_sc_df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x60_df2_l3,x61_df2_l3,xminute,xhour,xday,xmonth,xtimedif,xskip,xskipid,xsinceskip
0,-0.122795,0.11428,-0.122528,0.373482,0.229374,1.218019,-1.622169,0.486298,-0.368877,-0.045063,...,2e-06,0.0,0.055307,-0.329704,1.752204,0.0,-0.028983,-0.077161,-1.774221,-1.187332
1,-0.20315,0.132324,-0.16174,0.367113,0.251404,1.223763,-1.622169,0.486298,-0.368877,-0.045063,...,2e-06,0.0,0.171038,-0.329704,1.752204,0.0,-0.028983,-0.077161,-1.774221,-1.181664
2,-0.216244,0.151439,-0.109109,0.361994,0.273456,1.229508,-1.622169,0.486298,-0.368877,-0.139113,...,2e-06,0.0,0.286769,-0.329704,1.752204,0.0,-0.028983,-0.077161,-1.774221,-1.175996
3,-0.157867,0.141497,-0.109945,0.356876,0.295485,1.26413,-1.622169,0.486298,-0.368877,-0.139113,...,2e-06,0.0,0.4025,-0.329704,1.752204,0.0,-0.028983,-0.077161,-1.774221,-1.170329
4,-0.186186,0.144604,-0.156319,0.351758,0.317537,1.247787,-1.622169,0.486298,-0.236463,-0.139113,...,2e-06,0.0,0.518232,-0.329704,1.752204,0.0,-0.028983,-0.077161,-1.774221,-1.164661


In [125]:
# Final design matrix: scaled features, cluster features, then raw and transformed xgboost
# features, joined column-wise on the row index.
# NOTE(review): the column order presumably has to match the one used to train LR13 — confirm.
final_b_test=pd.concat([X_test_sc_df,clust_test_df,xgb_test_b_emw,xgb_test_b_prob_emw],axis=1)

In [126]:
final_b_test.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,xgb_yblead2_prob_prob,xgb_y_prob_prob_emwpdf_a0.1,xgb_ylead1_prob_prob_emwpdf_a0.1,xgb_yblead2_prob_prob_emwpdf_a0.1,xgb_y_prob_prob_emwpdf_a0.5,xgb_ylead1_prob_prob_emwpdf_a0.5,xgb_yblead2_prob_prob_emwpdf_a0.5,xgb_y_prob_prob_emwpdf_a0.95,xgb_ylead1_prob_prob_emwpdf_a0.95,xgb_yblead2_prob_prob_emwpdf_a0.95
0,-0.122795,0.11428,-0.122528,0.373482,0.229374,1.218019,-1.622169,0.486298,-0.368877,-0.045063,...,0.464457,0.356367,0.351417,0.464457,0.356367,0.351417,0.464457,0.356367,0.351417,0.464457
1,-0.20315,0.132324,-0.16174,0.367113,0.251404,1.223763,-1.622169,0.486298,-0.368877,-0.045063,...,0.459304,0.350044,0.351417,0.461745,0.348358,0.351417,0.461021,0.344926,0.351417,0.459549
2,-0.216244,0.151439,-0.109109,0.361994,0.273456,1.229508,-1.622169,0.486298,-0.368877,-0.139113,...,0.556738,0.352473,0.347812,0.496798,0.353083,0.345834,0.515717,0.356043,0.342134,0.55189
3,-0.157867,0.141497,-0.109945,0.356876,0.295485,1.26413,-1.622169,0.486298,-0.368877,-0.139113,...,0.636177,0.353561,0.348253,0.537327,0.354754,0.347697,0.579962,0.356206,0.348968,0.631963
4,-0.186186,0.144604,-0.156319,0.351758,0.317537,1.247787,-1.622169,0.486298,-0.236463,-0.139113,...,0.556738,0.354416,0.348421,0.542067,0.355944,0.348341,0.567976,0.357017,0.348945,0.5605


In [127]:
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# The `with` block closes the file handle once the model is loaded.
with open(models_b_dir+"LR13.pickle.dat", "rb") as lr_file:
    LR13=pickle.load(lr_file)

In [128]:
LR13

LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10000, multi_class='warn', n_jobs=4, penalty='l1',
                   random_state=42, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [129]:
# Predictions of the loaded logistic regression, one per row of the final design matrix.
y_pred=LR13.predict(final_b_test)

In [132]:
#save as csv
# Written next to the input workbook as a single "y_pred" column without the row index;
# an existing y_pred.csv in that folder is overwritten.
pd.DataFrame({"y_pred":y_pred}).to_csv(datanew_dir+"y_pred.csv",index=False)