# QSR Data Challenge

## Preprocessing

Imports

In [36]:
import os
import pandas as pd
import datetime
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Get the current working directory:

In [2]:
def get_parent_dir(directory):
    import os
    return os.path.dirname(directory)

current_dirs_parent = get_parent_dir(os.getcwd())
dataraw_dir=current_dirs_parent+"/01.Data/Raw/"
preproc_dir=current_dirs_parent+"/01.Data/Preprocessing/"

Load the data:

In [3]:
data_a=pd.read_excel(dataraw_dir+"processminer-rare-event-detection-data-augmentation.xlsx", 
                     sheet_name='data-(a)-raw-data')
data_b=pd.read_excel(dataraw_dir+"processminer-rare-event-detection-data-augmentation.xlsx", 
                     sheet_name='data-(b)-4-min-ahead-conse-rmvd')
data_b.rename(columns={"y-4min-ahead":"y_lead2"},inplace=True)

Check first lines:

In [4]:
data_a.head()

Unnamed: 0,time,y,x1,x2,x3,x4,x5,x6,x7,x8,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
0,1999-05-01 00:00:00,0,0.360183,8.916353,10.298991,332.475535,125.067515,255.941905,-3.373756,-3.413735,...,3578.032176,3645.031931,4.149806,1990.071848,283.75117,611.968211,-3.620528,97.959956,-3.334341,0
1,1999-05-01 00:02:00,0,0.459238,8.970286,10.376388,335.208506,125.057612,257.853709,-3.373756,-3.413735,...,3578.036326,3645.041453,4.149061,1982.248728,287.995799,611.177744,-3.620436,98.097239,-3.334334,0
2,1999-05-01 00:04:00,0,0.347366,8.831394,10.0416,333.105845,125.047709,258.775156,-3.363685,-3.413735,...,3578.04072,3645.050974,4.148316,1980.47822,292.240397,612.032298,-3.620343,98.847277,-3.334327,0
3,1999-05-01 00:06:00,0,0.285108,8.753854,10.371135,332.139414,125.038203,258.094187,-3.372413,-3.413735,...,3578.045115,3645.060252,4.147571,1978.707713,286.908183,612.480906,-3.620251,99.561617,-3.33432,0
4,1999-05-01 00:08:00,0,0.249096,8.76286,10.061597,334.245188,125.031031,259.105875,-3.373756,-3.413735,...,3578.049509,3645.069773,4.146826,1976.937205,287.142985,612.096384,-3.620195,98.417231,-3.334312,0


In [5]:
data_b.head()

Unnamed: 0,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,x8,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
0,1999-05-01 00:00:00,0,0.360183,8.916353,10.298991,332.475535,125.067515,255.941905,-3.373756,-3.413735,...,3578.032176,3645.031931,4.149806,1990.071848,283.75117,611.968211,-3.620528,97.959956,-3.334341,0
1,1999-05-01 00:02:00,0,0.459238,8.970286,10.376388,335.208506,125.057612,257.853709,-3.373756,-3.413735,...,3578.036326,3645.041453,4.149061,1982.248728,287.995799,611.177744,-3.620436,98.097239,-3.334334,0
2,1999-05-01 00:04:00,0,0.347366,8.831394,10.0416,333.105845,125.047709,258.775156,-3.363685,-3.413735,...,3578.04072,3645.050974,4.148316,1980.47822,292.240397,612.032298,-3.620343,98.847277,-3.334327,0
3,1999-05-01 00:06:00,0,0.285108,8.753854,10.371135,332.139414,125.038203,258.094187,-3.372413,-3.413735,...,3578.045115,3645.060252,4.147571,1978.707713,286.908183,612.480906,-3.620251,99.561617,-3.33432,0
4,1999-05-01 00:08:00,0,0.249096,8.76286,10.061597,334.245188,125.031031,259.105875,-3.373756,-3.413735,...,3578.049509,3645.069773,4.146826,1976.937205,287.142985,612.096384,-3.620195,98.417231,-3.334312,0


In [6]:
data_b["y_lead2"].sum()

124

In [7]:
data_b.shape

(18398, 63)

Define functions for data transformations:

In [8]:
#Add EventId
def event_id(df,yname="y"):
    dfout=df
    dfout["EventID"]=dfout[yname].shift(periods=-1,fill_value=0)
    dfout["EventID"]=dfout["EventID"].cumsum()+1    
    return(dfout)

#Cycles since last failure
def add_cycles(df):
    dfout=df
    dfout["Cycle"]=1
    dfout["Cycle"]=dfout.groupby("EventID")["Cycle"].cumsum()
    return(dfout)
    

#Lagged difference. 
##i_list: variable number as list, assuming are all named x
##n_list: number of lag periods as list
##df: pandas dataframe to add variables
def lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    dfout=df
    for n in n_list:
        for i in i_list:
            dfout["x"+str(i)+"_df_l"+str(n)]=dfout["x"+str(i)].diff(periods=n)
    
    return(dfout)

#Second order Lagged difference. 
##i: variable number as list, assuming are all named x 
##n: number of lag periods as list
##df: pandas dataframe to add variables

def lagdif2_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    dfout=df
    for n in n_list:
        for i in i_list:
            dfout["x"+str(i)+"_df2_l"+str(n)]=dfout["x"+str(i)].diff(periods=n).diff(periods=n)
            
    return(dfout)
            

#Lagged percentual difference. 
##i_list: variable number as list, assuming are all named x
##n_list: number of lag periods as list
##df: pandas dataframe to add variables

def perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    dfout=df
    for n in n_list:
        for i in i_list:
            dfout["x"+str(i)+"_pdf_l"+str(n)]=dfout["x"+str(i)].pct_change(periods=n)
    
    return(dfout)

#Lagged absolute percentual difference
##i_list: variable number as list, assuming are all named x
##n_list: number of lag periods as list
##df: pandas dataframe to add variables

def abs_perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    dfout=df
    for n in n_list:
        for i in i_list:
            dfout["x"+str(i)+"_apdf_l"+str(n)]=dfout["x"+str(i)].pct_change(periods=n).abs()
            
    return(dfout)

#Exponential moving average percentual difference
##i_list: variable number as list, assuming are all named x
##n_list: number of lag periods as list
##alpha_list: smoothing parameter as list
##df: pandas dataframe to add variables

def EMW_perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3],alpha_list=[0.25,0.5,0.75]):
    dfout=df
    for a in alpha_list:
        for n in n_list:        
            for i in i_list:
                dfout["x"+str(i)+"_emwpdf_a"+str(a)+"_l"+str(n)]=dfout["x"+str(i)].pct_change(periods=n).ewm(alpha=a).mean()
            
    return(dfout)

#Lagged log percentual difference 
##i_list: variable number as list, assuming are all named x
##n_list: number of lag periods as list
##df: pandas dataframe to add variables

def log_perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3]):
    dfout=df
    for n in n_list:
        for i in i_list:
            dfout["x"+str(i)+"_logpdf_l"+str(n)]=np.log(dfout["x"+str(i)].pct_change(periods=n)+1)
    
    return(dfout)

#Exponential moving average of log percentual difference
##i_list: variable number as list, assuming are all named x
##n_list: number of lag periods as list
##alpha_list: smoothing parameter as list
##df: pandas dataframe to add variables

def EMW_log_perc_lagdif_xi_n(df,i_list=[1,2,3],n_list=[1,2,3],alpha_list=[0.25,0.5,0.75]):
    dfout=df
    for a in alpha_list:
        for n in n_list:        
            for i in i_list:
                dfout["x"+str(i)+"_emwpdf_a"+str(a)+"_l"+str(n)]=_
                np.log(dfout["x"+str(i)].pct_change(periods=n)+1).ewm(alpha=a).mean()
            
    return(dfout)

Date functions:

In [9]:
#Extracting:
##xday
def add_day(df):
    dfout=df    
    dfout["xday"]=dfout["time"].dt.day
    return(dfout)

##xhour
def add_hour(df):
    dfout=df    
    dfout["xhour"]=dfout["time"].dt.hour
    return(dfout)

##xminute
def add_minute(df):
    dfout=df    
    dfout["xminute"]=dfout["time"].dt.minute
    return(dfout)

##Month
def add_month(df):
    dfout=df    
    dfout["xmonth"]=dfout["time"].dt.month
    return(dfout)

#Turn into dummies
def date_dummies(df,varlist=["xday","xmonth","xhour","xminute"]):
    dfout=df
    for var in varlist:
        dfout=pd.concat([dfout,pd.get_dummies(dfout[var],prefix=var)],axis=1)
    return(dfout)

#Time difference from each row in minutes... minus 2 minutes
def time_diff(df):
    dfout=df
    dfout["xtimedif"]=dfout["time"].diff(periods=1)/np.timedelta64(1,'m')-2
    return(dfout)

#Dummy if there was a skip
def add_skip(df):
    dfout=df
    dfout["xskip"]=[1 if x > 0 else 0 for x in df['xtimedif']]
    return(dfout)


#SkipNumber
def add_skipnumber(df):
    dfout=df
    dfout["xskipid"]=dfout["xskip"].cumsum()+1
    return(dfout)

#Time since last skip
def add_sinceskip(df):
    dfout=df
    dfout["xsinceskip"]=1
    dfout["xsinceskip"]=dfout.groupby("xskipid")["xsinceskip"].cumsum()
    return(dfout)

Separate training and test data: 10% at the end for testing

In [97]:
train_b, test_b = train_test_split(data_b,test_size=0.1,shuffle=False)

train_b=train_b.copy()
test_b=test_b.copy()

first_test_date=test_b["time"].iloc[0]

#Data a should be split according to the date on data_b test set:
train_a = data_a.loc[data_a["time"] < first_test_date]
test_a = data_a.loc[data_a["time"] >= first_test_date]

train_a=train_a.copy()
test_a=test_a.copy()

In [98]:
train_b.tail()

Unnamed: 0,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,x8,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
16553,1999-05-26 09:20:00,0,0.012196,13.847258,14.340477,348.620646,125.267832,322.29719,-3.543744,-3.332518,...,3566.251414,3642.884959,12.191752,2043.060496,266.791575,607.02418,-3.64579,98.285967,-3.343567,0
16554,1999-05-26 09:22:00,0,-0.053825,14.080695,14.3105,349.257822,125.282389,322.007456,-3.543744,-3.323706,...,3566.251414,3642.889353,12.191752,2033.24116,267.300608,607.02418,-3.645968,97.557215,-3.343562,0
16555,1999-05-26 09:24:00,0,-0.004109,13.927493,14.28761,349.894999,125.296931,323.346201,-3.543744,-3.323706,...,3566.251414,3642.893748,12.191752,2046.334177,267.809611,607.149302,-3.645648,97.17068,-3.343556,0
16556,1999-05-26 09:26:00,0,-0.089955,13.926491,14.413989,348.582194,125.311488,321.469859,-3.533673,-3.323706,...,3566.251414,3642.897898,12.191752,2048.477488,268.318644,607.02418,-3.645079,97.73942,-3.343551,0
16557,1999-05-26 09:28:00,0,-0.067187,14.086188,14.28761,352.530497,125.326029,321.940775,-3.533673,-3.323706,...,3566.251414,3642.902293,12.191752,2042.669871,268.827647,607.523631,-3.644556,98.287257,-3.343546,0


In [12]:
test_b.head()

Unnamed: 0,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,x8,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
16558,1999-05-26 09:30:00,0,-0.031064,14.012333,14.305617,350.172129,125.340586,320.88325,-3.533673,-3.323706,...,3566.251414,3642.906687,12.191752,2053.524851,269.336649,607.634592,-3.644594,97.163676,-3.343541,0
16559,1999-05-26 09:32:00,0,-0.085997,14.098046,14.075781,349.319956,125.355128,321.09492,-3.533673,-3.323706,...,3566.251414,3642.911082,12.191752,2038.584177,269.582255,608.275462,-3.644631,97.297251,-3.343535,0
16560,1999-05-26 09:34:00,0,-0.094949,14.188851,14.384268,348.635142,125.369685,321.30662,-3.533673,-3.323706,...,3566.251414,3642.915476,12.191752,2047.170603,269.432688,606.383311,-3.644669,98.01639,-3.34353,0
16561,1999-05-26 09:36:00,0,-0.05504,14.141621,14.379366,347.950358,125.384226,322.58253,-3.533673,-3.323706,...,3566.251414,3642.919871,12.191752,2034.14033,269.283152,607.634592,-3.644707,97.774645,-3.343525,0
16562,1999-05-26 09:38:00,0,-0.0744,14.15638,14.107552,347.265543,125.398783,321.980265,-3.533673,-3.323706,...,3566.251414,3642.924021,12.191752,2035.51179,269.133585,607.02418,-3.644745,97.175883,-3.343519,0


In [99]:
train_b["y_lead2"].sum()

113

In [100]:
test_b["y_lead2"].sum()

11

In [101]:
train_b.shape

(16558, 63)

In [102]:
test_b.shape

(1840, 63)

Pipeline to add new variables:

In [103]:
def preprocessing_pipeline(df):
    dfout=df.copy()
    dfout=lagdif_xi_n(df=dfout,i_list=range(1,62),n_list=range(4))
    dfout=perc_lagdif_xi_n(df=dfout,i_list=range(1,62),n_list=range(4))
    dfout=abs_perc_lagdif_xi_n(df=dfout,i_list=range(1,62),n_list=range(4))
    dfout=EMW_perc_lagdif_xi_n(df=dfout,i_list=range(1,61),n_list=range(4),alpha_list=[0.1,0.5,0.95])
    dfout=lagdif2_xi_n(df=dfout,i_list=range(1,62),n_list=range(4))
    
    #dfout=log_perc_lagdif_xi_n(df=dfout,i_list=range(1,61),n_list=range(4))
    #dfout=EMW_log_perc_lagdif_xi_n(df=dfout,i_list=range(1,61),n_list=range(4),alpha_list=[0.1,0.5,0.95])

    dfout=add_minute(dfout)
    dfout=add_hour(dfout)
    dfout=add_day(dfout)
    dfout=add_month(dfout)

    dfout=time_diff(dfout)
    dfout=add_skip(dfout)
    dfout=add_skipnumber(dfout)
    dfout=add_sinceskip(dfout)

    return(dfout)

Pipeline for data_b:

In [104]:
train_b_preproc=None
test_b_preproc=None
train_b_preproc=preprocessing_pipeline(train_b)
test_b_preproc=preprocessing_pipeline(test_b)

In [105]:
train_b_preproc.tail()

Unnamed: 0,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,x8,...,x60_df2_l3,x61_df2_l3,xminute,xhour,xday,xmonth,xtimedif,xskip,xskipid,xsinceskip
16553,1999-05-26 09:20:00,0,0.012196,13.847258,14.340477,348.620646,125.267832,322.29719,-3.543744,-3.332518,...,0.0,0.0,20,9,26,5,0.0,0,99,294
16554,1999-05-26 09:22:00,0,-0.053825,14.080695,14.3105,349.257822,125.282389,322.007456,-3.543744,-3.323706,...,1e-06,0.0,22,9,26,5,0.0,0,99,295
16555,1999-05-26 09:24:00,0,-0.004109,13.927493,14.28761,349.894999,125.296931,323.346201,-3.543744,-3.323706,...,0.0,0.0,24,9,26,5,0.0,0,99,296
16556,1999-05-26 09:26:00,0,-0.089955,13.926491,14.413989,348.582194,125.311488,321.469859,-3.533673,-3.323706,...,0.0,0.0,26,9,26,5,0.0,0,99,297
16557,1999-05-26 09:28:00,0,-0.067187,14.086188,14.28761,352.530497,125.326029,321.940775,-3.533673,-3.323706,...,0.0,0.0,28,9,26,5,0.0,0,99,298


In [106]:
test_b_preproc.head()

Unnamed: 0,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,x8,...,x60_df2_l3,x61_df2_l3,xminute,xhour,xday,xmonth,xtimedif,xskip,xskipid,xsinceskip
16558,1999-05-26 09:30:00,0,-0.031064,14.012333,14.305617,350.172129,125.340586,320.88325,-3.533673,-3.323706,...,,,30,9,26,5,,0,1,1
16559,1999-05-26 09:32:00,0,-0.085997,14.098046,14.075781,349.319956,125.355128,321.09492,-3.533673,-3.323706,...,,,32,9,26,5,0.0,0,1,2
16560,1999-05-26 09:34:00,0,-0.094949,14.188851,14.384268,348.635142,125.369685,321.30662,-3.533673,-3.323706,...,,,34,9,26,5,0.0,0,1,3
16561,1999-05-26 09:36:00,0,-0.05504,14.141621,14.379366,347.950358,125.384226,322.58253,-3.533673,-3.323706,...,,,36,9,26,5,0.0,0,1,4
16562,1999-05-26 09:38:00,0,-0.0744,14.15638,14.107552,347.265543,125.398783,321.980265,-3.533673,-3.323706,...,,,38,9,26,5,0.0,0,1,5


Write to file

In [21]:
train_b_preproc.to_csv(preproc_dir+"train_b_preproc1.csv",index=False)
test_b_preproc.to_csv(preproc_dir+"test_b_preproc1.csv",index=False)

In [107]:
test_b_preproc["y_lead2"].sum()

11

In [108]:
train_b_preproc["y_lead2"].sum()

113

In [109]:
test_b_preproc.shape

(1840, 1767)

In [110]:
train_b_preproc.shape

(16558, 1767)

Pipeline for data a:

In [111]:
train_a_preproc=None
test_a_preproc=None
train_a_preproc=preprocessing_pipeline(train_a)
test_a_preproc=preprocessing_pipeline(test_a)

In [32]:
##Do this only before adding to train set b
#train_a_preproc=train_a_preproc.add_suffix("_aug")
#test_a_preproc=test_a_preproc.add_suffix("_aug")

In [112]:
train_a_preproc.tail()

Unnamed: 0,time,y,x1,x2,x3,x4,x5,x6,x7,x8,...,x60_df2_l3,x61_df2_l3,xminute,xhour,xday,xmonth,xtimedif,xskip,xskipid,xsinceskip
17577,1999-05-26 09:20:00,0,0.012196,13.847258,14.340477,348.620646,125.267832,322.29719,-3.543744,-3.332518,...,0.0,0.0,20,9,26,5,0.0,0,14,772
17578,1999-05-26 09:22:00,0,-0.053825,14.080695,14.3105,349.257822,125.282389,322.007456,-3.543744,-3.323706,...,1e-06,0.0,22,9,26,5,0.0,0,14,773
17579,1999-05-26 09:24:00,0,-0.004109,13.927493,14.28761,349.894999,125.296931,323.346201,-3.543744,-3.323706,...,0.0,0.0,24,9,26,5,0.0,0,14,774
17580,1999-05-26 09:26:00,0,-0.089955,13.926491,14.413989,348.582194,125.311488,321.469859,-3.533673,-3.323706,...,0.0,0.0,26,9,26,5,0.0,0,14,775
17581,1999-05-26 09:28:00,0,-0.067187,14.086188,14.28761,352.530497,125.326029,321.940775,-3.533673,-3.323706,...,0.0,0.0,28,9,26,5,0.0,0,14,776


In [113]:
test_a_preproc.head()

Unnamed: 0,time,y,x1,x2,x3,x4,x5,x6,x7,x8,...,x60_df2_l3,x61_df2_l3,xminute,xhour,xday,xmonth,xtimedif,xskip,xskipid,xsinceskip
17582,1999-05-26 09:30:00,0,-0.031064,14.012333,14.305617,350.172129,125.340586,320.88325,-3.533673,-3.323706,...,,,30,9,26,5,,0,1,1
17583,1999-05-26 09:32:00,0,-0.085997,14.098046,14.075781,349.319956,125.355128,321.09492,-3.533673,-3.323706,...,,,32,9,26,5,0.0,0,1,2
17584,1999-05-26 09:34:00,0,-0.094949,14.188851,14.384268,348.635142,125.369685,321.30662,-3.533673,-3.323706,...,,,34,9,26,5,0.0,0,1,3
17585,1999-05-26 09:36:00,0,-0.05504,14.141621,14.379366,347.950358,125.384226,322.58253,-3.533673,-3.323706,...,,,36,9,26,5,0.0,0,1,4
17586,1999-05-26 09:38:00,0,-0.0744,14.15638,14.107552,347.265543,125.398783,321.980265,-3.533673,-3.323706,...,,,38,9,26,5,0.0,0,1,5


Write to file

In [35]:
train_a_preproc.to_csv(preproc_dir+"train_a_preproc1.csv",index=False)
test_a_preproc.to_csv(preproc_dir+"test_a_preproc1.csv",index=False)

Scale first:

In [114]:
train_b.head()

Unnamed: 0,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,x8,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
0,1999-05-01 00:00:00,0,0.360183,8.916353,10.298991,332.475535,125.067515,255.941905,-3.373756,-3.413735,...,3578.032176,3645.031931,4.149806,1990.071848,283.75117,611.968211,-3.620528,97.959956,-3.334341,0
1,1999-05-01 00:02:00,0,0.459238,8.970286,10.376388,335.208506,125.057612,257.853709,-3.373756,-3.413735,...,3578.036326,3645.041453,4.149061,1982.248728,287.995799,611.177744,-3.620436,98.097239,-3.334334,0
2,1999-05-01 00:04:00,0,0.347366,8.831394,10.0416,333.105845,125.047709,258.775156,-3.363685,-3.413735,...,3578.04072,3645.050974,4.148316,1980.47822,292.240397,612.032298,-3.620343,98.847277,-3.334327,0
3,1999-05-01 00:06:00,0,0.285108,8.753854,10.371135,332.139414,125.038203,258.094187,-3.372413,-3.413735,...,3578.045115,3645.060252,4.147571,1978.707713,286.908183,612.480906,-3.620251,99.561617,-3.33432,0
4,1999-05-01 00:08:00,0,0.249096,8.76286,10.061597,334.245188,125.031031,259.105875,-3.373756,-3.413735,...,3578.049509,3645.069773,4.146826,1976.937205,287.142985,612.096384,-3.620195,98.417231,-3.334312,0


In [115]:
#Select X
selcols=[a for a in train_b if a.startswith("x")]
X_train_b=train_b[train_b.columns.intersection(selcols)].copy()
X_test_b=test_b[test_b.columns.intersection(selcols)].copy()

In [116]:
#X_train_b.fillna(0, inplace=True) 
#X_train_b.replace(to_replace=np.inf, value=0, inplace=True)

#X_test_b.fillna(0, inplace=True) 
#X_test_b.replace(to_replace=np.inf, value=0, inplace=True)

In [117]:
scaler= StandardScaler().fit(X_train_b)
X_train_b_sc=scaler.transform(X_train_b)
X_test_b_sc=scaler.transform(X_test_b)

In [147]:
unselcols=[a for a in train_b if not(a.startswith("x"))]
un_train_b=train_b[train_b.columns.intersection(unselcols)].copy()
un_test_b=test_b[test_b.columns.intersection(unselcols)].copy()
un_test_b.reset_index(inplace=True)

In [148]:
un_test_b.head()

Unnamed: 0,index,time,y_lead2
0,16558,1999-05-26 09:30:00,0
1,16559,1999-05-26 09:32:00,0
2,16560,1999-05-26 09:34:00,0
3,16561,1999-05-26 09:36:00,0
4,16562,1999-05-26 09:38:00,0


In [149]:
X_train_b_sc_df=pd.DataFrame(columns=selcols,data=X_train_b_sc)
X_test_b_sc_df=pd.DataFrame(columns=selcols,data=X_test_b_sc)

In [150]:
X_train_b_sc_df.tail()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
16553,-0.059516,0.079529,-0.11658,0.361886,0.119158,1.256387,-1.717993,0.356684,-0.368877,0.048978,...,-0.355738,-0.179498,0.98856,0.440238,-0.108989,-0.782438,0.216033,0.102923,-1.251968,-0.033894
16554,-0.156089,0.128671,-0.121695,0.366648,0.141211,1.248525,-1.717993,0.486298,-0.368877,0.042908,...,-0.355738,-0.179203,0.98856,0.298201,-0.102926,-0.782438,0.212549,0.000818,-1.250768,-0.033894
16555,-0.083366,0.09642,-0.1256,0.371411,0.163241,1.284853,-1.717993,0.486298,-0.368877,-0.045063,...,-0.355738,-0.178909,0.98856,0.487591,-0.096864,-0.730112,0.218813,-0.05334,-1.249328,-0.033894
16556,-0.208939,0.096209,-0.104038,0.361599,0.185293,1.233937,-1.622169,0.486298,-0.368877,-0.045063,...,-0.355738,-0.178631,0.98856,0.518594,-0.090802,-0.782438,0.229951,0.026346,-1.248128,-0.033894
16557,-0.175635,0.129827,-0.1256,0.391109,0.207322,1.246716,-1.622169,0.486298,-0.368877,-0.045063,...,-0.355738,-0.178336,0.98856,0.434587,-0.084739,-0.573566,0.240188,0.103104,-1.246928,-0.033894


In [151]:
X_test_b_sc_df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
0,-0.122795,0.11428,-0.122528,0.373482,0.229374,1.218019,-1.622169,0.486298,-0.368877,-0.045063,...,-0.355738,-0.178042,0.98856,0.591604,-0.078677,-0.527161,0.239445,-0.054321,-1.245728,-0.033894
1,-0.20315,0.132324,-0.16174,0.367113,0.251404,1.223763,-1.622169,0.486298,-0.368877,-0.045063,...,-0.355738,-0.177747,0.98856,0.375488,-0.075752,-0.259147,0.23872,-0.035606,-1.244288,-0.033894
2,-0.216244,0.151439,-0.109109,0.361994,0.273456,1.229508,-1.622169,0.486298,-0.368877,-0.139113,...,-0.355738,-0.177453,0.98856,0.49969,-0.077533,-1.050452,0.237977,0.065153,-1.243088,-0.033894
3,-0.157867,0.141497,-0.109945,0.356876,0.295485,1.26413,-1.622169,0.486298,-0.368877,-0.139113,...,-0.355738,-0.177158,0.98856,0.311208,-0.079314,-0.527161,0.237233,0.031282,-1.241888,-0.033894
4,-0.186186,0.144604,-0.156319,0.351758,0.317537,1.247787,-1.622169,0.486298,-0.236463,-0.139113,...,-0.355738,-0.17688,0.98856,0.331046,-0.081096,-0.782438,0.236489,-0.052611,-1.240448,-0.033894


In [152]:
train_b_sc=pd.concat([un_train_b,X_train_b_sc_df],axis=1)
test_b_sc=pd.concat([un_test_b,X_test_b_sc_df],axis=1)

In [153]:
train_b_sc.tail()

Unnamed: 0,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,x8,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
16553,1999-05-26 09:20:00,0,-0.059516,0.079529,-0.11658,0.361886,0.119158,1.256387,-1.717993,0.356684,...,-0.355738,-0.179498,0.98856,0.440238,-0.108989,-0.782438,0.216033,0.102923,-1.251968,-0.033894
16554,1999-05-26 09:22:00,0,-0.156089,0.128671,-0.121695,0.366648,0.141211,1.248525,-1.717993,0.486298,...,-0.355738,-0.179203,0.98856,0.298201,-0.102926,-0.782438,0.212549,0.000818,-1.250768,-0.033894
16555,1999-05-26 09:24:00,0,-0.083366,0.09642,-0.1256,0.371411,0.163241,1.284853,-1.717993,0.486298,...,-0.355738,-0.178909,0.98856,0.487591,-0.096864,-0.730112,0.218813,-0.05334,-1.249328,-0.033894
16556,1999-05-26 09:26:00,0,-0.208939,0.096209,-0.104038,0.361599,0.185293,1.233937,-1.622169,0.486298,...,-0.355738,-0.178631,0.98856,0.518594,-0.090802,-0.782438,0.229951,0.026346,-1.248128,-0.033894
16557,1999-05-26 09:28:00,0,-0.175635,0.129827,-0.1256,0.391109,0.207322,1.246716,-1.622169,0.486298,...,-0.355738,-0.178336,0.98856,0.434587,-0.084739,-0.573566,0.240188,0.103104,-1.246928,-0.033894


In [154]:
test_b_sc.head()

Unnamed: 0,index,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
0,16558,1999-05-26 09:30:00,0,-0.122795,0.11428,-0.122528,0.373482,0.229374,1.218019,-1.622169,...,-0.355738,-0.178042,0.98856,0.591604,-0.078677,-0.527161,0.239445,-0.054321,-1.245728,-0.033894
1,16559,1999-05-26 09:32:00,0,-0.20315,0.132324,-0.16174,0.367113,0.251404,1.223763,-1.622169,...,-0.355738,-0.177747,0.98856,0.375488,-0.075752,-0.259147,0.23872,-0.035606,-1.244288,-0.033894
2,16560,1999-05-26 09:34:00,0,-0.216244,0.151439,-0.109109,0.361994,0.273456,1.229508,-1.622169,...,-0.355738,-0.177453,0.98856,0.49969,-0.077533,-1.050452,0.237977,0.065153,-1.243088,-0.033894
3,16561,1999-05-26 09:36:00,0,-0.157867,0.141497,-0.109945,0.356876,0.295485,1.26413,-1.622169,...,-0.355738,-0.177158,0.98856,0.311208,-0.079314,-0.527161,0.237233,0.031282,-1.241888,-0.033894
4,16562,1999-05-26 09:38:00,0,-0.186186,0.144604,-0.156319,0.351758,0.317537,1.247787,-1.622169,...,-0.355738,-0.17688,0.98856,0.331046,-0.081096,-0.782438,0.236489,-0.052611,-1.240448,-0.033894


Preprocess second:

In [155]:
train_b_preproc_sc=None
test_b_preproc_sc=None
train_b_preproc_sc=preprocessing_pipeline(train_b_sc)
test_b_preproc_sc=preprocessing_pipeline(test_b_sc)

In [156]:
train_b_preproc_sc.tail()

Unnamed: 0,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,x8,...,x60_df2_l3,x61_df2_l3,xminute,xhour,xday,xmonth,xtimedif,xskip,xskipid,xsinceskip
16553,1999-05-26 09:20:00,0,-0.059516,0.079529,-0.11658,0.361886,0.119158,1.256387,-1.717993,0.356684,...,0.0,0.0,20,9,26,5,0.0,0,99,294
16554,1999-05-26 09:22:00,0,-0.156089,0.128671,-0.121695,0.366648,0.141211,1.248525,-1.717993,0.486298,...,0.00024,0.0,22,9,26,5,0.0,0,99,295
16555,1999-05-26 09:24:00,0,-0.083366,0.09642,-0.1256,0.371411,0.163241,1.284853,-1.717993,0.486298,...,0.0,0.0,24,9,26,5,0.0,0,99,296
16556,1999-05-26 09:26:00,0,-0.208939,0.096209,-0.104038,0.361599,0.185293,1.233937,-1.622169,0.486298,...,0.0,0.0,26,9,26,5,0.0,0,99,297
16557,1999-05-26 09:28:00,0,-0.175635,0.129827,-0.1256,0.391109,0.207322,1.246716,-1.622169,0.486298,...,0.0,0.0,28,9,26,5,0.0,0,99,298


In [157]:
test_b_preproc_sc.head()

Unnamed: 0,index,time,y_lead2,x1,x2,x3,x4,x5,x6,x7,...,x60_df2_l3,x61_df2_l3,xminute,xhour,xday,xmonth,xtimedif,xskip,xskipid,xsinceskip
0,16558,1999-05-26 09:30:00,0,-0.122795,0.11428,-0.122528,0.373482,0.229374,1.218019,-1.622169,...,,,30,9,26,5,,0,1,1
1,16559,1999-05-26 09:32:00,0,-0.20315,0.132324,-0.16174,0.367113,0.251404,1.223763,-1.622169,...,,,32,9,26,5,0.0,0,1,2
2,16560,1999-05-26 09:34:00,0,-0.216244,0.151439,-0.109109,0.361994,0.273456,1.229508,-1.622169,...,,,34,9,26,5,0.0,0,1,3
3,16561,1999-05-26 09:36:00,0,-0.157867,0.141497,-0.109945,0.356876,0.295485,1.26413,-1.622169,...,,,36,9,26,5,0.0,0,1,4
4,16562,1999-05-26 09:38:00,0,-0.186186,0.144604,-0.156319,0.351758,0.317537,1.247787,-1.622169,...,,,38,9,26,5,0.0,0,1,5


In [158]:
train_b_preproc_sc.to_csv(preproc_dir+"train_b_preproc_sc.csv",index=False)
test_b_preproc_sc.to_csv(preproc_dir+"test_b_preproc_sc.csv",index=False)