In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import coo_matrix
import sklearn.preprocessing as pre
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split
from catboost import CatBoostClassifier,Pool

from datetime import datetime
import os 
import matplotlib.pyplot as plt 
import seaborn as sns
from IPython.core.display import HTML
import json
import pickle, time

from hyperopt import fmin, tpe, Trials, STATUS_OK, STATUS_FAIL, hp, pyll
#from hyperopt import pyll
import csv
#from sklearn.ensemble import GradientBoostingClassifier as GBC

%load_ext line_profiler

### Bayesian Optimization: feature selections

In [2]:
def load_data(klas=1):
    #### load data file
    data=pd.read_csv('orange_small_test.data',header=0,delimiter='\t')
    #### Load label files
    labels=dict({
    1 : pd.read_csv('orange_small_train_appetency.labels',header=None,delimiter='\t'),
    2 : pd.read_csv('orange_small_train_churn.labels',header=None,delimiter='\t'),
    3 : pd.read_csv('orange_small_train_upselling.labels',header=None,delimiter='\t')
    })
           
    lbl=labels[klas]
    idx=np.where(lbl!=klas)
    lbl.at[idx[0],0]=0
    
    return data, lbl

In [None]:
def tokenize_column(col,col_name,isTrain=True):
    new_col_name=[]
    _row_pos=[]
    _col_pos=[]
    _val=[]
    irow=0
    if isTrain:
        _one_hot_col_names=[]
        for d in col.tolist():
            if d not in _one_hot_col_names:
                #print(col_name,d)
                new_col_name.append(col_name+'_' + str(d))
                _one_hot_col_names.append(d)
            _val.append(1)
            _row_pos.append(irow)
            _col_pos.append(_one_hot_col_names.index(d))
            irow=irow+1
        
        new_data=coo_matrix((_val, (_row_pos, [ i+1 for i in _col_pos])), shape=(len(_row_pos), len(_one_hot_col_names)+1)).toarray() 
        col=_one_hot_col_names
    else:
        _one_hot_col_names=col_name
        for d in col.tolist():
            if d not in _one_hot_col_names:
                _col_pos.append(0)
            else:
                col_idx=_one_hot_col_names.index(d)
                _col_pos.append(col_idx)
            _row_pos.append(irow)
            _val.append(1)
            irow=irow+1
        new_data=coo_matrix((_val, (_row_pos, _col_pos)), shape=(len(_row_pos), len(_one_hot_col_names))).toarray()
        new_col_name=None
        col=None
    return new_data, new_col_name, col

def target_based_statistics(x,y,tbs_dict=None,klas=1):
    _idx=x.index
    idx=y[(y==klas)].index
    #print(idx)
    x_w_y_eq_1=np.array(x.loc[idx])
    cols=x.columns.tolist()
    x=np.array(x)
    x_matrix=np.zeros(x.shape)
    #x_matrix=1e-6
    tbs=dict()
    p=np.sum(y==klas)/y.shape[0]
    if tbs_dict is None:
        for i in range(len(cols)):
            col_tbs=dict()
            for val in np.unique(x[:,i]):
                idx=np.where(x[:,i]==val)[0]
                idx2=(np.where(x_w_y_eq_1[:,i]==val))[0]
                vtar=(idx2.shape[0]+p)/idx.shape[0]
                if vtar >= 1:
                    #print(cols[i],val,vtar,idx2.shape[0],idx.shape[0])
                    vtar=1.0
                x_matrix[idx,i]=vtar
                col_tbs[val]=vtar
            tbs[cols[i]]=col_tbs
    else:
        tbs=tbs_dict
        for col in cols:
            try:
                cats=set(tbs[col])
            except KeyError:
                next
            for cat in tbs[col]:
                try:
                    i=cols.index(col)
                    idx=np.where(x[:,i]==cat)[0]
                    #print(col,cat,tbs[col][cat],idx)
                    if len(idx) > 0:
                        #print(idx,i)
                        x_matrix[idx,int(i)]=tbs[col][cat]
                    else:
                        print('column ', col , ' does not have values of ', cat)
                except KeyError:
                    next
    x_matrix[x_matrix==0]=1e-6
    return tbs, pd.DataFrame(x_matrix,columns=cols,index=_idx)

class OneHotEncoder:
    
    cols = None
    verbose = 0
    new_cols = []
    new_cols_flat=None
    new_data_added=None
    new_data = None
    new_features = None
    data_old = None
    iat = None
    
    def __init__(self, cols=None, verbose=0):
        self.cols=cols
        self.verbose=verbose
        print("init")
        
    #def _train(self, data, isTrain=True):
    def fit(self, data):
        _idx=data.index
        if type(data)!=pd.DataFrame:
            print("exit data type must be: ",pd.DataFrame)
            return None
        
        if self.cols == None:
            self.cols = data.columns.tolist()
            cols = self.cols
        
        cols=self.cols
        new_col_name=dict()
        new_col_name_flat=[]
        new_features=[]
        i=0
        for col in cols:
            _new_dat,_new_col_name,_new_features=tokenize_column(data[col],col)
            if self.verbose:
                print(col,_new_dat.shape)
            if i == 0:
                new_dat=np.array(_new_dat)
                i=1
            else:
                new_dat=np.concatenate([new_dat,_new_dat],axis=1)
            new_col_name[col]=['unk']+_new_features
            new_col_name_flat=new_col_name_flat+[col + '_unk']+_new_col_name
            new_features = new_features + _new_features
        
        if self.verbose:
            print('new columns len',len(new_col_name_flat),'new data shape:',new_dat.shape)
        
        self.data_old=data
        self.new_col=new_col_name
        self.new_cols_flat=new_col_name_flat
        self.new_feature=new_features
        new_data=pd.DataFrame(new_dat,columns=new_col_name_flat,dtype=np.int8,index=_idx)
        #new_data=pd.DataFrame(new_dat,columns=new_col_name_flat)
        self.new_data_added=new_data
        self.iat=data.iat
        return None
    
    def transform(self,data):
        #print(data.iat,self.iat)
        _idx=data.index
        if data.iat == self.iat:
            new_data=self.new_data_added
        else:
            cols=self.cols
            _new_binarized_features=None
            icol=0
            for col in cols:
                #print(self.new_col[col])
                _new_data,_,_=tokenize_column(data[col],self.new_col[col],isTrain=False)
                #print(_new_data)
                if icol == 0:
                    new_data=np.array(_new_data,dtype=np.int8)
                    #new_data=np.array(_new_data)
                    icol = 1
                else:
                    new_data=np.concatenate([new_data,_new_data],axis=1)
            #print(new_data)
            new_data=pd.DataFrame(new_data,columns=self.new_cols_flat,dtype=np.int8,index=_idx)
            #new_data=pd.DataFrame(new_data,columns=self.new_cols_flat)
            #print(_idx)
        return new_data

In [3]:
def target_based_statistics(x,y,tbs_dict=None,klas=1):
    _idx=x.index
    idx=np.where(y==klas)[0]
    x_w_y_eq_1=np.array(x.loc[idx])
    cols=x.columns.tolist()
    x=np.array(x)
    tbs=dict()
    p=np.sum(y==klas)[0]/y.shape[0]
    if tbs_dict is None:
        #cols=cols
        x_matrix=np.zeros(x.shape)
        for i in range(len(cols)):
            col_tbs=dict()
            for val in np.unique(x[:,i]):
                idx=np.where(x[:,i]==val)[0]
                #print(idx,val)
                idx2=np.where(x_w_y_eq_1[:,i]==val)[0]
                #print(idx2.shape[0],val,p)
                d=idx.shape[0]
                d=1e-6 if d == 0 else d
                vtar=(idx2.shape[0]+p)/d
                vtar=1 if vtar >=1 else vtar
                x_matrix[idx,i]=vtar
                col_tbs[val]=vtar
            tbs[cols[i]]=col_tbs
    else:
        tbs=tbs_dict
        cols=list(tbs.keys())
        x_matrix=np.zeros((x.shape[0],len(tbs)))      
        for col in cols:
            try:
                cats=set(tbs[col])
            except KeyError:
                next
            for cat in tbs[col]:
                try:
                    i=cols.index(col)
                    idx=np.where(x[:,i]==cat)[0]
                    #print(col,cat,tbs[col][cat],idx)
                    if len(idx) > 0:
                        #print(idx,i)
                        x_matrix[idx,int(i)]=tbs[col][cat]
                    else:
                        #print('column ', col , ' does not have values of ', cat)
                        next
                except KeyError:
                    next
    x_matrix[x_matrix==0]=1e-6
    return tbs, pd.DataFrame(x_matrix,columns=cols,index=_idx)

In [4]:
def data_preparation(x,y):
    
    #drop empty cols
    drop_cols=[]
    for col, isEmpty in (x.describe().T['count']==0).items():
        if isEmpty:
            drop_cols.append(col)
            #print(col)
    #x=x.drop(drop_cols,axis=1)
    col_num=[]
    col_cat=[]
    for col, typ in x.dtypes.items():
        if typ == np.object:
            col_cat.append(col)
        else:
            col_num.append(col)
    for col in col_cat:
        x[col].fillna('?',inplace=True)       
    
    col_app=[]
    
    for col in col_num:
        x[col +"_imputed"]=pd.isnull(x[col]).astype('float')
        col_app.append(col +"_imputed")
        x[col].fillna(0,inplace=True)
    
    #col_num=col_num+col_app
    col_cat=col_cat+col_app
    
    xtrain,xtest,ytrain,ytest=train_test_split(x, y, test_size=0.1, random_state=42, shuffle=True, stratify=y)
    print('train data: ',xtrain.shape[0])
    print('train data class: ',(ytrain==1)[0].sum())
    print('test data: ', xtest.shape[0])
    print('test data class: ', (ytest==1)[0].sum())
    print('test/train class ratios:',(ytest==1)[0].sum()/(ytrain==1)[0].sum())
    
    xtrain_cat=xtrain.loc[:,col_cat]
    xtrain_num=xtrain.loc[:,col_num]
    xtest_cat=xtest.loc[:,col_cat]
    xtest_num=xtest.loc[:,col_num]
    
    print('norm')
    norm = pre.Normalizer()
    xtrain_norm = norm.fit_transform(xtrain_num)
    xtest_norm = norm.transform(xtest_num)
    xtrain_norm=pd.DataFrame(xtrain_norm,columns=col_num, index=xtrain_num.index)
    xtest_norm=pd.DataFrame(xtest_norm,columns=col_num, index=xtest_num.index)
    
    print('tbs')
    # replace categories with target based statistic 
    this_tbs, xtrain_cat_tbs=target_based_statistics(xtrain_cat, ytrain)
    _,xtest_cat_tbs=target_based_statistics(xtest_cat, ytest, tbs_dict=this_tbs)
    
    #xtrain_cat_tbs=[]
    #xtest_cat_tbs=[]
    
    data=dict({
        'category_columns':col_cat,
        'numeric_columns':col_num,
        'train_cat':xtrain_cat,
        'test_cat':xtest_cat,
        'train_tbs':xtrain_cat_tbs,
        'test_tbs':xtest_cat_tbs,
        'train_num':xtrain_num,
        'test_num':xtest_num,
        'train_norm':xtrain_norm,
        'test_norm':xtest_norm,
        'y_train':ytrain,
        'y_test':ytest
    }
    )
    return data

In [7]:
%%time
x,y=load_data()
data=data_preparation(x,y)

train data:  45000
train data class:  801
test data:  5000
test data class:  89
test/train class ratios: 0.1111111111111111
norm
tbs


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  after removing the cwd from sys.path.


Wall time: 1min 22s


In [8]:
with open('data.pickle','wb') as f:
    pickle.dump(data,f)

In [2]:
with open('data.pickle','rb') as f:
    data=pickle.load(f)

In [3]:
x_train=pd.concat([data['train_num'],data['train_tbs']],axis=1)
x_test=pd.concat([data['test_num'],data['test_tbs']],axis=1)
y_train=data['y_train']
y_test=data['y_test']
column_names=x_train.columns.tolist()

In [5]:
_=[ print(col.replace('_','')) for col in column_names]

Var1
Var2
Var3
Var4
Var5
Var6
Var7
Var8
Var9
Var10
Var11
Var12
Var13
Var14
Var15
Var16
Var17
Var18
Var19
Var20
Var21
Var22
Var23
Var24
Var25
Var26
Var27
Var28
Var29
Var30
Var31
Var32
Var33
Var34
Var35
Var36
Var37
Var38
Var39
Var40
Var41
Var42
Var43
Var44
Var45
Var46
Var47
Var48
Var49
Var50
Var51
Var52
Var53
Var54
Var55
Var56
Var57
Var58
Var59
Var60
Var61
Var62
Var63
Var64
Var65
Var66
Var67
Var68
Var69
Var70
Var71
Var72
Var73
Var74
Var75
Var76
Var77
Var78
Var79
Var80
Var81
Var82
Var83
Var84
Var85
Var86
Var87
Var88
Var89
Var90
Var91
Var92
Var93
Var94
Var95
Var96
Var97
Var98
Var99
Var100
Var101
Var102
Var103
Var104
Var105
Var106
Var107
Var108
Var109
Var110
Var111
Var112
Var113
Var114
Var115
Var116
Var117
Var118
Var119
Var120
Var121
Var122
Var123
Var124
Var125
Var126
Var127
Var128
Var129
Var130
Var131
Var132
Var133
Var134
Var135
Var136
Var137
Var138
Var139
Var140
Var141
Var142
Var143
Var144
Var145
Var146
Var147
Var148
Var149
Var150
Var151
Var152
Var153
Var154
Var155
Var156
Var157
Var158
Va