In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

In [13]:
# Load the training split from CSV (path is relative to the notebook's directory)
train = pd.read_csv("../data/train.csv")

In [3]:
# Load the test split from CSV (path is relative to the notebook's directory)
test = pd.read_csv("../data/test.csv")

In [78]:
# Preview the raw training frame (the notebook renders a truncated head/tail view)
train

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,...,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,6881,113261,38038,513.80,5,0,N,0,N,N,...,0,33,172652.0,457,59333,N,0,102,0,516056
1,6881,15408,188328,513.80,5,0,N,0,N,N,...,0,6,152458.0,457,59333,N,0,102,0,483434
2,6716,157159,29967,1016.11,5,62,N,5,N,N,...,0,5,172946.0,247,50436,N,3281,102,0,1407164
3,5975,105985,81305,713.66,5,62,N,4,N,N,...,0,6,182129.0,263,93775,N,5817,102,0,1051004
4,6411,94435,49219,1806.49,3,62,N,4,N,N,...,0,6,172624.0,339,0,N,5865,102,0,1622153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141335,6032,2510,54094,1.38,5,75,Y,2,,,...,0,30,231435.0,248,78297,N,5817,102,0,196080
1141336,1801,12832,137381,1138.04,5,75,Y,8,,,...,0,1,100009.0,201,198,N,1852,98,0,642025
1141337,6322,91008,15189,578.38,5,75,Y,8,,,...,0,4,191642.0,209,38222,N,5817,102,0,1478280
1141338,3226,145107,116252,435.32,5,75,Y,8,,,...,0,13,102338.0,192,90135,N,1458,102,0,661087


In [79]:
# Preview the raw test frame (the notebook renders a truncated head/tail view)
test

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,...,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,0,134508,45725,465.62,5,0,N,2,N,N,...,0,9,105114.0,451,0,N,5817,102,0,4376
1,0,78377,2295,465.62,5,0,N,2,N,N,...,0,6,104918.0,451,0,N,5817,102,0,2943
2,0,151054,197751,465.62,5,0,N,2,N,N,...,0,7,104917.0,451,0,N,5817,102,0,3387
3,6716,150887,4541,930.31,5,62,N,4,N,N,...,0,8,142028.0,329,37560,N,5820,102,0,1765957
4,6882,110356,133430,0.00,5,0,N,0,N,N,...,0,9,143657.0,459,2461,N,0,102,0,611269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380442,5588,95249,112041,1.38,5,75,Y,8,,,...,0,17,194234.0,296,73998,N,3426,42,0,668463
380443,6215,85771,169553,609.21,5,75,Y,2,,,...,0,14,133907.0,289,54828,N,3460,46,0,1846986
380444,6231,139728,168027,574.36,5,75,N,5,,,...,0,28,155437.0,245,18405,N,5817,102,0,992751
380445,6032,45406,197460,1.38,5,75,Y,2,,,...,0,24,215218.0,373,79246,N,5817,102,0,338215


In [14]:
# List every column name so the feature groups below can be chosen from it
train.columns

Index(['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'ecfg', 'etymd',
       'flbmk', 'flg_3dsmk', 'fraud_ind', 'hcefg', 'insfg', 'iterm', 'locdt',
       'loctm', 'mcc', 'mchno', 'ovrlt', 'scity', 'stocn', 'stscd', 'txkey'],
      dtype='object')

# Define columns and fill missing values

In [77]:
# Columns that are scaled as numeric features.
# NOTE(review): several of these look like identifiers or codes (e.g. bacno,
# cano, mchno, mcc) rather than true quantities - confirm scaling is intended.
col_cont = [
    'acqic', 'bacno', 'cano', 'conam', 'iterm',
    'locdt', 'loctm', 'mcc', 'mchno', 'scity',
]

# Columns that are label-encoded and then one-hot encoded.
col_disc = [
    'contp', 'csmcu', 'ecfg', 'etymd', 'flbmk', 'flg_3dsmk',
    'hcefg', 'insfg', 'ovrlt', 'stocn', 'stscd',
]

# Target column.
col_label = ['fraud_ind']

In [9]:
# Replace every missing cell with the literal string "NA" so it becomes its own category.
# Per the original author's note, only flbmk and flg_3dsmk contain missing values.
train = train.fillna("NA") # applies to all columns of the training frame
test = test.fillna("NA") # same placeholder for the test frame

# Preprocess numeric data

In [80]:
def preprocessing_cont(train_df, test_df):
    """Scale the numeric columns (``col_cont``) of both splits.

    A ``StandardScaler`` followed by a ``MinMaxScaler`` is fitted on the
    training split only and then applied unchanged to the test split, so
    test values are not guaranteed to stay inside the training range.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that contain every column listed in ``col_cont``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scaled train and test frames with columns ``col_cont``. Each result
        keeps the index of its input frame so that a later column-wise
        ``pd.concat`` with columns taken from the original frame lines up
        row by row even when the index is not the default ``RangeIndex``.
    """
    ss = StandardScaler()
    mms = MinMaxScaler()

    # Fit on train only; the scalers return new arrays, so no defensive copy is needed.
    train_scaled = mms.fit_transform(ss.fit_transform(train_df[col_cont]))
    # Reuse the statistics learned from train for the test split.
    test_scaled = mms.transform(ss.transform(test_df[col_cont]))

    train_cont_df = pd.DataFrame(train_scaled, columns=col_cont, index=train_df.index)
    test_cont_df = pd.DataFrame(test_scaled, columns=col_cont, index=test_df.index)

    return train_cont_df, test_cont_df

train_cont_df, test_cont_df = preprocessing_cont(train, test)

# Preprocess categorical data

In [81]:
# Fit one LabelEncoder per categorical column on the union of the train and
# test values, so every value seen in either split receives an integer code.
les = {}
for c in col_disc:
    le = LabelEncoder()
    # axis is passed by keyword: positional use is deprecated in pandas 1.3+ and rejected in 2.0
    le.fit(pd.concat((train[c], test[c]), axis=0))
    les[c] = le

def init_ohe(train_df, test_df):
    """Fit a dense ``OneHotEncoder`` on the label-encoded categorical columns.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that contain every column listed in ``col_disc``. Both are
        stacked so the encoder learns the categories of either split.

    Returns
    -------
    OneHotEncoder
        Encoder fitted on the integer codes produced by the module-level
        ``les`` label encoders, one input column per entry of ``col_disc``.
    """
    # Use the frames that were passed in (the earlier version read the globals).
    all_disc = pd.concat((train_df[col_disc], test_df[col_disc]), axis=0)

    # Build a fresh frame of integer codes; column order follows col_disc.
    encoded = pd.DataFrame({c: les[c].transform(all_disc[c]) for c in col_disc})

    ohe = OneHotEncoder(sparse=False)
    ohe.fit(encoded)

    return ohe

ohe = init_ohe(train, test)

In [82]:
# Show the fitted label encoders, keyed by categorical column name
les

{'contp': LabelEncoder(),
 'csmcu': LabelEncoder(),
 'ecfg': LabelEncoder(),
 'etymd': LabelEncoder(),
 'flbmk': LabelEncoder(),
 'flg_3dsmk': LabelEncoder(),
 'hcefg': LabelEncoder(),
 'insfg': LabelEncoder(),
 'ovrlt': LabelEncoder(),
 'stocn': LabelEncoder(),
 'stscd': LabelEncoder()}

In [83]:
# Inspect the integer codes the one-hot encoder learned for each categorical column
ohe.categories_

[array([0, 1, 2, 3, 4, 5, 6]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71]),
 array([0, 1]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([0, 1, 2, 3]),
 array([0, 1, 2, 3]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([0, 1]),
 array([0, 1]),
 array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79, 

In [84]:
def preprocessing_disc(train_df, test_df):
    """One-hot encode the categorical columns (``col_disc``) of both splits.

    Each column is first mapped to integer codes with its fitted encoder from
    the module-level ``les`` dict, then expanded by the module-level ``ohe``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that contain every column listed in ``col_disc``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        One-hot encoded train and test frames whose columns are named by
        ``ohe.get_feature_names(col_disc)``. Each result keeps the index of
        its input frame so a later column-wise ``pd.concat`` stays row-aligned.
    """
    # Same names for both splits, so compute them once.
    feature_names = ohe.get_feature_names(col_disc)

    def _encode(df):
        # Label-encode a copy so the caller's frame is left untouched.
        codes = df[col_disc].copy()
        for c in col_disc:
            codes[c] = les[c].transform(codes[c])
        return pd.DataFrame(ohe.transform(codes), columns=feature_names, index=df.index)

    return _encode(train_df), _encode(test_df)


train_disc_df, test_disc_df = preprocessing_disc(train, test)

In [85]:
# Preview the one-hot encoded training features
train_disc_df

Unnamed: 0,contp_0,contp_1,contp_2,contp_3,contp_4,contp_5,contp_6,csmcu_0,csmcu_1,csmcu_2,...,stocn_98,stocn_99,stocn_100,stocn_101,stocn_102,stscd_0,stscd_1,stscd_2,stscd_3,stscd_4
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141335,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1141336,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1141337,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1141338,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [86]:
# Preview the one-hot encoded test features
test_disc_df

Unnamed: 0,contp_0,contp_1,contp_2,contp_3,contp_4,contp_5,contp_6,csmcu_0,csmcu_1,csmcu_2,...,stocn_98,stocn_99,stocn_100,stocn_101,stocn_102,stscd_0,stscd_1,stscd_2,stscd_3,stscd_4
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380442,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
380443,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
380444,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
380445,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Merge and save

In [87]:
# Join scaled numeric features, one-hot features and the label column side by side.
# axis is passed by keyword: positional use is deprecated in pandas 1.3+ and rejected in 2.0.
prep_train_df = pd.concat((train_cont_df, train_disc_df, train[col_label]), axis=1)
# NOTE(review): this requires the test frame to carry the label column too - confirm.
prep_test_df = pd.concat((test_cont_df, test_disc_df, test[col_label]), axis=1)

In [88]:
# Persist the preprocessed training frame; the row index is not written
prep_train_df.to_csv("../data/type2_train.csv", index=False)

In [89]:
# Persist the preprocessed test frame; the row index is not written
prep_test_df.to_csv("../data/type2_test.csv", index=False)