In [1]:
import pandas as pd
import numpy as np
import os
from datetime import date

from sklearn.manifold import TSNE

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, PolynomialFeatures, MinMaxScaler



In [2]:
# Read the three competition files from the current working directory.
ori_train, ori_test, sample_submit = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Rename the label and identifier columns to lower-case names.
# pop() removes the upper-case column and the assignment appends the renamed
# one at the end, so the frames end with: <features>, target, t_id.
ori_train['target'] = ori_train.pop('TARGET')
ori_train['t_id'] = ori_train.pop('ID')
ori_test['t_id'] = ori_test.pop('ID')


In [7]:
def _add_value_count_features(train, test, columns):
    """Append one ``<col>_cnt`` column per entry of ``columns`` to both frames.

    The count attached to a row is the number of rows, in train and test
    combined, that hold the same value in ``col``. Returns the two extended
    frames as new objects; row order and index are left untouched.
    """
    columns = list(columns)
    if not columns:
        return train, test

    cnt_names = [col + '_cnt' for col in columns]
    train_cnt, test_cnt = {}, {}
    for col, cnt_name in zip(columns, cnt_names):
        counts = pd.concat([train[col], test[col]]).value_counts()
        # Series.map looks every value up in `counts`; this yields the same
        # numbers as a left merge on the value, without reordering any rows.
        train_cnt[cnt_name] = train[col].map(counts)
        test_cnt[cnt_name] = test[col].map(counts)

    # Attach all count columns with a single concat instead of one insert per column.
    train = pd.concat([train, pd.DataFrame(train_cnt, columns=cnt_names)], axis=1)
    test = pd.concat([test, pd.DataFrame(test_cnt, columns=cnt_names)], axis=1)
    return train, test


def _add_var38_features(frame, mode_value):
    """Add ``var38mc`` and ``logvar38`` to ``frame`` in place.

    ``var38mc`` is 1 where ``var38`` is numerically close to ``mode_value`` and
    0 elsewhere; ``logvar38`` is 0 on those rows and ``log(var38)`` on the rest.
    """
    var38 = frame['var38'].values
    is_mode = np.isclose(var38, mode_value)
    frame['var38mc'] = is_mode.astype(int)
    frame['logvar38'] = np.where(is_mode, 0.0, np.log(var38))


def main_feat(train, test, sample_submit=None):
    """Build the main feature set and write it to two CSV files.

    Steps, in order:
      1. drop the id column and set the target aside,
      2. add ``count0``: number of zero-valued cells per row,
      3. add a ``<col>_cnt`` value-frequency column for every int64 column,
      4. one-hot encode ``var3`` for values seen at least 5 times in train,
      5. add ``var38mc`` / ``logvar38`` derived from ``var38``,
      6. drop columns that are constant in train, then columns whose train
         values duplicate an earlier column,
      7. re-attach the target to train and write both frames.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``target``, ``t_id``, ``var3`` and ``var38``.
    test : pandas.DataFrame
        Must contain ``t_id``, ``var3`` and ``var38``.
    sample_submit : optional
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    None
        The result is written to ``ikki_features_train_ver1.csv`` and
        ``ikki_features_test_ver1.csv`` in the current working directory.

    Notes
    -----
    The input frames are not modified. Rows are matched by position, so the
    target ends up on the right rows whatever index ``train`` carries.
    """
    # Value of var38 that gets its own indicator column. Presumably the most
    # common var38 value (the flag is named `var38mc`) -- TODO confirm.
    var38_mode = 117310.979016

    # Keep the target as a plain array so it is re-attached by position.
    train_target = train['target'].values

    # Drop the id (and target) columns; work on fresh, 0..n-1 indexed frames.
    train = train.drop(['target', 't_id'], axis=1).reset_index(drop=True)
    test = test.drop(['t_id'], axis=1).reset_index(drop=True)
    n_train = len(train)

    # Count the zero-valued cells of every row.
    train['count0'] = (train == 0).sum(axis=1)
    test['count0'] = (test == 0).sum(axis=1)

    # Add a value-frequency column for every integer column.
    # np.int64 is spelled out because the width of the builtin `int` dtype has
    # differed between platforms.
    int_col = train.columns[(train.dtypes == np.int64).values]
    train, test = _add_value_count_features(train, test, int_col)

    # One-hot encode var3, keeping only values that occur >= 5 times in train;
    # rarer values become NaN and therefore get no dummy column.
    var3_cnt = train['var3'].value_counts()
    frequent_var3 = var3_cnt.index[(var3_cnt >= 5).values]
    var3_all = pd.concat([train['var3'], test['var3']])
    var3_all = var3_all.where(var3_all.isin(frequent_var3))
    # get_dummies returns bool columns on recent pandas; pin to uint8 so the
    # CSV holds 0/1 regardless of the installed version.
    var3_ohe = pd.get_dummies(var3_all, prefix='ohe_var3', prefix_sep='_').astype(np.uint8)
    train = pd.concat([train, var3_ohe.iloc[:n_train, :]], axis=1)
    test = pd.concat([test, var3_ohe.iloc[n_train:, :]], axis=1)

    # Add the var38 indicator and log features.
    _add_var38_features(train, var38_mode)
    _add_var38_features(test, var38_mode)

    # Remove columns that hold a single value in train.
    constant_col = [c for c in train.columns if train[c].nunique(dropna=False) == 1]
    train = train.drop(constant_col, axis=1)
    test = test.drop(constant_col, axis=1)
    assert list(train.columns) == list(test.columns)

    # Remove columns whose train values duplicate an earlier column. The mask
    # is positional, which is safe because both frames share the same columns.
    keep = ~train.T.duplicated().values
    train = train.loc[:, keep].copy()
    test = test.loc[:, keep].copy()
    assert list(train.columns) == list(test.columns)

    train['target'] = train_target

    train.to_csv('ikki_features_train_ver1.csv', index=False)
    test.to_csv('ikki_features_test_ver1.csv', index=False)



In [8]:
def one_hot_encoder(train, test):
    """One-hot encode a fixed list of columns and write the result to CSV.

    Train and test are stacked so both share the same set of dummy columns.
    Only columns named in ``ohe_col`` that are actually present are encoded,
    in the order they appear in the stacked frame. The output holds nothing
    but the dummy columns (named ``ohe_<column>_<value>``), stored as 0/1.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding (some of) the columns listed in ``ohe_col``.

    Returns
    -------
    None
        The result is written to ``ikki_one_hot_encoder_train_ver1.csv`` and
        ``ikki_one_hot_encoder_test_ver1.csv`` in the current working directory.
    """
    ohe_col = [
        'num_var13_corto', 'num_var13_corto_0', 'num_meses_var12_ult3',
        'num_meses_var13_corto_ult3', 'num_meses_var39_vig_ult3',
        'num_meses_var5_ult3', 'num_var24_0', 'num_var12', 'var36',
        'num_var5', 'num_var5_0', 'num_var12_0', 'num_var13', 'num_var13_0',
        'num_var42', 'num_var4', 'num_var42_0', 'num_var30', 'num_var39_0',
        'num_var41_0',
    ]
    wanted = set(ohe_col)

    # Fix the split point once, before any name is rebound.
    n_train = len(train)
    train_test = pd.concat([train, test], ignore_index=True)

    # Encode each requested column, then join all pieces with one concat
    # instead of growing a frame inside the loop.
    # get_dummies returns bool columns on recent pandas; pin to uint8 so the
    # CSV holds 0/1 regardless of the installed version.
    pieces = [
        pd.get_dummies(train_test[col], prefix='ohe_' + col, prefix_sep='_').astype(np.uint8)
        for col in train_test.columns
        if col in wanted
    ]
    # With nothing to encode, fall back to an empty frame (pd.concat rejects an empty list).
    ohe_data = pd.concat(pieces, axis=1) if pieces else pd.DataFrame()

    train_ohe = ohe_data.iloc[:n_train, :]
    test_ohe = ohe_data.iloc[n_train:, :]

    train_ohe.to_csv('ikki_one_hot_encoder_train_ver1.csv', index=False)
    test_ohe.to_csv('ikki_one_hot_encoder_test_ver1.csv', index=False)


In [9]:
# Build and write both feature files. Each builder is handed its own copy of
# the raw frames, so neither call sees changes made by the other.
main_feat(ori_train.copy(), ori_test.copy())
one_hot_encoder(ori_train.copy(), ori_test.copy())