# Import thư viện và tải dữ liệu

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.model_selection import StratifiedKFold
from scipy.stats.stats import pearsonr
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, PolynomialFeatures, MinMaxScaler
from datetime import date
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
import os
import glob
import sys, getopt, re

In [7]:
!wget https://github.com/linh0303052/AplliedDSGroup11/raw/main/data.tar.gz

--2022-01-07 20:03:39--  https://github.com/linh0303052/AplliedDSGroup11/raw/main/data.tar.gz
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/linh0303052/AplliedDSGroup11/main/data.tar.gz [following]
--2022-01-07 20:03:39--  https://raw.githubusercontent.com/linh0303052/AplliedDSGroup11/main/data.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12633830 (12M) [application/octet-stream]
Saving to: ‘data.tar.gz’


2022-01-07 20:03:40 (8.57 MB/s) - ‘data.tar.gz’ saved [12633830/12633830]



In [8]:
!tar -xzvf data.tar.gz

data/
data/input/
data/input/5fold_20times.csv
data/input/sample_submission.csv
data/input/test.csv
data/input/train.csv
data/output/
data/output/features/
data/output/features/dmitry_pca_feats.csv
data/output/features/kmeans_feats.csv
data/output/features/tsne_feats.csv


In [None]:
# Load the training set from the unpacked archive and display its (rows, columns) shape.
train = pd.read_csv('./data/input/train.csv')
train.shape

In [None]:
# Load the test set from the unpacked archive and display its (rows, columns) shape.
test = pd.read_csv('./data/input/test.csv')
test.shape

In [None]:
# Base directories for raw inputs and generated artifacts. Paths elsewhere in
# this file are built by plain string concatenation (e.g. OUTPUT_PATH +
# 'features/...'), so the trailing slash is required.
INPUT_PATH = './data/input/'
OUTPUT_PATH = './data/output/'

# Tiền xử lý dữ liệu

In [9]:
# Handle special placeholder values by replacing them with the NA value (-999.0).
def process_base(train, test):
    """Recode placeholder values as -999.0 in both frames, in place.

    Three rules are applied to ``train`` and ``test`` alike:

    * ``var38`` values strictly between 117310.979 and 117310.98 become -999.0;
    * ``var3`` values equal to -999999 become -999.0;
    * zeros in four ``imp_*`` columns become -999.0.

    Returns:
        The same two frame objects, as a ``(train, test)`` tuple.
    """
    na_marker = -999.0
    frames = (train, test)

    for frame in frames:
        in_window = (frame['var38'] > 117310.979) & (frame['var38'] < 117310.98)
        frame.loc[in_window, 'var38'] = na_marker

    for frame in frames:
        frame.loc[frame['var3'] == -999999, 'var3'] = na_marker

    zero_is_na = (
        'imp_op_var40_comer_ult1',
        'imp_op_var40_efect_ult3',
        'imp_op_var41_comer_ult3',
        'imp_sal_var16_ult1',
    )
    for column in zero_is_na:
        for frame in frames:
            frame.loc[frame[column] == 0.0, column] = na_marker

    return train, test

In [10]:
def drop_sparse(train, test):
    """Remove features that are constant in ``train`` from both frames, in place.

    A feature (any column other than ``ID`` and ``TARGET``) is removed when
    ``np.unique`` finds fewer than two distinct values for it in ``train``.
    The test frame is never inspected; it just loses the same columns.

    Returns:
        The same two frame objects, as a ``(train, test)`` tuple.
    """
    feature_cols = [c for c in train.columns if c not in ('ID', 'TARGET')]
    constant_cols = [c for c in feature_cols if len(np.unique(train[c])) < 2]
    for name in constant_cols:
        train.drop(name, axis=1, inplace=True)
        test.drop(name, axis=1, inplace=True)
    return train, test

In [11]:
def drop_duplicated(train, test):
    """Remove redundant feature columns from both frames, in place.

    Three passes, each applied to ``train`` and then ``test``:

    1. drop every feature whose name contains ``'var6'``;
    2. compare every pair of remaining features on their ``train`` values and,
       for identical pairs, drop the member whose name contains ``'_0'``
       (the later column of the pair is checked first);
    3. drop a fixed list of eight further columns.

    ``ID`` and ``TARGET`` are never candidates.

    Returns:
        The same two frame objects, as a ``(train, test)`` tuple.
    """
    meta_cols = ('ID', 'TARGET')

    # Drop var6 because it duplicates var29 (per the original author's note).
    var6_cols = [c for c in train.columns if c not in meta_cols and 'var6' in c]
    train.drop(var6_cols, axis=1, inplace=True)
    test.drop(var6_cols, axis=1, inplace=True)

    # Drop attributes containing _0 when they duplicate another column.
    candidates = [c for c in train.columns if c not in meta_cols]
    redundant = []
    for pos, left in enumerate(candidates[:-1]):
        left_values = train[left].values
        for right in candidates[pos + 1:]:
            if not np.array_equal(left_values, train[right].values):
                continue
            if '_0' in right:
                redundant.append(right)
            elif '_0' in left:
                redundant.append(left)
    train.drop(redundant, axis=1, inplace=True)
    test.drop(redundant, axis=1, inplace=True)

    # Drop the other duplicated columns.
    known_duplicates = [
        'saldo_medio_var13_medio_ult1',
        'delta_imp_reemb_var13_1y3',
        'delta_imp_reemb_var17_1y3',
        'delta_imp_reemb_var33_1y3',
        'delta_imp_trasp_var17_in_1y3',
        'delta_imp_trasp_var17_out_1y3',
        'delta_imp_trasp_var33_in_1y3',
        'delta_imp_trasp_var33_out_1y3',
    ]
    train.drop(known_duplicates, axis=1, inplace=True)
    test.drop(known_duplicates, axis=1, inplace=True)

    return train, test

In [12]:
# Normalize the feature values.
def normalize_features(train, test):
    """Rescale every feature by its largest value, in place, in both frames.

    For each column other than ``ID`` and ``TARGET``:

    1. If the train maximum is the sentinel 9999999999.0, the sentinel is
       replaced by (largest non-sentinel train value + 1). The same
       replacement is applied to ``test``; previously only ``train`` was
       capped, so a sentinel left in ``test`` became the shared scale factor
       and squashed every real value towards zero.
    2. Scaling depends on how many distinct negative values ``train`` has:

       * exactly one: negatives in both frames are set to -1.0 and only the
         positive values are divided by the maximum;
       * none: only the positive values are divided by the maximum;
       * more than one: the whole column is divided by the maximum.

       The maximum is taken over train and test together, and nothing is
       scaled when it is not positive.

    Returns:
        The same two frame objects, as a ``(train, test)`` tuple.
    """
    sentinel = 9999999999.0
    flist = [x for x in train.columns if x not in ('ID', 'TARGET')]
    for f in flist:
        if train[f].max() == sentinel:
            cap = train.loc[train[f] < sentinel, f].max() + 1
            train.loc[train[f] == sentinel, f] = cap
            # Keep test consistent with train for the same sentinel rows.
            test.loc[test[f] == sentinel, f] = cap

        # Number of distinct negative values in train; this selects the
        # scaling rule and does not change once the rule has been applied.
        n_negative = train.loc[train[f] < 0, f].nunique()

        if n_negative == 1:
            # A single negative value is treated as a marker, not a magnitude.
            train.loc[train[f] < 0, f] = -1.0
            test.loc[test[f] < 0, f] = -1.0

        fmax = max(np.max(train[f]), np.max(test[f]))
        if not fmax > 0:
            continue

        if n_negative > 1:
            train[f] = 1.0 * train[f] / fmax
            test[f] = 1.0 * test[f] / fmax
        else:
            train.loc[train[f] > 0, f] = 1.0 * train.loc[train[f] > 0, f] / fmax
            test.loc[test[f] > 0, f] = 1.0 * test.loc[test[f] > 0, f] / fmax

    return train, test

# Feature engineering

In [None]:
# t-SNE features: a 2-D embedding of train and test together, saved per ID.
np.random.seed(12324)
# Pass copies: add_features inserts 'SumZeros' into the frames it receives, so
# handing it the shared train/test would make any later add_features call on
# them fail because the column already exists.
train_tsne, test_tsne = add_features(train.copy(), test.copy(), ['SumZeros'])

flist = [x for x in train_tsne.columns if x not in ('ID', 'TARGET')]

# Stack train on top of test so both are embedded in the same space.
X = pd.concat([train_tsne[flist], test_tsne[flist]], ignore_index=True).values.astype('float64')
# Reduce to 30 SVD components and standardise them before running t-SNE.
svd = TruncatedSVD(n_components=30)
X_svd = svd.fit_transform(X)
X_scaled = StandardScaler().fit_transform(X_svd)
feats_tsne = TSNE(n_components=2, random_state=0).fit_transform(X_scaled)
feats_tsne = pd.DataFrame(feats_tsne, columns=['tsne1', 'tsne2'])
# Rows are in the same train-then-test order as X, so IDs line up positionally.
feats_tsne['ID'] = pd.concat([train_tsne[['ID']], test_tsne[['ID']]], ignore_index=True)['ID'].values
train_tsne = pd.merge(train_tsne, feats_tsne, on='ID', how='left')
test_tsne = pd.merge(test_tsne, feats_tsne, on='ID', how='left')

feat = pd.concat([train_tsne[['ID', 'tsne1', 'tsne2']], test_tsne[['ID', 'tsne1', 'tsne2']]], ignore_index=True)
# Written under 'features/' because that is where add_features reads it from.
feat.to_csv(OUTPUT_PATH + 'features/tsne_feats.csv', index=False)

In [None]:
# PCA features: the first two principal components, fitted on train, saved per ID.
# Pass copies: add_features inserts 'SumZeros' in place and the inserts below
# would otherwise add the PCA columns to the shared train/test frames.
train_pca, test_pca = add_features(train.copy(), test.copy(), ['SumZeros'])

flist = [x for x in train_pca.columns if x not in ('ID', 'TARGET')]

pca = PCA(n_components=2)
# Each frame is normalised column-wise on its own; the projection is fitted on
# train only and then applied to test.
x_train_projected = pca.fit_transform(normalize(train_pca[flist], axis=0))
x_test_projected = pca.transform(normalize(test_pca[flist], axis=0))
train_pca.insert(1, 'PCAOne', x_train_projected[:, 0])
train_pca.insert(1, 'PCATwo', x_train_projected[:, 1])
test_pca.insert(1, 'PCAOne', x_test_projected[:, 0])
test_pca.insert(1, 'PCATwo', x_test_projected[:, 1])
pca_feats = pd.concat([train_pca[['ID', 'PCAOne', 'PCATwo']], test_pca[['ID', 'PCAOne', 'PCATwo']]], ignore_index=True)
# Written under 'features/' because that is where add_features reads it from,
# and without the index: add_features merges every column of this file, so a
# saved index would end up in the feature set as an unnamed column.
pca_feats.to_csv(OUTPUT_PATH + 'features/dmitry_pca_feats.csv', index=False)

In [None]:
# k-means features: cluster labels for k = 2..10, fitted on train, saved per ID.
# Pass copies: add_features and normalize_features both modify the frames they
# receive, and the shared train/test must stay untouched for other cells.
train_k, test_k = add_features(train.copy(), test.copy(), ['SumZeros'])
train_k, test_k = normalize_features(train_k, test_k)

# Fixed before the loop so the cluster labels added below never become inputs.
flist = [x for x in train_k.columns if x not in ('ID', 'TARGET')]

flist_kmeans = []
for ncl in range(2, 11):
    # NOTE(review): no random_state is set, so labels presumably vary between runs - confirm.
    cls = KMeans(n_clusters=ncl)
    cls.fit(train_k[flist].values)
    kmeans_col = 'kmeans_cluster' + str(ncl)
    train_k[kmeans_col] = cls.predict(train_k[flist].values)
    test_k[kmeans_col] = cls.predict(test_k[flist].values)
    flist_kmeans.append(kmeans_col)

# Read the labels from train_k/test_k, the frames they were written to, and
# save under 'features/' because that is where add_features reads the file.
pd.concat([train_k[['ID'] + flist_kmeans], test_k[['ID'] + flist_kmeans]], ignore_index=True).to_csv(OUTPUT_PATH + 'features/kmeans_feats.csv', index=False)

In [None]:
# Likelihood feature: LL = (30 * y_mean + y_G) / (30 + |G|), where y_mean is the
# overall TARGET mean, y_G the TARGET sum within group G and |G| the group size.
def _likelihood_table(frame, fname, prior_weight=30.0):
    """Return (per-value smoothed TARGET rate table, overall TARGET mean) for ``frame``."""
    global_avg = np.mean(frame['TARGET'].values)
    stats = frame.groupby(fname)['TARGET'].agg(['sum', 'size']).reset_index()
    stats[fname + '_likeli'] = (stats['sum'] + prior_weight * global_avg) / (stats['size'] + prior_weight)
    return stats[[fname, fname + '_likeli']], global_avg


def _apply_likelihood(frame, table, fname, global_avg):
    """Left-join ``table`` onto ``frame``; values of ``fname`` missing from it get ``global_avg``."""
    merged = pd.merge(frame, table, on=fname, how='left')
    merged[fname + '_likeli'] = merged[fname + '_likeli'].fillna(global_avg)
    return merged


def add_likelihood_feature(fname, train_likeli, test_likeli, flist):
    """Add a smoothed target-rate encoding of column ``fname``.

    The new column ``fname + '_likeli'`` holds, for each value of ``fname``,
    ``(sum(TARGET) + 30 * mean(TARGET)) / (count + 30)``.

    * For ``train_likeli`` the value is computed out of fold: rows are split
      into 5 stratified folds and each fold is encoded from the other four.
    * For ``test_likeli`` the encoding is computed from all of ``train_likeli``.
    * Values of ``fname`` not seen in the fitting rows get the fitting rows'
      overall TARGET mean.

    Args:
        fname: name of the column to encode.
        train_likeli: frame with ``ID``, ``TARGET`` and ``fname`` columns.
        test_likeli: frame with an ``fname`` column.
        flist: current list of feature names; not modified.

    Returns:
        ``(train_likeli, test_likeli, flist + [fname + '_likeli'])`` where both
        frames are new objects carrying the extra column.
    """
    likeli_col = fname + '_likeli'
    np.random.seed(1232345)

    # Split into folds to compute the train values; the test set is filled
    # from the whole train set afterwards.
    y = train_likeli['TARGET'].values
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21387)
    held_out_parts = []
    for fit_index, _ in skf.split(np.zeros(len(y)), y):
        fit_ids = train_likeli['ID'].values[fit_index]
        in_fit = train_likeli['ID'].isin(fit_ids)
        table, global_avg = _likelihood_table(train_likeli.loc[in_fit], fname)
        held_out = _apply_likelihood(train_likeli.loc[~in_fit], table, fname, global_avg)
        held_out_parts.append(held_out[['ID', likeli_col]])
    tt_likeli = pd.concat(held_out_parts, ignore_index=True)
    train_likeli = pd.merge(train_likeli, tt_likeli, on='ID', how='left')

    table, global_avg = _likelihood_table(train_likeli, fname)
    test_likeli = _apply_likelihood(test_likeli, table, fname, global_avg)
    return train_likeli, test_likeli, flist + [likeli_col]

In [1]:
def add_features(train, test, features):
    """Attach the requested engineered features to both frames.

    Args:
        train: training frame.
        test: test frame; must contain every feature column of ``train``.
        features: collection of feature names. Recognised names:

            * ``'SumZeros'`` - per-row count of zero-valued features, inserted
              in place as the second column. If the column already exists it
              is recomputed in place instead of raising, so the function can
              be called repeatedly on the same frames.
            * ``'tsne'``, ``'pca'``, ``'kmeans'`` - precomputed features read
              from ``OUTPUT_PATH + 'features/'`` and left-joined on ``ID``.

    Returns:
        ``(train, test)``. These are the input objects when only ``'SumZeros'``
        is requested, and new merged frames when any file-based feature is.
    """
    if 'SumZeros' in features:
        # Count over the raw features only: never over ID/TARGET, and never
        # over a SumZeros column left by a previous call.
        flist = [x for x in train.columns if x not in ('ID', 'TARGET', 'SumZeros')]
        for frame in (train, test):
            zero_counts = (frame[flist] == 0).astype(int).sum(axis=1)
            if 'SumZeros' in frame.columns:
                frame['SumZeros'] = zero_counts
            else:
                frame.insert(1, 'SumZeros', zero_counts)

    # NOTE(review): every column of each file is merged in, so a file saved
    # with its index would contribute an extra unnamed column - confirm.
    feature_files = (
        ('tsne', 'tsne_feats.csv'),
        ('pca', 'dmitry_pca_feats.csv'),
        ('kmeans', 'kmeans_feats.csv'),
    )
    for key, filename in feature_files:
        if key in features:
            extra = pd.read_csv(OUTPUT_PATH + 'features/' + filename)
            train = pd.merge(train, extra, on='ID', how='left')
            test = pd.merge(test, extra, on='ID', how='left')

    return train, test

In [2]:
def train_predict_adaboost_classifier(X_train, y_train, X_test):
    """Fit an AdaBoost classifier and score ``X_test``.

    The model uses 300 estimators, learning rate 0.1 and a fixed random seed.

    Returns:
        Column 1 of ``predict_proba(X_test)``, one value per test row.
    """
    model = AdaBoostClassifier(learning_rate=0.1, n_estimators=300, random_state=32934)
    model.fit(X_train, y_train)
    class_probabilities = model.predict_proba(X_test)
    return class_probabilities[:, 1]

In [3]:
def train_predict_xgboost_bugged(X_train, y_train, X_test):
    """Train five XGBoost models that differ only in seed and average them.

    Each model is a 500-round ``binary:logistic`` booster; -999.0 is passed to
    ``DMatrix`` as the missing-value marker for both train and test.

    NOTE(review): the name suggests a known defect; nothing in this function
    identifies it, so the behaviour is kept exactly as it was - confirm.

    Returns:
        The mean of the five prediction vectors for ``X_test``.
    """
    seeds = [123089, 21324, 324003, 450453, 120032]
    rounds = 500
    settings = {
        'objective': 'binary:logistic',
        'eta': 0.02,
        'max_depth': 5,
        'eval_metric': 'auc',
        'silent': 1,
        'nthread': 6,
        'gamma': 1.0,
        'min_child_weight': 5,
        'subsample': 0.8,
        'colsample_bytree': 1.0,
        'colsample_bylevel': 0.7,
    }

    # Float64 accumulator, one slot per test row.
    total = np.zeros(len(X_test))
    for current_seed in seeds:
        settings['seed'] = current_seed
        dtrain = xgb.DMatrix(X_train, label=y_train, missing=-999.0)
        dtest = xgb.DMatrix(X_test, missing=-999.0)
        booster = xgb.train(list(settings.items()), dtrain, rounds)
        total = total + booster.predict(dtest)
    return total / 5.0

In [4]:
def train_predict_ftrl(X_train, y_train, X_test):
    """Train and predict with the external ``ftrl.py`` script, run under pypy.

    The data is exchanged through CSV files in ``../data/output-ftrl/``: the
    train and test matrices are written out with 1-based ``ID`` columns, the
    script is invoked through the shell, and its ``PRED`` column is read back.
    Every file matching ``*.*`` in that directory is deleted afterwards, even
    when a step fails, so a later run can never pick up stale predictions.

    Returns:
        The ``PRED`` column of the script's output as an array.

    Raises:
        RuntimeError: if the ftrl command exits with a non-zero status.
    """
    train_file = '../data/output-ftrl/train_ftrl.csv'
    test_file = '../data/output-ftrl/test_ftrl.csv'
    pred_file = '../data/output-ftrl/pred_ftrl.csv'

    # Passed through the shell, each of these becomes an empty argument.
    non_factor_cols = "''"
    non_feature_cols = "''"
    text_cols = "''"

    try:
        train_csv = pd.DataFrame(X_train)
        train_csv['TARGET'] = y_train
        train_csv['ID'] = list(range(1, len(train_csv) + 1))
        train_csv.to_csv(train_file, index=False)

        test_csv = pd.DataFrame(X_test)
        test_csv['ID'] = list(range(1, len(test_csv) + 1))
        test_csv.to_csv(test_file, index=False)

        status = os.system('pypy ftrl.py' +
                           ' --alpha ' + str(0.06) +
                           ' --beta ' + str(1.0) +
                           ' --L1 ' + str(0.01) +
                           ' --L2 ' + str(1.0) +
                           ' --epoch ' + str(3) +
                           ' --train ' + train_file +
                           ' --test ' + test_file +
                           ' --submission ' + pred_file +
                           ' --non_feature_cols ' + non_feature_cols +
                           ' --non_factor_cols ' + non_factor_cols +
                           ' --text_cols ' + text_cols)
        if status != 0:
            raise RuntimeError('ftrl.py failed with exit status ' + str(status))

        y_pred = pd.read_csv(pred_file)['PRED'].values
    finally:
        for leftover in glob.glob("../data/output-ftrl/*.*"):
            os.remove(leftover)
    return y_pred

In [None]:
# Train every model with 20 repetitions of 5-fold cross-validation and save,
# per ID, its averaged rank-transformed prediction for train (out of fold)
# and test rows.
from sklearn.metrics import roc_auc_score

# NOTE(review): earlier cells unpack and read the data under './data/'; these
# '../data/' paths look inherited from a different directory layout - confirm
# before running.
INPUT_PATH = '../data/input/'
OUTPUT_PATH = '../data/output/'

MODELS_ALL = ['ftrl2', 'adaboost_classifier', 'xgboost']
FEATURES_ALL = [['SumZeros', 'likeli'],
                ['SumZeros', 'pca', 'likeli'],
                ['SumZeros', 'pca', 'likeli']]

# The final CSVs go into these sub-directories; create them up front.
for out_dir in (OUTPUT_PATH + 'train/', OUTPUT_PATH + 'test/'):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

train = pd.read_csv(INPUT_PATH + 'train.csv')
test = pd.read_csv(INPUT_PATH + 'test.csv')
preds_all = pd.concat([train[['ID']], test[['ID']]], ignore_index=True)
for imod, (MODEL, FEATURES) in enumerate(zip(MODELS_ALL, FEATURES_ALL)):
    print('Training ' + MODEL + '...')

    # Reload the raw data for every model: the preprocessing below modifies
    # the frames in place and the feature set differs per model.
    train = pd.read_csv(INPUT_PATH + 'train.csv')
    test = pd.read_csv(INPUT_PATH + 'test.csv')
    id_fold = pd.read_csv(INPUT_PATH + '5fold_20times.csv')
    # Fold assignments are matched to train rows by position.
    id_fold['ID'] = train['ID'].values

    train, test = process_base(train, test)
    train, test = drop_sparse(train, test)
    train, test = drop_duplicated(train, test)
    train, test = add_features(train, test, FEATURES)

    # Fixed before the FOLD column is merged in, so FOLD is never a feature.
    flist = [x for x in train.columns if x not in ('ID', 'TARGET')]

    fold_preds = []
    for it in range(1, 21):
        print('Processing iteration ' + str(it) + '...')
        it_id_fold = id_fold[['ID', 'set' + str(it)]].rename(columns={'set' + str(it): 'FOLD'})
        if 'FOLD' in train.columns:
            train.drop('FOLD', axis=1, inplace=True)
        train = pd.merge(train, it_id_fold, on='ID', how='left')
        aucs = []
        for fold in range(5):
            train_split = train.loc[train['FOLD'] != fold].copy().reset_index(drop=True)
            val_split = train.loc[train['FOLD'] == fold].copy().reset_index(drop=True)
            # Validation rows and the real test rows are scored together.
            test_split = pd.concat([val_split[['ID'] + flist], test[['ID'] + flist]], ignore_index=True)
            ids_val = val_split['ID'].values

            if 'likeli' in FEATURES:
                train_split, test_split, flist1 = add_likelihood_feature('saldo_var13', train_split, test_split, flist)
            else:
                flist1 = flist

            X_train = train_split[flist1].values
            y_train = train_split['TARGET'].values
            X_test = test_split[flist1].values

            if MODEL == 'xgboost':
                y_pred = train_predict_xgboost_bugged(X_train, y_train, X_test)
            elif MODEL == 'adaboost_classifier':
                y_pred = train_predict_adaboost_classifier(X_train, y_train, X_test)
            elif 'ftrl' in MODEL:
                y_pred = train_predict_ftrl(X_train, y_train, X_test)
            else:
                raise ValueError('Unknown model: ' + MODEL)

            preds = pd.DataFrame()
            preds['ID'] = test_split['ID'].values
            preds['FOLD'] = fold
            preds['ITER'] = it
            preds[MODEL] = y_pred
            fold_preds.append(preds)

            # Score the fold on its validation rows only.
            preds = preds.loc[preds['ID'].isin(ids_val)].copy()
            preds = pd.merge(preds, train[['ID', 'TARGET']], on='ID', how='left')

            fold_auc = roc_auc_score(preds['TARGET'], preds[MODEL])
            aucs.append(fold_auc)
        print('%s %s' % (np.mean(aucs), np.std(aucs)))

    preds_model = pd.concat(fold_preds, ignore_index=True)
    preds_model.loc[preds_model[MODEL] < 0, MODEL] = 0.0
    preds_model.loc[preds_model[MODEL] > 1, MODEL] = 1.0
    # Average per ID within each iteration (test rows are scored once per
    # fold), rank within the iteration, then average the ranks over iterations.
    preds_model = preds_model.groupby(['ID', 'ITER'])[MODEL].mean().reset_index()
    preds_model[MODEL] = preds_model.groupby('ITER')[MODEL].rank()
    preds_model = preds_model.groupby('ID')[MODEL].mean().reset_index()
    preds_model.columns = ['ID', 'dmitry_' + MODEL]
    preds_all = pd.merge(preds_all, preds_model, on='ID', how='left')
    # Checkpoint after every model so finished work survives a later failure.
    preds_all.to_csv('all_models_temp.csv', index=False)

preds_train = pd.merge(train[['ID']], preds_all, on='ID', how='left')
preds_train.to_csv(OUTPUT_PATH + 'train/' + 'dmitry_train.csv', index=False)
preds_test = pd.merge(test[['ID']], preds_all, on='ID', how='left')
preds_test.to_csv(OUTPUT_PATH + 'test/' + 'dmitry_test.csv', index=False)
print("Done training!")