# Import thư viện và tải dữ liệu

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.model_selection import StratifiedKFold
from scipy.stats.stats import pearsonr
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, PolynomialFeatures, MinMaxScaler
from datetime import date
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
import os
import glob
import sys, getopt, re

In [7]:
# Download the project's data archive (data.tar.gz) from GitHub into the working directory.
!wget https://github.com/linh0303052/AplliedDSGroup11/raw/main/data.tar.gz

--2022-01-07 20:03:39--  https://github.com/linh0303052/AplliedDSGroup11/raw/main/data.tar.gz
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/linh0303052/AplliedDSGroup11/main/data.tar.gz [following]
--2022-01-07 20:03:39--  https://raw.githubusercontent.com/linh0303052/AplliedDSGroup11/main/data.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12633830 (12M) [application/octet-stream]
Saving to: ‘data.tar.gz’


2022-01-07 20:03:40 (8.57 MB/s) - ‘data.tar.gz’ saved [12633830/12633830]



In [8]:
# Unpack the gzip-compressed archive, listing every extracted path (-v).
!tar -xzvf data.tar.gz

data/
data/input/
data/input/5fold_20times.csv
data/input/sample_submission.csv
data/input/test.csv
data/input/train.csv
data/output/
data/output/features/
data/output/features/dmitry_pca_feats.csv
data/output/features/kmeans_feats.csv
data/output/features/tsne_feats.csv


In [None]:
# Load the training set; the bare `train.shape` expression is the cell's displayed output.
train = pd.read_csv('./data/input/train.csv')
train.shape

In [None]:
# Load the test set; the bare `test.shape` expression is the cell's displayed output.
test = pd.read_csv('./data/input/test.csv')
test.shape

In [None]:
# Directory prefixes (trailing slash included). OUTPUT_PATH is prepended to the
# feature CSV file names written by the feature-engineering cells.
INPUT_PATH = './data/input/'
OUTPUT_PATH = './data/output/'

# Tiền xử lý dữ liệu

In [9]:
def process_base(train, test):
    """Replace special placeholder values with the NA marker -999.0, in place.

    The following values are overwritten in both frames:

    * ``var38`` strictly between 117310.979 and 117310.98;
    * ``var3`` equal to -999999;
    * 0.0 in ``imp_op_var40_comer_ult1``, ``imp_op_var40_efect_ult3``,
      ``imp_op_var41_comer_ult3`` and ``imp_sal_var16_ult1``.

    Args:
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place).

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    na_marker = -999.0
    frames = (train, test)

    for frame in frames:
        in_band = (frame['var38'] > 117310.979) & (frame['var38'] < 117310.98)
        frame.loc[in_band, 'var38'] = na_marker

    for frame in frames:
        frame.loc[frame['var3'] == -999999, 'var3'] = na_marker

    zero_means_missing = (
        'imp_op_var40_comer_ult1',
        'imp_op_var40_efect_ult3',
        'imp_op_var41_comer_ult3',
        'imp_sal_var16_ult1',
    )
    for column in zero_means_missing:
        for frame in frames:
            frame.loc[frame[column] == 0.0, column] = na_marker

    return train, test

In [10]:
def drop_sparse(train, test):
    """Drop, in place, every feature that takes fewer than two distinct values in ``train``.

    ``ID`` and ``TARGET`` are never considered. Constancy is judged on
    ``train`` only; the same columns are then removed from ``test``.

    Args:
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place).

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    constant_columns = [
        name
        for name in train.columns
        if name not in ('ID', 'TARGET') and len(np.unique(train[name])) < 2
    ]
    train.drop(constant_columns, axis=1, inplace=True)
    test.drop(constant_columns, axis=1, inplace=True)
    return train, test

In [11]:
def drop_duplicated(train, test):
    """Remove redundant feature columns from both frames, in place.

    Three passes are applied (``ID`` and ``TARGET`` are never touched):

    1. every column whose name contains ``'var6'`` is dropped;
    2. for each pair of columns holding identical values in ``train``, the
       one whose name contains ``'_0'`` is dropped (the later column of the
       pair is checked first);
    3. a fixed list of further duplicated columns is dropped.

    Args:
        train: training DataFrame (modified in place).
        test: test DataFrame (modified in place).

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    reserved = ('ID', 'TARGET')

    # Pass 1: var6 is dropped because it duplicates var29.
    var6_columns = [c for c in train.columns if c not in reserved and 'var6' in c]
    train.drop(var6_columns, axis=1, inplace=True)
    test.drop(var6_columns, axis=1, inplace=True)

    # Pass 2: among identical column pairs, discard the '_0' variant.
    features = [c for c in train.columns if c not in reserved]
    zero_variants = []
    for position, left in enumerate(features[:-1]):
        left_values = train[left].values
        for right in features[position + 1:]:
            if not np.array_equal(left_values, train[right].values):
                continue
            if '_0' in right:
                zero_variants.append(right)
            elif '_0' in left:
                zero_variants.append(left)
    train.drop(zero_variants, axis=1, inplace=True)
    test.drop(zero_variants, axis=1, inplace=True)

    # Pass 3: other duplicated columns, listed explicitly.
    known_duplicates = [
        'saldo_medio_var13_medio_ult1',
        'delta_imp_reemb_var13_1y3',
        'delta_imp_reemb_var17_1y3',
        'delta_imp_reemb_var33_1y3',
        'delta_imp_trasp_var17_in_1y3',
        'delta_imp_trasp_var17_out_1y3',
        'delta_imp_trasp_var33_in_1y3',
        'delta_imp_trasp_var33_out_1y3',
    ]
    train.drop(known_duplicates, axis=1, inplace=True)
    test.drop(known_duplicates, axis=1, inplace=True)

    return train, test

In [12]:
def normalize_features(train, test):
    """Rescale every feature column of ``train`` and ``test`` in place.

    Columns named ``ID`` and ``TARGET`` are left untouched. For each other
    column of ``train``:

    1. If the column's maximum in ``train`` is the placeholder 9999999999.0,
       the placeholder is replaced in *both* frames by the largest other
       ``train`` value plus one.
    2. The number of distinct negative values found in ``train`` selects how
       the column is scaled by the joint (train and test) maximum:

       * exactly one: negatives in both frames are set to -1.0 and only the
         positive values are divided by the maximum;
       * none: only the positive values are divided by the maximum;
       * more than one: the whole column is divided by the maximum.

    No scaling is applied when the joint maximum is not greater than zero.

    Args:
        train: training DataFrame (modified in place).
        test: test DataFrame with the same feature columns (modified in place).

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    placeholder = 9999999999.0
    features = [c for c in train.columns if c not in ('ID', 'TARGET')]

    for f in features:
        if train[f].max() == placeholder:
            cap = train.loc[train[f] < placeholder, f].max() + 1
            train.loc[train[f] == placeholder, f] = cap
            # Patch test as well: a placeholder left there would become the
            # joint maximum below and squash every real value toward zero.
            test.loc[test[f] == placeholder, f] = cap

        # Decided once, on train only, before any value is rewritten.
        n_negative = train.loc[train[f] < 0, f].nunique()
        if n_negative == 1:
            train.loc[train[f] < 0, f] = -1.0
            test.loc[test[f] < 0, f] = -1.0

        fmax = max(train[f].max(), test[f].max())
        if not fmax > 0:
            continue

        # Cast up front so integer columns do not rely on implicit upcasting
        # during partial assignment (newer pandas versions warn about or
        # reject that).
        train[f] = train[f].astype('float64')
        test[f] = test[f].astype('float64')

        if n_negative > 1:
            train[f] = train[f] / fmax
            test[f] = test[f] / fmax
        else:
            train.loc[train[f] > 0, f] = train.loc[train[f] > 0, f] / fmax
            test.loc[test[f] > 0, f] = test.loc[test[f] > 0, f] / fmax

    return train, test

# Feature engineering

In [None]:
# t-SNE features: embed train and test together in 2-D and save one row per ID.
np.random.seed(12324)
train_tsne, test_tsne = add_features(train, test, ['SumZeros'])

flist = [x for x in train_tsne.columns if x not in ['ID', 'TARGET']]

# Stack train on top of test so both share one embedding.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
X = pd.concat([train_tsne[flist], test_tsne[flist]], ignore_index=True).values.astype('float64')

# Reduce to 30 SVD components and standardise them before running t-SNE.
svd = TruncatedSVD(n_components=30)
X_svd = svd.fit_transform(X)
X_scaled = StandardScaler().fit_transform(X_svd)
feats_tsne = TSNE(n_components=2, random_state=0).fit_transform(X_scaled)
feats_tsne = pd.DataFrame(feats_tsne, columns=['tsne1', 'tsne2'])

# Rows are still in stacked train-then-test order, so IDs line up positionally.
feats_tsne['ID'] = pd.concat([train_tsne[['ID']], test_tsne[['ID']]], ignore_index=True)['ID'].values
train_tsne = pd.merge(train_tsne, feats_tsne, on='ID', how='left')
test_tsne = pd.merge(test_tsne, feats_tsne, on='ID', how='left')

feat = pd.concat(
    [train_tsne[['ID', 'tsne1', 'tsne2']], test_tsne[['ID', 'tsne1', 'tsne2']]],
    ignore_index=True,
)
feat.to_csv(OUTPUT_PATH + 'tsne_feats.csv', index=False)

In [None]:
# PCA features: project onto the first two principal components fitted on train.
train_pca, test_pca = add_features(train, test, ['SumZeros'])

flist = [x for x in train_pca.columns if x not in ['ID', 'TARGET']]

pca = PCA(n_components=2)
# NOTE(review): normalize(..., axis=0) is applied to train and test separately,
# so each frame is scaled by its own column norms — confirm this is intended.
x_train_projected = pca.fit_transform(normalize(train_pca[flist], axis=0))
x_test_projected = pca.transform(normalize(test_pca[flist], axis=0))
train_pca.insert(1, 'PCAOne', x_train_projected[:, 0])
train_pca.insert(1, 'PCATwo', x_train_projected[:, 1])
test_pca.insert(1, 'PCAOne', x_test_projected[:, 0])
test_pca.insert(1, 'PCATwo', x_test_projected[:, 1])

# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pca_feats = pd.concat(
    [train_pca[['ID', 'PCAOne', 'PCATwo']], test_pca[['ID', 'PCAOne', 'PCATwo']]],
    ignore_index=True,
)
# NOTE(review): unlike the other feature files this one is written with the
# index column; kept as-is in case a downstream reader relies on it — confirm.
pca_feats.to_csv(OUTPUT_PATH + 'dmitry_pca_feats.csv')

In [None]:
# k-means features: cluster labels for k = 2..10, fitted on the normalised train set.
train_k, test_k = add_features(train, test, ['SumZeros'])
train_k, test_k = normalize_features(train_k, test_k)

flist = [x for x in train_k.columns if x not in ['ID', 'TARGET']]

# flist is fixed before the loop, so the label columns added below never feed
# back into the clustering inputs; extract the matrices once.
X_train_k = train_k[flist].values
X_test_k = test_k[flist].values

flist_kmeans = []
for ncl in range(2, 11):
    label_col = 'kmeans_cluster' + str(ncl)
    cls = KMeans(n_clusters=ncl)
    cls.fit(X_train_k)
    train_k[label_col] = cls.predict(X_train_k)
    test_k[label_col] = cls.predict(X_test_k)
    flist_kmeans.append(label_col)

# The cluster columns live on train_k/test_k, so export from those frames.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat(
    [train_k[['ID'] + flist_kmeans], test_k[['ID'] + flist_kmeans]],
    ignore_index=True,
).to_csv(OUTPUT_PATH + 'kmeans_feats.csv', index=False)