In [None]:
import tensorflow as tf

import numpy as np
import sys
import os
import matplotlib.pyplot as plt
import math
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Load the training features and labels from Google Drive (Colab only).

# Number of rows to read from the training CSV files.
nrows_train = 1068504
nrows_test = 0
n_rows_total = nrows_train + nrows_test

# Mount the Drive so the challenge CSV files are reachable under root_dir.
from google.colab import drive
drive.mount('/content/gdrive')
root_dir = "/content/gdrive/My Drive/"

# Both files carry a one-line header and are read with the same options.
load_options = dict(delimiter=',', skiprows=1, max_rows=n_rows_total)
xtrain = np.loadtxt(root_dir + 'xtrain_challenge.csv', **load_options)
# The reshape raises if the label file does not hold exactly n_rows_total values.
ytrain = np.array(np.loadtxt(root_dir + 'ytrain_challenge.csv', **load_options)).reshape(n_rows_total)

In the cell below, we apply a data augmentation technique.

Our algorithm (decision trees) does not require scaled data and tolerates features with heterogeneous ranges. However, we can still mirror the data: each sample is composed of two faces, and the first 26 features form two groups of 13, each describing one of the two faces, so we can simply swap the two groups. This increases the size of the training set by a factor of 2.

In [None]:
#%%
# Pre-processing and data augmentation.
# The features are cast to float32 and otherwise used as-is: the scaling and
# column-replacement experiments below are kept for reference but are disabled.
xtrain = xtrain.astype('float32')
xtest = np.loadtxt(root_dir + 'xtest_challenge.csv', delimiter=',', skiprows=1).astype('float32')

# Disabled experiment: drop the 13*2 first (per-face) features.
#xtrain_preprocessed = xtrain[:, 26:]

# Disabled experiment: replace the two per-face groups by their element-wise
# absolute difference (sqrt of the squared difference).
#l2_diff_train = np.sqrt((xtrain[:, 0:13] - xtrain[:, 13:26])**2)
#l2_diff_test = np.sqrt((xtest[:, 0:13] - xtest[:, 13:26])**2)
#xtrain = np.hstack((l2_diff_train, xtrain[:, 26:]))
#xtest = np.hstack((l2_diff_test, xtest[:, 26:]))

# The scaler is fitted on the training features but not applied in this cell
# (see the commented-out transform calls further down).
scaler = StandardScaler().fit(xtrain)

# Mirror every sample by swapping columns 0-12 with columns 13-25.
# The right-hand sides are slices of xtrain, so the copy never reads its own
# partially overwritten columns.
x_train_permuter = xtrain.copy()
x_train_permuter[:, :13], x_train_permuter[:, 13:26] = xtrain[:, 13:26], xtrain[:, :13]

# Augmented set: the original samples followed by their mirrored versions,
# with the labels repeated in the same order.
new_x_train = np.vstack((xtrain, x_train_permuter))
new_y_train = np.concatenate([ytrain, ytrain])

# Unscaled features are what the following cells consume.
#xtrain_preprocessed=scaler.transform(xtrain)
#xtest_preprocessed = scaler.transform(xtest)
xtrain_preprocessed, xtest_preprocessed = xtrain, xtest
#xtrain_preprocessed, xval_preprocessed, ytrain, yval = train_test_split(xtrain_preprocessed, ytrain, test_size=0.0) #KAMIKAZE
print(xtrain_preprocessed.shape, ytrain.shape, sep='\n')

In the code cell below we apply cross-validation to the XGBClassifier model. We boost the learning of many decision trees, which are considered weak learners.
The most important parameters are:
- learning_rate: as in other gradient descent methods, each boosting round applies a gradient correction scaled by this factor. A small learning rate leads to very slow convergence, while a large learning rate can make us overshoot the optimum.
- n_estimators: to be considered as a trade-off with the learning rate. It is the number of boosting rounds performed by the classifier during learning. Too large a number will lead to over-fitting.
- max_depth: the maximum depth of each decision tree. The smaller it is, the simpler the trees (and hence the higher their bias, which is not a problem for boosting).

In [None]:
from xgboost import XGBClassifier

# Cross-validated grid search over tree depth and number of boosting rounds.
# NOTE(review): tf.device only scopes TensorFlow op placement; GPU training in
# XGBoost is requested through tree_method='gpu_hist', so this wrapper is
# presumably unnecessary -- confirm before removing.
with tf.device('/device:GPU:0'):
  # Base estimator. max_depth and n_estimators set here are only defaults:
  # the grid below overrides both for every candidate.
  # (Fix: the comma after max_depth=6 was missing, which made this cell a SyntaxError.)
  boost = XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, max_depth=6,
                        gamma=0, learning_rate=0.25, max_delta_step=0,
                        min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
                        objective='binary:logistic', sampling_method='gradient_based',
                        reg_alpha=0, reg_lambda=1, tree_method='gpu_hist',
                        scale_pos_weight=1, seed=0, silent=True, subsample=1)

  # 9 depths x 6 round counts = 54 candidates; with cv=5 that is 270 fits
  # before GridSearchCV's final refit, so this cell is expensive.
  p_grid_boost = {'max_depth': [2,3,4,5,6,7,8,9,10], 'n_estimators':[500,600,700,800,900,1000]}

  grid_boost = GridSearchCV(estimator=boost, param_grid=p_grid_boost, scoring="accuracy", cv=5)

  # The search runs on xtrain_preprocessed / ytrain, not on the mirrored
  # new_x_train / new_y_train arrays.
  grid_boost.fit(xtrain_preprocessed, ytrain)

  print("Best Score: {}".format(grid_boost.best_score_))
  print("Best params: {}".format(grid_boost.best_params_))

In [None]:
# Train the final classifier on the mirrored (doubled) training set.
# max_depth and n_estimators are fixed here -- presumably the values retained
# from the grid search; verify against its printed best parameters.
final_params = dict(
    # Learning task
    objective='binary:logistic',
    base_score=0.5,
    scale_pos_weight=1,
    # Boosting schedule
    n_estimators=700,
    learning_rate=0.25,
    # Tree shape and split constraints
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    max_delta_step=0,
    # Row / column sampling
    subsample=1,
    sampling_method='gradient_based',
    colsample_bytree=1,
    colsample_bylevel=1,
    # Regularisation
    reg_alpha=0,
    reg_lambda=1,
    # Runtime
    tree_method='gpu_hist',
    nthread=-1,
    seed=0,
    silent=True,
    missing=None,
)
boost = XGBClassifier(**final_params)
boost.fit(new_x_train, new_y_train)

In [None]:
# Run the fitted `boost` model on the test features; `output` is what gets
# written to the submission file.
output = boost.predict(xtest_preprocessed)

In [None]:
# Write the predictions to the submission CSV in root_dir, one value per row,
# formatted as integers.
submission_path = root_dir + 'ytest_challenge_student.csv'
np.savetxt(submission_path, output, fmt='%1.0d', delimiter=',')