# Import

In [61]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import xgboost as xgb
%matplotlib inline

# Number of threads later passed to XGBoost through the `nthread` parameter.
n_thread = 4

# Both input files are semicolon-separated CSVs; train is read first, then test.
train_fname = 'train.csv'
test_fname = 'test.csv'
df, df_test = (pd.read_csv(path, sep=';') for path in (train_fname, test_fname))

# Preprocessing

In [62]:
# One-hot encode the categorical columns.
# `pd.get_dummies(frame, columns=...)` drops each listed column and appends its
# indicator columns named "<column>_<value>". The prefix matters: several of
# these columns can share values (e.g. the country columns), and un-prefixed
# dummies would then produce duplicate column labels that make the later
# column selection ambiguous and can misalign the train and test features.
categorical = ['VOIE_DEPOT', 'COUNTRY', 'SOURCE_BEGIN_MONTH', 'FISRT_APP_COUNTRY',
               'FISRT_APP_TYPE', 'LANGUAGE_OF_FILLING', 'FISRT_INV_COUNTRY', 'FISRT_INV_TYPE',
               'SOURCE_CITED_AGE', 'SOURCE_IDX_ORI', 'SOURCE_IDX_RAD']

df = pd.get_dummies(df, columns=categorical)
df_test = pd.get_dummies(df_test, columns=categorical)

In [63]:
# FIRST_CLASSE
# This column contains too many distinct values to be one-hot encoded, so each
# distinct value is replaced by an integer code instead.
#
# The codes come from `pd.factorize` over train and test together, so the same
# value gets the same code in both frames and missing values become -1.
# Python's built-in hash() is not used because string hashes are salted per
# interpreter process (codes would change on every kernel restart) and the
# resulting 64-bit integers are far larger than the float32 precision XGBoost
# stores features with.
n_train_rows = len(df)
first_classe_codes = pd.factorize(
    pd.concat([df['FIRST_CLASSE'], df_test['FIRST_CLASSE']], ignore_index=True)
)[0]

df['FIRST_CLASSE'] = first_classe_codes[:n_train_rows]
df_test['FIRST_CLASSE'] = first_classe_codes[n_train_rows:]

In [64]:
# TECHNOLOGIE_SECTOR
# The sector label starts with a Roman numeral; only that number is kept.

def sector_to_number(label):
    """Map a sector label to the number given by its leading Roman numeral.

    Labels starting with 'III', 'II', 'IV', 'I' or 'V' map to 3, 2, 4, 1 and 5
    respectively (anything starting with 'V' maps to 5). Missing values, the
    -1 placeholder and any other label map to -1.
    """
    if not isinstance(label, str):
        return -1
    # Longer prefixes are tested first so that 'III' is not mistaken for 'I'.
    for prefix, number in (('III', 3), ('II', 2), ('IV', 4), ('I', 1), ('V', 5)):
        if label.startswith(prefix):
            return number
    return -1

# Apply to every row of both frames; no row count is hard-coded, so the cell
# keeps working when the input files change size.
for frame in (df, df_test):
    frame['TECHNOLOGIE_SECTOR'] = frame['TECHNOLOGIE_SECTOR'].fillna(-1).map(sector_to_number)

In [65]:
# TECHNOLOGIE_FIELD
# Keep the integer formed by the first two characters of the field label.

def field_to_number(label):
    """Return int(label[0:2]), or -1 when the label is missing.

    Raises ValueError if the first two characters are not an integer.
    """
    if pd.isnull(label):
        # Missing values get the same -1 placeholder as the other columns,
        # rather than being sliced as if they were strings.
        return -1
    return int(label[0:2])

# Apply to every row of both frames, whatever their length.
for frame in (df, df_test):
    frame['TECHNOLOGIE_FIELD'] = frame['TECHNOLOGIE_FIELD'].map(field_to_number)

In [66]:
# MAIN_IPC
# As for FIRST_CLASSE, this column has too many distinct values to be one-hot
# encoded, so each distinct value is replaced by an integer code.
#
# `pd.factorize` is run over train and test together so both frames share the
# same codes; missing values become -1. Built-in hash() is avoided because
# string hashes change between interpreter processes and the 64-bit results
# do not fit the float32 precision XGBoost stores features with.
n_train_rows = len(df)
main_ipc_codes = pd.factorize(
    pd.concat([df['MAIN_IPC'], df_test['MAIN_IPC']], ignore_index=True)
)[0]

df['MAIN_IPC'] = main_ipc_codes[:n_train_rows]
df_test['MAIN_IPC'] = main_ipc_codes[n_train_rows:]

In [67]:
# PRIORITY_MONTH
# Four columns hold a month and a year in one string. The two parts are merged
# into a single integer (year digits followed by month digits) so that the
# values sort chronologically. Missing values become -1.
# Characters 3-6 are taken as the year and 0-1 as the month
# (assumes a 'MM/YYYY'-like layout — TODO confirm against the raw data).
for frame in (df, df_test):
    frame['PRIORITY_MONTH'] = [
        -1 if pd.isnull(stamp) else int(stamp[3:7] + stamp[0:2])
        for stamp in frame['PRIORITY_MONTH']
    ]

In [68]:
# FILING_MONTH
# Merge the month/year string into one integer (year digits followed by month
# digits) so values sort chronologically; missing values become -1.
# Characters 3-6 are taken as the year and 0-1 as the month
# (assumes a 'MM/YYYY'-like layout — TODO confirm against the raw data).
for frame in (df, df_test):
    frame['FILING_MONTH'] = [
        -1 if pd.isnull(stamp) else int(stamp[3:7] + stamp[0:2])
        for stamp in frame['FILING_MONTH']
    ]

In [69]:
# PUBLICATION_MONTH
# Merge the month/year string into one integer (year digits followed by month
# digits) so values sort chronologically; missing values become -1.
# Characters 3-6 are taken as the year and 0-1 as the month
# (assumes a 'MM/YYYY'-like layout — TODO confirm against the raw data).
for frame in (df, df_test):
    frame['PUBLICATION_MONTH'] = [
        -1 if pd.isnull(stamp) else int(stamp[3:7] + stamp[0:2])
        for stamp in frame['PUBLICATION_MONTH']
    ]

In [70]:
# BEGIN_MONTH
# Merge the month/year string into one integer (year digits followed by month
# digits) so values sort chronologically; missing values become -1.
# Characters 3-6 are taken as the year and 0-1 as the month
# (assumes a 'MM/YYYY'-like layout — TODO confirm against the raw data).
for frame in (df, df_test):
    frame['BEGIN_MONTH'] = [
        -1 if pd.isnull(stamp) else int(stamp[3:7] + stamp[0:2])
        for stamp in frame['BEGIN_MONTH']
    ]

# Train

## Setting Parameters

In [71]:
# Only use the columns present in both the train and the test dataframe,
# keeping the order in which they appear in the train dataframe.
# A set makes each membership test constant-time; a single pass over the train
# columns is enough because every shared column is, by definition, among them.
test_columns = set(df_test.columns)
c = [column for column in df.columns if column in test_columns]

In [72]:
# Feature arrays restricted to the shared columns, plus the boolean target
# (True where VARIABLE_CIBLE equals 'GRANTED').
feature_names = c
X_train, X_test = (frame[feature_names].values for frame in (df, df_test))
y_train = df['VARIABLE_CIBLE'].eq('GRANTED')

## XGBoost

In [73]:
# Wrap the training features and labels in XGBoost's matrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)

In [81]:
# Classifier parameters
param = {'max_depth': 8,
         'eta': 0.07,
         # Key must be exactly 'gamma'; a trailing space makes it a different,
         # unrecognised parameter name.
         'gamma': 0,
         'min_child_weight': 1,
         'max_delta_step': 0,
         'subsample': 1,
         'colsample_bytree': 0.4,
         'colsample_bylevel': 1,
         'silent': 1,
         'objective': 'binary:logistic'}
param['nthread'] = n_thread
param['eval_metric'] = 'auc'
# Number of boosting rounds passed to xgb.train.
num_round = 350

In [None]:
# Fit the booster for `num_round` rounds, then score the training matrix itself.
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)
y_pred_train_xgb = bst.predict(dtrain)

In [None]:
# Score the test set and write the predictions to 'y_pred.txt'.
X_test = df_test.loc[:, feature_names].values
dtest = xgb.DMatrix(data=X_test)
y_pred_xgb = bst.predict(dtest)
np.savetxt('y_pred.txt', y_pred_xgb, fmt='%s')