# Vestiaire Collective Hackathon

## Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import catboost as cb
from catboost import CatBoostClassifier

import imblearn as imb
from imblearn.over_sampling import RandomOverSampler

import joblib


import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

## Load data

In [2]:
data = pd.read_csv('./src/train.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)


In [3]:
data.head()

Unnamed: 0,ID_PRODUCT,ID_SELLER,ID_SELLER_COUNTRY,SELLER_GEO_1,SELLER_GEO_2,SELLER_GEO_3,DEPOSIT_PRICE,INSERTION_PRICE,PRICE,RECO_PRICE,...,TOTAL_TRK_1D,TOTAL_TRK_7D,TOTAL_TRK_30D,NB_DROP_SELLER_1D,NB_DROP_SELLER_7D,NB_DROP_SELLER_30D,NB_WIDTHDRAW_SELLER_1D,NB_WIDTHDRAW_SELLER_7D,NB_WIDTHDRAW_SELLER_30D,LABEL
0,f398701175db97ad9f9ae4f061a8c7d7ef4da505708f0b...,904fc91a25b0630028eaaf0941b228a62f9341eadde903...,1253e9373e781b7500266caa55150e08e210bc8cd8cc70...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,cf62a64b8a54fd96e70623b69429a70e1ba0e0ef9b502c...,5a9cf672c8be6b5ab9546a2fb49b06dd81a4e364c86ed0...,78.0,78.0,67.0,52.0,...,0,8,26,0,0,0,0,0,0,0
1,7e0544c102ef705f3939dacb080bb23686355879c3ac77...,216fce1cec515e792bd2d5aa5c68ac84d8118ad11823fb...,eb624dbe56eb6620ae62080c10a273cab73ae8eca98ab1...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,09fbaf8891f39040036484a565bfc3f832713ce3f2d22b...,7a1ca4ef7515f7276bae7230545829c27810c9d9e98ab2...,258.0,262.0,118.0,94.0,...,39,154,612,2,25,123,0,5,5,0
2,fe9ca89ffb93396c469674056158d6ddfe10e94efb3807...,562a34b067f011d9736069d692be44aeb624a7d8b6eba6...,eb624dbe56eb6620ae62080c10a273cab73ae8eca98ab1...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,09fbaf8891f39040036484a565bfc3f832713ce3f2d22b...,7a1ca4ef7515f7276bae7230545829c27810c9d9e98ab2...,478.0,478.0,232.0,2.0,...,12,75,113,1,11,16,0,2,2,0
3,66832d4bbd55c568753a6ec237f8aa213c0cf55929a544...,90c4e50ebbc3dd146dc2852b2b8d428c23fb4eebfee02e...,56f4da26ed956730309fa1488611ee0f13b0ac95ebb1bc...,2099c82f0bcc1c13c9ecc9dd8848c23916cf0eea8f7eef...,9b202ecbc6d45c6d8901d989a918878397a3eb9d00e8f4...,49dca65f362fee401292ed7ada96f96295eab1e589c52e...,350.473498,354.473498,287.256198,0.0,...,4,6,8,0,0,4,0,0,1,0
4,72a3d1f2ed0a526408159da9bb5bb584790eed9ff6d074...,adb633b0e58e3969d4dc099e4b8beb734282f3bcbbd77d...,eb624dbe56eb6620ae62080c10a273cab73ae8eca98ab1...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,09fbaf8891f39040036484a565bfc3f832713ce3f2d22b...,7a1ca4ef7515f7276bae7230545829c27810c9d9e98ab2...,91.0,91.0,72.0,86.0,...,18,91,373,2,8,33,0,0,4,0


## EDA

### CORRELATIONS

In [4]:
corr = data.corr()
label_corr = abs(corr['LABEL'].sort_values(ascending=False))
label_corr_best = label_corr[:40]
label_corr_best



LABEL                                      1.000000
NB_DAYS_SINCE_LAST_SOLD                    0.130014
NB_DAYS_SINCE_LAST_ORDER                   0.130011
NB_DAYS_SINCE_SELLER_REPLY_TO_MMAO         0.116948
NB_DAYS_SINCE_LAST_PUBLISHED               0.112946
NB_DAYS_SINCE_LAST_DEPOSITED               0.104858
NB_DAYS_SINCE_LAST_SESSION                 0.097066
NB_DAYS_SINCE_LAST_BS_CHAT                 0.084662
TOTAL_SELLER_CANCELLED                     0.065889
FLAG2                                      0.062977
NON_RECEIVED_PCT                           0.062277
TOTAL_NON_RECEIVED                         0.056298
RECO_PRICE                                 0.049686
PRICE                                      0.047094
NB_DAYS_SINCE_LAST_LIKES                   0.043861
TOTAL_MMAO_TIMEOUT_7D                      0.034946
TOTAL_MMAO_TIMEOUT_1D                      0.031764
TOTAL_MMAO_TIMEOUT_30D                     0.028987
TIME_ONLINE                                0.027985
MMAO_NB     

### Labels Lists

#### Features and target lists

In [5]:
# Label list
features_label = data.columns.to_list()
target_label = [features_label.pop(features_label.index('LABEL'))]

In [6]:
features_label

['ID_PRODUCT',
 'ID_SELLER',
 'ID_SELLER_COUNTRY',
 'SELLER_GEO_1',
 'SELLER_GEO_2',
 'SELLER_GEO_3',
 'DEPOSIT_PRICE',
 'INSERTION_PRICE',
 'PRICE',
 'RECO_PRICE',
 'TIME_ONLINE',
 'SEGMENT',
 'MMAO_NB',
 'BRAND_GROUP',
 'ID_BRAND',
 'ID_PAGE',
 'ID_SITE',
 'LANGUAGE',
 'ID_UNIVERSE',
 'ID_CATEGORY',
 'ID_SUB_SUBCATEGORY',
 'ID_MODEL',
 'ID_MATERIAL',
 'ID_COLOUR',
 'ID_PATTERN',
 'CURRENCY',
 'ID_CONDITION',
 'FLAG1',
 'DEPOSIT_DEVICE',
 'SELLER_AGE',
 'SELLER_FROM_FIRST_SELL_TO_INVOICE',
 'FLAG2',
 'NB_DROP_PRODUCT_1D',
 'NB_DROP_PRODUCT_7D',
 'NB_DROP_PRODUCT_30D',
 'NB_BS_CHAT_PRODUCT_1D',
 'NB_BS_CHAT_PRODUCT_7D',
 'NB_BS_CHAT_PRODUCT_30D',
 'NB_DAYS_SINCE_LAST_LIKES',
 'NB_LIKES_1D',
 'NB_LIKES_7D',
 'NB_LIKES_30D',
 'NB_DAYS_SINCE_LAST_WISHLISTS',
 'NB_WISHLISTS_1D',
 'NB_WISHLISTS_7D',
 'NB_WISHLISTS_30D',
 'NB_DAYS_SINCE_SELLER_REPLY_TO_MMAO',
 'TOTAL_MMAO_REPLIED_1D',
 'TOTAL_MMAO_TIMEOUT_1D',
 'TOTAL_MMAO_REPLIED_7D',
 'TOTAL_MMAO_TIMEOUT_7D',
 'TOTAL_MMAO_REPLIED_30D',
 'T

In [7]:
target_label

['LABEL']

#### Numerical and categorical features

In [8]:
numerical_label = features_label.copy()

In [9]:
categorical_label = numerical_label.pop(numerical_label.index('ID_PRODUCT'))
categorical_label = [categorical_label]
categorical_label.append(numerical_label.pop(numerical_label.index('ID_SELLER')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_SELLER_COUNTRY')))
categorical_label.append(numerical_label.pop(numerical_label.index('SELLER_GEO_1')))
categorical_label.append(numerical_label.pop(numerical_label.index('SELLER_GEO_2')))
categorical_label.append(numerical_label.pop(numerical_label.index('SELLER_GEO_3')))
categorical_label.append(numerical_label.pop(numerical_label.index('SEGMENT')))
categorical_label.append(numerical_label.pop(numerical_label.index('BRAND_GROUP')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_BRAND')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_PAGE')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_SITE')))
categorical_label.append(numerical_label.pop(numerical_label.index('LANGUAGE')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_UNIVERSE')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_CATEGORY')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_SUB_SUBCATEGORY')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_MODEL')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_MATERIAL')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_COLOUR')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_PATTERN')))
categorical_label.append(numerical_label.pop(numerical_label.index('CURRENCY')))
categorical_label.append(numerical_label.pop(numerical_label.index('ID_CONDITION')))
categorical_label.append(numerical_label.pop(numerical_label.index('DEPOSIT_DEVICE')))

In [10]:
categorical_label

['ID_PRODUCT',
 'ID_SELLER',
 'ID_SELLER_COUNTRY',
 'SELLER_GEO_1',
 'SELLER_GEO_2',
 'SELLER_GEO_3',
 'SEGMENT',
 'BRAND_GROUP',
 'ID_BRAND',
 'ID_PAGE',
 'ID_SITE',
 'LANGUAGE',
 'ID_UNIVERSE',
 'ID_CATEGORY',
 'ID_SUB_SUBCATEGORY',
 'ID_MODEL',
 'ID_MATERIAL',
 'ID_COLOUR',
 'ID_PATTERN',
 'CURRENCY',
 'ID_CONDITION',
 'DEPOSIT_DEVICE']

## Preprocessing

In [11]:
# Seperate features and target
X = data[features_label]
Y = data['LABEL']

In [12]:
# Preprocessing pipeline

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])
preprocess = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_label),
        ('cat', cat_transformer, categorical_label)
    ])
X = preprocess.fit_transform(X)

In [13]:
# Train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

print(average_precision_score(Y_train_pca, Y_train_pca_pred))
print(average_precision_score(Y_test_pca, Y_test_pca_pred))

## Baseline : Logistic Regression

### Train model

In [14]:
# Train logistic regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Predictions in probabilities
Y_pred_proba_train = logreg.predict_proba(X_train)
Y_pred_proba_test = logreg.predict_proba(X_test)


### Score

In [16]:
# Average precision score
print('Average precision score on train set: ', average_precision_score(Y_train, Y_pred_proba_train[:,1]))
print('Average precision score on test set: ', average_precision_score(Y_test, Y_pred_proba_test[:,1]))

Average precision score on train set:  0.30827200760353957
Average precision score on test set:  0.19270321713747957


### Save Model

In [17]:
# save the model to disk
filename = './src/logreg_model.pkl'
joblib.dump(logreg, filename)

['./src/logreg_model.pkl']

### Submission

In [18]:
# Load submission data
test_sub = pd.read_csv('./src/test.csv')
test_sub.head()

# Preprocess test data
test_sub = preprocess.transform(test_sub)
test_sub

# Prediction on test data
test_sub_pred = logreg.predict_proba(test_sub)[:, 1]
test_sub_pred

# Save prediction
submission = pd.DataFrame({'LABEL': test_sub_pred})
submission.reset_index(inplace=True)
submission.to_csv('./src/submission_b.csv', index=False)




## Random Over Sampling

In [19]:
# Random over sampling to balance the dataset
data_os = data.copy()
ros = RandomOverSampler(sampling_strategy='minority')
X_os= data_os[features_label]
Y_os= data_os['LABEL']
X_res, Y_res = ros.fit_resample(X_os, Y_os)
data_os = pd.concat([X_res, Y_res], axis=1)
data_os.reset_index(inplace=True, drop=True)

In [20]:
data_os

Unnamed: 0,ID_PRODUCT,ID_SELLER,ID_SELLER_COUNTRY,SELLER_GEO_1,SELLER_GEO_2,SELLER_GEO_3,DEPOSIT_PRICE,INSERTION_PRICE,PRICE,RECO_PRICE,...,TOTAL_TRK_1D,TOTAL_TRK_7D,TOTAL_TRK_30D,NB_DROP_SELLER_1D,NB_DROP_SELLER_7D,NB_DROP_SELLER_30D,NB_WIDTHDRAW_SELLER_1D,NB_WIDTHDRAW_SELLER_7D,NB_WIDTHDRAW_SELLER_30D,LABEL
0,f398701175db97ad9f9ae4f061a8c7d7ef4da505708f0b...,904fc91a25b0630028eaaf0941b228a62f9341eadde903...,1253e9373e781b7500266caa55150e08e210bc8cd8cc70...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,cf62a64b8a54fd96e70623b69429a70e1ba0e0ef9b502c...,5a9cf672c8be6b5ab9546a2fb49b06dd81a4e364c86ed0...,78.000000,78.000000,67.000000,52.0,...,0,8,26,0,0,0,0,0,0,0
1,7e0544c102ef705f3939dacb080bb23686355879c3ac77...,216fce1cec515e792bd2d5aa5c68ac84d8118ad11823fb...,eb624dbe56eb6620ae62080c10a273cab73ae8eca98ab1...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,09fbaf8891f39040036484a565bfc3f832713ce3f2d22b...,7a1ca4ef7515f7276bae7230545829c27810c9d9e98ab2...,258.000000,262.000000,118.000000,94.0,...,39,154,612,2,25,123,0,5,5,0
2,fe9ca89ffb93396c469674056158d6ddfe10e94efb3807...,562a34b067f011d9736069d692be44aeb624a7d8b6eba6...,eb624dbe56eb6620ae62080c10a273cab73ae8eca98ab1...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,09fbaf8891f39040036484a565bfc3f832713ce3f2d22b...,7a1ca4ef7515f7276bae7230545829c27810c9d9e98ab2...,478.000000,478.000000,232.000000,2.0,...,12,75,113,1,11,16,0,2,2,0
3,66832d4bbd55c568753a6ec237f8aa213c0cf55929a544...,90c4e50ebbc3dd146dc2852b2b8d428c23fb4eebfee02e...,56f4da26ed956730309fa1488611ee0f13b0ac95ebb1bc...,2099c82f0bcc1c13c9ecc9dd8848c23916cf0eea8f7eef...,9b202ecbc6d45c6d8901d989a918878397a3eb9d00e8f4...,49dca65f362fee401292ed7ada96f96295eab1e589c52e...,350.473498,354.473498,287.256198,0.0,...,4,6,8,0,0,4,0,0,1,0
4,72a3d1f2ed0a526408159da9bb5bb584790eed9ff6d074...,adb633b0e58e3969d4dc099e4b8beb734282f3bcbbd77d...,eb624dbe56eb6620ae62080c10a273cab73ae8eca98ab1...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,09fbaf8891f39040036484a565bfc3f832713ce3f2d22b...,7a1ca4ef7515f7276bae7230545829c27810c9d9e98ab2...,91.000000,91.000000,72.000000,86.0,...,18,91,373,2,8,33,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
743421,53c46c7b4c15c7da36d2c34ee15bfd88721a0a3949f971...,e3c1d15fd034c4e09a0a02b92beb62b3dbd2bc46612bb3...,eb624dbe56eb6620ae62080c10a273cab73ae8eca98ab1...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,09fbaf8891f39040036484a565bfc3f832713ce3f2d22b...,7a1ca4ef7515f7276bae7230545829c27810c9d9e98ab2...,28.000000,32.000000,32.000000,501.0,...,14,28,28,0,0,0,0,0,0,1
743422,5b71cc0a4810caba703c6ef025483a2d3cdc5e7d910426...,765cd142edcb4716bb24ded7658102b96d8288df561a2a...,1253e9373e781b7500266caa55150e08e210bc8cd8cc70...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,cf62a64b8a54fd96e70623b69429a70e1ba0e0ef9b502c...,5a9cf672c8be6b5ab9546a2fb49b06dd81a4e364c86ed0...,38.000000,34.000000,24.000000,34.0,...,6,24,90,0,1,2,0,0,0,1
743423,4e9e6ce9bf9d3101e3a1ce9d6cba67691d2ccb0ee365ea...,30cda959af0854cadcdc48290eca6b9e04e82f64d04902...,1253e9373e781b7500266caa55150e08e210bc8cd8cc70...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,cf62a64b8a54fd96e70623b69429a70e1ba0e0ef9b502c...,5a9cf672c8be6b5ab9546a2fb49b06dd81a4e364c86ed0...,113.000000,113.000000,89.000000,141.0,...,24,88,413,0,8,16,0,0,3,1
743424,61b30cf154431e3b9df82ef3686b988cfc499f010a2a3b...,409864969641bce283c1ede88ad851929e277c445fca67...,eb624dbe56eb6620ae62080c10a273cab73ae8eca98ab1...,7e3a78f9aa16d14453f363271db2973b903b3949684f0f...,09fbaf8891f39040036484a565bfc3f832713ce3f2d22b...,7a1ca4ef7515f7276bae7230545829c27810c9d9e98ab2...,1202.000000,1202.000000,902.000000,837.0,...,18,29,53,0,0,0,0,0,0,1


### Preprocessing on oversampled data

In [21]:
# preprocess data
X_os = data_os[features_label]
Y_os = data_os['LABEL']

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])
preprocess_os = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_label),
        ('cat', cat_transformer, categorical_label)
    ])
X_os = preprocess_os.fit_transform(X_os)

In [22]:
# Train test split
X_train_os, X_test_os, Y_train_os, Y_test_os = train_test_split(X_os, Y_os, test_size=0.2, random_state=42, stratify=Y_os)

### Train model

In [23]:
# Train logistic regression on oversampled data
logreg_os = LogisticRegression()
logreg_os.fit(X_train_os, Y_train_os)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# Predictions in probabilities
Y_pred_proba_train_os = logreg_os.predict_proba(X_train_os)
Y_pred_proba_test_os = logreg_os.predict_proba(X_test_os)

### Score

In [25]:
# Average precision score
print('Average precision score on train set: ', average_precision_score(Y_train_os, Y_pred_proba_train_os[:,1]))
print('Average precision score on test set: ', average_precision_score(Y_test_os, Y_pred_proba_test_os[:,1]))

Average precision score on train set:  0.8370707589170924
Average precision score on test set:  0.8108961578566243


### Save model

In [26]:
# save the model to disk
filename = './src/logreg_model_os.pkl'
joblib.dump(logreg_os, filename)

['./src/logreg_model_os.pkl']

### Submission

In [27]:
# Load submission data
test_sub = pd.read_csv('./src/test.csv')
test_sub.head()

# Preprocess test data
test_sub = preprocess.transform(test_sub)
test_sub

# Prediction on test data
test_sub_pred = logreg_os.predict_proba(test_sub)[:, 1]
test_sub_pred

# Save prediction
submission = pd.DataFrame({'LABEL': test_sub_pred})
submission.reset_index(inplace=True)
submission.to_csv('./src/submission_os.csv', index=False)



 * The oversampling didn't help much on submission score
 * We've tryed several preprocessing method such as removing columns, feature engineering, removing outliers... But everything seems to lower the score
 * so we'll get back to the original dataset

## Grid search on AdaBoost Model

### Grid model

In [28]:
# Grid search for best parameters on AdaBoost
param = {'n_estimators': [50, 100]}

ada = AdaBoostClassifier()
grid_ada = GridSearchCV(ada, param, cv=2, scoring='average_precision', verbose=2)
grid_ada.fit(X_train, Y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] END ....................................n_estimators=50; total time=87.4min
[CV] END ...................................n_estimators=50; total time=124.3min
[CV] END ..................................n_estimators=100; total time=191.0min
[CV] END ..................................n_estimators=100; total time=172.4min


### Score

In [29]:
# Average precision score
print('Average precision score on train set: ', average_precision_score(Y_train, grid_ada.best_estimator_.predict_proba(X_train)[:,1]))
print('Average precision score on test set: ', average_precision_score(Y_test, grid_ada.best_estimator_.predict_proba(X_test)[:,1]))

Average precision score on train set:  0.25675614697633636
Average precision score on test set:  0.2472660365274354


### Save model

In [30]:
# save the model to disk
filename = './src/ada_model.pkl'
joblib.dump(grid_ada.best_estimator_, filename)

['./src/ada_model.pkl']

### Submission

In [31]:
# Load submission data
test_sub = pd.read_csv('./src/test.csv')
test_sub.head()

# Preprocess test data
test_sub = preprocess.transform(test_sub)
test_sub

# Prediction on test data
test_sub_pred = grid_ada.best_estimator_.predict_proba(test_sub)[:, 1]
test_sub_pred

# Save prediction
submission = pd.DataFrame({'LABEL': test_sub_pred})
submission.reset_index(inplace=True)
submission.to_csv('./src/submission_ada.csv', index=False)



## Grid search on Gradient Boosting

### Grid model

In [32]:
# Grid search for best parameters on Gradient Boosting
param = {'n_estimators': [50, 500]}

gb = GradientBoostingClassifier()
grid_gb = GridSearchCV(gb, param, cv=2, scoring='average_precision', verbose=2)
grid_gb.fit(X_train, Y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] END ....................................n_estimators=50; total time=86.0min
[CV] END ....................................n_estimators=50; total time=87.7min
[CV] END ..................................n_estimators=500; total time=963.9min
[CV] END ..................................n_estimators=500; total time=933.1min


### Score

In [33]:
# Average precision score
print('Average precision score on train set: ', average_precision_score(Y_train, grid_gb.best_estimator_.predict_proba(X_train)[:,1]))
print('Average precision score on test set: ', average_precision_score(Y_test, grid_gb.best_estimator_.predict_proba(X_test)[:,1]))

Average precision score on train set:  0.3376988693646941
Average precision score on test set:  0.2653654480218574


### Save model

In [34]:
# save the model to disk
filename = './src/gb_model.pkl'
joblib.dump(grid_gb.best_estimator_, filename)

['./src/gb_model.pkl']

### Submission

In [35]:
# Load submission data
test_sub = pd.read_csv('./src/test.csv')
test_sub.head()

# Preprocess test data
test_sub = preprocess.transform(test_sub)
test_sub

# Prediction on test data
test_sub_pred = grid_gb.best_estimator_.predict_proba(test_sub)[:, 1]
test_sub_pred

# Save prediction
submission = pd.DataFrame({'LABEL': test_sub_pred})
submission.reset_index(inplace=True)
submission.to_csv('./src/submission_gb.csv', index=False)



## Grid search on XGBoost

### Grid Model

In [36]:
# Grid search for best parameters on XGBoost
param = {'n_estimators': [50, 100]}

xgb = XGBClassifier()
grid_xgb = GridSearchCV(xgb, param, cv=2, scoring='average_precision', verbose=2)
grid_xgb.fit(X_train, Y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] END ....................................n_estimators=50; total time=  43.6s
[CV] END ....................................n_estimators=50; total time=  43.6s
[CV] END ...................................n_estimators=100; total time= 1.4min
[CV] END ...................................n_estimators=100; total time= 1.5min


### Score

In [37]:
# Average precision score
print('Average precision score on train set: ', average_precision_score(Y_train, grid_xgb.best_estimator_.predict_proba(X_train)[:,1]))
print('Average precision score on test set: ', average_precision_score(Y_test, grid_xgb.best_estimator_.predict_proba(X_test)[:,1]))

Average precision score on train set:  0.3756518716051508
Average precision score on test set:  0.27368044800725133


### Save model

In [38]:
# save the model to disk
filename = './src/xgb_model.pkl'
joblib.dump(grid_xgb.best_estimator_, filename)

['./src/xgb_model.pkl']

### Submission

In [39]:
# Load submission data
test_sub = pd.read_csv('./src/test.csv')
test_sub.head()

# Preprocess test data
test_sub = preprocess.transform(test_sub)
test_sub

# Prediction on test data
test_sub_pred = grid_xgb.best_estimator_.predict_proba(test_sub)[:, 1]
test_sub_pred

# Save prediction
submission = pd.DataFrame({'LABEL': test_sub_pred})
submission.reset_index(inplace=True)
submission.to_csv('./src/submission_xgb.csv', index=False)



## Grid Search on LGBM

### Grid model

In [40]:
# Grid search for best parameters on LightGBM
param = {'n_estimators': [50, 100]}

lgb = LGBMClassifier()
grid_lgb = GridSearchCV(lgb, param, cv=2, scoring='average_precision', verbose=2)
grid_lgb.fit(X_train, Y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] END ....................................n_estimators=50; total time=   7.2s
[CV] END ....................................n_estimators=50; total time=   7.5s
[CV] END ...................................n_estimators=100; total time=   9.9s
[CV] END ...................................n_estimators=100; total time=   9.4s


### Score

In [41]:
# Average precision score
print('Average precision score on train set: ', average_precision_score(Y_train, grid_lgb.best_estimator_.predict_proba(X_train)[:,1]))
print('Average precision score on test set: ', average_precision_score(Y_test, grid_lgb.best_estimator_.predict_proba(X_test)[:,1]))

Average precision score on train set:  0.3445083977671313
Average precision score on test set:  0.27734540319295303


### Save model

In [42]:
# save the model to disk
filename = './src/lgb_model.pkl'
joblib.dump(grid_lgb.best_estimator_, filename)

['./src/lgb_model.pkl']

### Submission

In [43]:
# Load submission data
test_sub = pd.read_csv('./src/test.csv')
test_sub.head()

# Preprocess test data
test_sub = preprocess.transform(test_sub)
test_sub

# Prediction on test data
test_sub_pred = grid_lgb.best_estimator_.predict_proba(test_sub)[:, 1]
test_sub_pred

# Save prediction
submission = pd.DataFrame({'LABEL': test_sub_pred})
submission.reset_index(inplace=True)
submission.to_csv('./src/submission_lgb.csv', index=False)



## Grid search on CatBoost

### Grid model

In [44]:
# Grid search for best parameters on CatBoost
param = {'n_estimators': [50, 100]}

cat = CatBoostClassifier()
grid_cat = GridSearchCV(cat, param, cv=2, scoring='average_precision', verbose=2)
grid_cat.fit(X_train, Y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
Learning rate set to 0.5
0:	learn: 0.3269547	total: 1.33s	remaining: 1m 5s
1:	learn: 0.2594057	total: 2.58s	remaining: 1m 2s
2:	learn: 0.2417995	total: 3.67s	remaining: 57.5s
3:	learn: 0.2356122	total: 4.99s	remaining: 57.3s
4:	learn: 0.2312922	total: 6.17s	remaining: 55.5s
5:	learn: 0.2291047	total: 7.3s	remaining: 53.5s
6:	learn: 0.2277881	total: 8.2s	remaining: 50.3s
7:	learn: 0.2263378	total: 9.1s	remaining: 47.8s
8:	learn: 0.2253500	total: 10.1s	remaining: 45.9s
9:	learn: 0.2248085	total: 11s	remaining: 43.8s
10:	learn: 0.2238296	total: 11.9s	remaining: 42s
11:	learn: 0.2231126	total: 12.8s	remaining: 40.7s
12:	learn: 0.2226794	total: 13.7s	remaining: 38.9s
13:	learn: 0.2223611	total: 14.6s	remaining: 37.6s
14:	learn: 0.2218535	total: 15.5s	remaining: 36.2s
15:	learn: 0.2217784	total: 16.4s	remaining: 34.9s
16:	learn: 0.2213521	total: 17.3s	remaining: 33.6s
17:	learn: 0.2212594	total: 18.2s	remaining: 32.3s
18:	learn: 0.22

### Score

In [45]:
# Average precision score
print('Average precision score on train set: ', average_precision_score(Y_train, grid_cat.best_estimator_.predict_proba(X_train)[:,1]))
print('Average precision score on test set: ', average_precision_score(Y_test, grid_cat.best_estimator_.predict_proba(X_test)[:,1]))

Average precision score on train set:  0.3212487692434546
Average precision score on test set:  0.26537162870277714


### Save model

In [46]:
# save the model to disk
filename = './src/cat_model.pkl'
joblib.dump(grid_cat.best_estimator_, filename)

['./src/cat_model.pkl']

### Submission

In [47]:
# Load submission data
test_sub = pd.read_csv('./src/test.csv')
test_sub.head()

# Preprocess test data
test_sub = preprocess.transform(test_sub)
test_sub

# Prediction on test data
test_sub_pred = grid_cat.best_estimator_.predict_proba(test_sub)[:, 1]
test_sub_pred

# Save prediction
submission = pd.DataFrame({'LABEL': test_sub_pred})
submission.reset_index(inplace=True)
submission.to_csv('./src/submission_cat.csv', index=False)



## Voting Classifier

### Voting model

In [49]:
# Voting classifier
voting = VotingClassifier(estimators=[('gb', grid_gb.best_estimator_), 
                                        ('xgb', grid_xgb.best_estimator_), 
                                        ('lgb', grid_lgb.best_estimator_), 
                                        ('cat', grid_cat.best_estimator_)], 
                                    voting='soft')

voting.fit(X_train, Y_train)

Learning rate set to 0.5
0:	learn: 0.3275514	total: 1.27s	remaining: 2m 5s
1:	learn: 0.2561078	total: 2.93s	remaining: 2m 23s
2:	learn: 0.2389439	total: 4.26s	remaining: 2m 17s
3:	learn: 0.2324941	total: 5.76s	remaining: 2m 18s
4:	learn: 0.2302562	total: 7.08s	remaining: 2m 14s
5:	learn: 0.2279162	total: 8.59s	remaining: 2m 14s
6:	learn: 0.2264138	total: 10.1s	remaining: 2m 13s
7:	learn: 0.2253756	total: 11.5s	remaining: 2m 11s
8:	learn: 0.2248441	total: 13s	remaining: 2m 11s
9:	learn: 0.2239860	total: 14.4s	remaining: 2m 9s
10:	learn: 0.2232851	total: 15.7s	remaining: 2m 7s
11:	learn: 0.2227187	total: 17.1s	remaining: 2m 5s
12:	learn: 0.2222705	total: 18.6s	remaining: 2m 4s
13:	learn: 0.2218814	total: 19.9s	remaining: 2m 2s
14:	learn: 0.2214789	total: 21.3s	remaining: 2m
15:	learn: 0.2213236	total: 22.8s	remaining: 1m 59s
16:	learn: 0.2211301	total: 24.1s	remaining: 1m 57s
17:	learn: 0.2208385	total: 25.5s	remaining: 1m 56s
18:	learn: 0.2207620	total: 26.8s	remaining: 1m 54s
19:	learn

### Score

In [50]:
# Average precision score
print('Average precision score on train set: ', average_precision_score(Y_train, voting.predict_proba(X_train)[:,1]))
print('Average precision score on test set: ', average_precision_score(Y_test, voting.predict_proba(X_test)[:,1]))

Average precision score on train set:  0.36046092852673983
Average precision score on test set:  0.2803185723485173


### Save model

In [51]:
# save the model to disk
filename = './src/voting_model.pkl'
joblib.dump(voting, filename)

['./src/voting_model.pkl']

### Submission

In [52]:
# Load submission data
test_sub = pd.read_csv('./src/test.csv')
test_sub.head()

# Preprocess test data
test_sub = preprocess.transform(test_sub)
test_sub

# Prediction on test data
test_sub_pred = voting.predict_proba(test_sub)[:, 1]
test_sub_pred

# Save prediction
submission = pd.DataFrame({'LABEL': test_sub_pred})
submission.reset_index(inplace=True)
submission.to_csv('./src/submission_voting.csv', index=False)

