In [1]:
# Autoreload
%load_ext autoreload
%autoreload 2

# Warnings
import warnings
warnings.filterwarnings('ignore')

## XGBoost

### Import libraries and data

In [2]:
import pandas as pd
import numpy as np

In [3]:
processed_test_df = pd.read_csv('../data/processed_test.csv')
processed_val_df = pd.read_csv('../data/processed_val.csv')

In [4]:
processed_test_df.head()

Unnamed: 0,mean_due_age,mean_due_can,mean_due_rut,mean_due_cli,mean_due_pa,mean_due_pr,mean_due_pcli,mean_due_pcan,mean_due_pca,mean_vh_age,sd_due_acrcp,mean_due_acrcp
0,1.433683,1.529564,1.321099,1.179669,1.901905,1.630124,1.609438,2.068613,1.609438,3.295887,,1.609438
1,1.440236,1.529564,1.392361,1.25245,1.017037,1.006477,,1.192182,,3.351941,,
2,1.552585,1.529564,1.511835,1.76314,1.527452,1.355031,1.098612,1.422284,1.098612,3.370756,0.0,1.098612
3,1.38793,1.529564,1.567975,1.341171,0.742755,1.059591,,1.031969,,3.211019,,
4,1.496363,1.529564,1.617854,1.341662,,,,1.31075,,3.41091,,


In [5]:
processed_val_df.head()

Unnamed: 0,Demanda_uni_equil,log_demanda_uni_equil,mean_due_age,mean_due_can,mean_due_rut,mean_due_cli,mean_due_pa,mean_due_pr,mean_due_pcli,mean_due_pcan,mean_due_pca,mean_vh_age,sd_due_acrcp,mean_due_acrcp
0,1,0.693147,2.010012,1.996967,2.08378,1.719464,1.587694,1.556331,1.599232,1.512998,1.599232,4.113977,0.165949,1.599232
1,2,1.098612,2.010012,1.996967,2.08378,1.719464,1.680625,1.553152,1.212066,1.601887,1.212066,4.113977,0.320511,1.212066
2,2,1.098612,2.010012,1.996967,2.08378,1.719464,1.931563,1.803948,1.746179,1.819226,1.746179,4.113977,0.500848,1.746179
3,1,0.693147,2.010012,1.996967,2.08378,1.719464,1.655291,1.937759,1.174876,1.782192,1.174876,4.113977,0.274877,1.174876
4,10,2.397895,2.010012,1.996967,2.08378,1.719464,2.266708,1.903195,1.82895,1.979976,1.82895,4.113977,0.71154,1.82895


### Prepare data

In [6]:
import copy

In [7]:
val_y = copy.deepcopy(processed_val_df['log_demanda_uni_equil'])
val_X = copy.deepcopy(processed_val_df.drop('log_demanda_uni_equil', axis=1))

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(val_X, val_y, test_size=0.5, random_state=42)

In [9]:
y_val.head()

33031      1.098612
7741310    1.945910
4373345    1.098612
4058089    0.693147
6712458    1.791759
Name: log_demanda_uni_equil, dtype: float64

In [10]:
# Because what we REALLY want is the 
real_y_train = copy.deepcopy(X_train['Demanda_uni_equil'])
real_y_val = copy.deepcopy(X_val['Demanda_uni_equil'])

In [11]:
# We don't need Demanda_uni_equil in the features, so we can drop them now
X_train.drop(['Demanda_uni_equil'], axis=1, inplace=True)
X_val.drop(['Demanda_uni_equil'], axis=1, inplace=True)

### Building the model

In [12]:
import xgboost as xgb

If you have the following error: **"XGBoostError: [15:16:32] src/learner.cc:180: XGBoost version not compiled with GPU support."**

Please try the following:

`$ pip install pipdeptree
$ pipdeptree --reverse --packages xgboost`

In [15]:
from xgboost import XGBRegressor

# model = XGBRegressor(
#     max_depth=8,
#     n_estimators=1000,
#     min_child_weight=300, 
#     colsample_bytree=0.8, 
#     subsample=0.8, 
#     eta=0.3,    
#     seed=42)

model = XGBRegressor(
             base_score=0.5, 
             booster='gbtree',
             colsample_bylevel=1,
             colsample_bynode=1, 
             colsample_bytree=0.8, 
             eta=0.3, 
             gamma=0,
             importance_type='gain', 
             learning_rate=0.3, 
             max_delta_step=0,
             max_depth=7, 
             min_child_weight=300, 
             missing=None, 
             n_estimators=1000,
             n_jobs=1, 
             nthread=None, 
             objective='reg:linear', 
             random_state=0,
             reg_alpha=0, 
             reg_lambda=1, 
             scale_pos_weight=1, 
             seed=42,
             silent=None, 
             subsample=0.7, 
             verbosity=1
         )

model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    verbose=True, 
    early_stopping_rounds = 1)

[0]	validation_0-rmse:1.02133	validation_1-rmse:1.0217
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 1 rounds.
[1]	validation_0-rmse:0.790236	validation_1-rmse:0.790651
[2]	validation_0-rmse:0.645836	validation_1-rmse:0.646216
[3]	validation_0-rmse:0.561392	validation_1-rmse:0.561776
[4]	validation_0-rmse:0.513013	validation_1-rmse:0.513397
[5]	validation_0-rmse:0.488116	validation_1-rmse:0.488483
[6]	validation_0-rmse:0.474054	validation_1-rmse:0.474456
[7]	validation_0-rmse:0.466546	validation_1-rmse:0.466984
[8]	validation_0-rmse:0.46256	validation_1-rmse:0.463025
[9]	validation_0-rmse:0.460333	validation_1-rmse:0.460823
[10]	validation_0-rmse:0.459101	validation_1-rmse:0.459597
[11]	validation_0-rmse:0.458173	validation_1-rmse:0.458684
[12]	validation_0-rmse:0.457636	validation_1-rmse:0.458176
[13]	validation_0-rmse:0.457249	validation_1-rmse:0.457801
[14]	validation_0-rmse:0.45687

[131]	validation_0-rmse:0.449687	validation_1-rmse:0.45162
[132]	validation_0-rmse:0.44967	validation_1-rmse:0.451608
[133]	validation_0-rmse:0.449657	validation_1-rmse:0.451606
[134]	validation_0-rmse:0.449627	validation_1-rmse:0.451585
[135]	validation_0-rmse:0.449599	validation_1-rmse:0.451561
[136]	validation_0-rmse:0.449585	validation_1-rmse:0.451553
[137]	validation_0-rmse:0.449577	validation_1-rmse:0.451547
[138]	validation_0-rmse:0.449549	validation_1-rmse:0.451533
[139]	validation_0-rmse:0.449524	validation_1-rmse:0.451522
[140]	validation_0-rmse:0.449512	validation_1-rmse:0.451513
[141]	validation_0-rmse:0.449499	validation_1-rmse:0.451504
[142]	validation_0-rmse:0.449462	validation_1-rmse:0.45148
[143]	validation_0-rmse:0.449431	validation_1-rmse:0.451463
[144]	validation_0-rmse:0.449419	validation_1-rmse:0.451457
[145]	validation_0-rmse:0.449405	validation_1-rmse:0.451454
[146]	validation_0-rmse:0.449358	validation_1-rmse:0.451423
[147]	validation_0-rmse:0.449337	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             importance_type='gain', learning_rate=0.3, max_delta_step=0,
             max_depth=7, min_child_weight=300, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.7, verbosity=1)

In [None]:
# fourth try
# XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#              colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
#              importance_type='gain', learning_rate=0.3, max_delta_step=0,
#              max_depth=7, min_child_weight=300, missing=None, n_estimators=1000,
#              n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
#              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
#              silent=None, subsample=0.7, verbosity=1)

In [17]:
import pickle

# save model to file
pickle.dump(model, open("xgboost.pickle.dat", "wb")) 
print("Saved model to: xgboost.pickle.dat")

Saved model to: xgboost.pickle.dat


Mettre early stopping rounds à 2. Stop à 207 pour les paramètres suivants :
`model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)``
et eval_metric="rmse"

In [19]:
y_pred = model.predict(processed_test_df)

y_pred_exp = np.expm1(y_pred).round().astype(int)

In [24]:
len(y_pred_exp)

6999251

In [25]:
test_id = range(len(y_pred_exp))
test_id

range(0, 6999251)

In [26]:
result1 = pd.DataFrame( {'id':test_id,
           'Demanda_uni_equil':y_pred_exp} )

result1.to_csv('pred.csv',index=False)

In [28]:
result1.head()

Unnamed: 0,id,Demanda_uni_equil
0,0,4
1,1,1
2,2,2
3,3,1
4,4,2
