# Modelling

Now that we have the data in the right format, we can start building our model for making predictions.

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

Let's load our data and hold out the rows from the two most recent `date_block_num` values as a test set.

In [2]:
# Read the full feature table; the target column is split off further down.
train_X = pd.read_csv("train_data.csv")

# Rows whose date_block_num is the maximum or one below it form the hold-out
# set; everything strictly earlier stays in the training set.
holdout_start = train_X["date_block_num"].max() - 1
is_holdout = train_X["date_block_num"] >= holdout_start
is_history = train_X["date_block_num"] < holdout_start
# Both sides are selected from the full table before either name is rebound.
test_X, train_X = train_X[is_holdout], train_X[is_history]

# pop() removes the target column from each feature frame and returns it.
test_y = test_X.pop("item_cnt_month")
train_y = train_X.pop("item_cnt_month")

Now that we have our data, we can start defining our model.
For this use case we will use the `XGBRegressor` from the xgboost module and run multiple setups with `GridSearchCV` (grid search with cross-validation) to find the best parameters for our model.

In [3]:
# Hyper-parameter grid: GridSearchCV tries every combination of the listed
# values. Single-element lists pin a setting while keeping it in the grid.
# NOTE(review): "gpu_hist" presumably requires a CUDA-enabled XGBoost build
# and a visible GPU — confirm for the target environment.
param_dict = dict(
    n_estimators=[5_000, 10_000],
    max_depth=[10],
    learning_rate=[0.1],
    tree_method=["gpu_hist"],
    min_child_weight=[0.3, 0.5],
    colsample_bytree=[0.3, 0.6],
    subsample=[0.8],
)

# 2 * 2 * 2 = 8 candidates, each evaluated on 5 folds (40 fits in total).
# NOTE(review): an integer cv gives contiguous, unshuffled folds, which do not
# respect the date_block_num ordering used for the hold-out split — confirm
# this is acceptable for this data.
base_regressor = xgb.XGBRegressor()
cross_val = GridSearchCV(
    estimator=base_regressor,
    param_grid=param_dict,
    cv=5,
    verbose=2,
)

# Training
Now let's run the training:

In [4]:
# Keyword arguments that GridSearchCV forwards unchanged to every
# XGBRegressor.fit call (one per candidate and fold, plus the final refit).
# NOTE(review): XGBoost early-stops on the last eval_set entry, i.e. the
# hold-out rows, so the hold-out set influences every fitted model — confirm
# this is intended.
fit_options = dict(
    eval_set=[(train_X, train_y), (test_X, test_y)],
    eval_metric="rmse",
    early_stopping_rounds=20,
    verbose=False,  # silence per-round evaluation logging
)

# Left as the cell's final expression so the fitted search object is displayed.
cross_val.fit(train_X, train_y, **fit_options)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=10, min_child_weight=0.3, n_estimators=5000, subsample=0.8, tree_method=gpu_hist 
[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=10, min_child_weight=0.3, n_estimators=5000, subsample=0.8, tree_method=gpu_hist, total= 1.5min
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=10, min_child_weight=0.3, n_estimators=5000, subsample=0.8, tree_method=gpu_hist 
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=10, min_child_weight=0.3, n_estimators=5000, subsample=0.8, tree_method=gpu_hist, total= 1.6min
[CV] colsample_bytree=0.3, learning_rate=0.1, max_depth=10, min_child_weight=0.3, n_estimators=5000, subsample=0.8, tree_method=gpu_hist 
[CV]  colsample_bytree=0.3, learning_rate=0.1, max_depth=10

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parame

Let's use the best model that we found through cross-validation to make predictions for the submission and save them in a CSV file.

In [11]:
# Best candidate from the grid search (with GridSearchCV's default refit=True
# this estimator has been refitted on the whole of train_X / train_y).
model = cross_val.best_estimator_
# load prediction data
# NOTE(review): assumes test.csv has the same feature columns as train_X —
# confirm against the preprocessing step.
submission_data = pd.read_csv("test.csv")
# load submission sample
submission = pd.read_csv("data/sample_submission.csv", index_col="ID")
# predict() returns a plain array, so predictions are assigned by position;
# NOTE(review): this relies on test.csv rows being in the same order as the
# IDs in the sample submission — confirm.
submission["item_cnt_month"] = model.predict(submission_data)
# some values are slightly negative, indicating no sales, lets set them to 0
# Only the prediction column is overwritten: a row-wide `.loc[mask] = 0` would
# also zero any other column the submission frame might carry.
negative_rows = submission["item_cnt_month"] < 0
submission.loc[negative_rows, "item_cnt_month"] = 0
# save data
submission.to_csv("submission.csv")