Here we will generate predictions for the `data/raw/test.csv` backbone file.

In [12]:
import pandas as pd

from xgboost import XGBRegressor

from src.ToyModel import ToyModel
from src.utilities import check_folder
from src.FeatureGenerator import FeatureGenerator
from src.settings import RAW_PATH, PROCESSED_PATH, PREDS_PATH

# Data initialization

In [2]:
# Feature generator used for the training frame.
# NOTE(review): save_files=True presumably persists the generated features — confirm in FeatureGenerator.
train_feat_generator = FeatureGenerator(save_files=True, verbose=True)

# features df for training
train_features_df = train_feat_generator.generate_features()

# Model inputs are the lagged ("shifted") columns followed by the rolling-window columns.
feature_cols = train_feat_generator.shifted_cols + train_feat_generator.roll_cols

# Column groups looked up by the train / predict cell further down.
cols_di = dict(
    index=train_feat_generator.index_cols,
    target=train_feat_generator.target_col,
    feats=feature_cols,
)

train_features_df.head()


base feats done
2 batches
shifts done
rolls done
batch 1/2 done
------------------------------
shifts done
rolls done
batch 2/2 done
------------------------------
concatenating


Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,item_price_sum_per_shop_lag_1,item_price_mean_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_cnt_day_mean_per_shop_lag_1,item_price_sum_per_item_lag_1,item_price_mean_per_item_lag_1,...,item_cnt_day_mean_per_category_lag_12,item_price_sum_per_shop_category_lag_12,item_price_mean_per_shop_category_lag_12,item_cnt_day_sum_per_shop_category_lag_12,item_cnt_day_mean_per_shop_category_lag_12,target_lag_12,target_roll_mean_2,target_roll_mean_5,target_roll_mean_12,target
0,20,4633,21,67,0.0,0.0,0.0,0.0,49169.199219,1695.489624,...,1.034101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
1,20,4635,21,67,0.0,0.0,0.0,0.0,32759.0,799.0,...,1.034101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2,20,4637,21,67,0.0,0.0,0.0,0.0,11985.0,799.0,...,1.034101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,20,4638,21,67,0.0,0.0,0.0,0.0,2877.199951,959.06665,...,1.034101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,20,4639,21,67,0.0,0.0,0.0,0.0,32484.199219,1015.131226,...,1.034101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


In [3]:
# Month index being forecast: stamped on every backbone row and handed to the
# generator as `target_month`, so both uses stay in sync.
TARGET_MONTH = 34

# NOTE(review): save_files=False presumably skips writing features to disk — confirm in FeatureGenerator.
test_feat_generator = FeatureGenerator(save_files=False, verbose=True)

# Backbone of rows to predict, tagged with the target month.
test_backbone = pd.read_csv(RAW_PATH + 'test.csv').assign(date_block_num=TARGET_MONTH)

# features df for prediction
test_features_df = test_feat_generator.add_features_to_backbone(
    test_backbone=test_backbone,
    target_month=TARGET_MONTH,
)
test_features_df.head()

base feats done
8 batches
shifts done
rolls done
batch 1/8 done
------------------------------
shifts done
rolls done
batch 2/8 done
------------------------------
shifts done
rolls done
batch 3/8 done
------------------------------
shifts done
rolls done
batch 4/8 done
------------------------------
shifts done
rolls done
batch 5/8 done
------------------------------
shifts done
rolls done
batch 6/8 done
------------------------------
shifts done
rolls done
batch 7/8 done
------------------------------
shifts done
rolls done
batch 8/8 done
------------------------------
concatenating


Unnamed: 0,ID,shop_id,item_id,date_block_num,item_category_id,item_price_sum_per_shop_lag_1,item_price_mean_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_cnt_day_mean_per_shop_lag_1,item_price_sum_per_item_lag_1,...,item_cnt_day_sum_per_category_lag_12,item_cnt_day_mean_per_category_lag_12,item_price_sum_per_shop_category_lag_12,item_price_mean_per_shop_category_lag_12,item_cnt_day_sum_per_shop_category_lag_12,item_cnt_day_mean_per_shop_category_lag_12,target_lag_12,target_roll_mean_2,target_roll_mean_5,target_roll_mean_12
0,0,5,5037,34,19,994646.6875,1030.721924,1052.0,1.090155,37475.0,...,6134.0,1.144403,120378.0,1695.464844,77.0,1.084507,1.0,0.5,1.2,1.0
1,1,5,5320,34,55,994646.6875,1030.721924,1052.0,1.090155,0.0,...,9809.0,1.041406,34423.0,273.198425,132.0,1.047619,0.0,0.0,0.0,0.0
2,2,5,5233,34,19,994646.6875,1030.721924,1052.0,1.090155,49159.0,...,6134.0,1.144403,120378.0,1695.464844,77.0,1.084507,0.0,2.0,1.4,0.833333
3,3,5,5232,34,23,994646.6875,1030.721924,1052.0,1.090155,35713.0,...,5275.0,1.099875,120435.5,1605.806641,79.0,1.053333,0.0,0.0,0.2,0.083333
4,4,5,5268,34,20,994646.6875,1030.721924,1052.0,1.090155,0.0,...,12834.0,2.338131,202091.375,2928.860596,160.0,2.318841,0.0,0.0,0.0,0.0


# Train and predict

In [7]:
# Prepare the predictions folder before any prediction file is written.
# NOTE(review): presumably check_folder creates PREDS_PATH when it is missing and
# flash_folder=False leaves existing files untouched — confirm in src.utilities.
check_folder(PREDS_PATH, flash_folder=False)

In [14]:
# Hyper-parameters passed straight to XGBRegressor.
# NOTE(review): the values look like the output of a parameter search — record their origin.
params = {
    'max_depth': 4,
    'learning_rate': 0.056901755640135235,
    'n_estimators': 214,
    'gamma': 0.44716540258976356,
    'reg_alpha': 0.30371993367372513,
    'reg_lambda': 1.1152200659246774,
    'colsample_bytree': 0.8339111689321749,
    'subsample': 0.4358889399097483,
}

model = XGBRegressor(**params)

# Fit on the training features against the training target column.
model.fit(X=train_features_df[cols_di['feats']],
          y=train_features_df[cols_di['target']])

# Key every prediction by the ID carried in the feature frame instead of relying on
# row position: the feature frame is assembled from batches, so its row order and
# count are not guaranteed to match the backbone.
predictions_df = pd.DataFrame({
    'ID': test_features_df['ID'].to_numpy(),
    'item_cnt_month': model.predict(test_features_df[cols_di['feats']]),
})

# Left merge keeps the backbone's ID order; validate raises if an ID is duplicated
# on either side. Re-running this cell is safe because only the ID column is read.
test_backbone = test_backbone[['ID']].merge(
    predictions_df, on='ID', how='left', validate='one_to_one'
)
assert test_backbone['item_cnt_month'].notna().all(), 'some test IDs received no prediction'

# NOTE(review): predictions are emitted unclipped; if the competition metric clips
# targets to a fixed range (e.g. [0, 20]) — confirm — clipping here may be worthwhile.
test_backbone.head()

Unnamed: 0,ID,item_cnt_month
0,0,1.067363
1,1,0.455084
2,2,1.4318
3,3,1.000047
4,4,2.01218


In [11]:
# Sanity check: show the (rows, columns) shape of the prediction frame.
test_backbone.shape

(214200, 2)

In [16]:
# Write the prediction frame to CSV without the DataFrame index.
# NOTE(review): this path is a literal relative to the working directory rather than
# being built from PREDS_PATH — confirm both point at the same folder.
submission_file = 'data/predictions/xgb_pred.csv'
test_backbone.to_csv(submission_file, index=False)

So far, the toy model produces a much better Kaggle score than these XGBoost predictions. This needs to be investigated.