In [2]:
import pandas as pd
import load_datasets
import os
import time
import xgboost as xgb

# Load data

In [3]:
# Label reused further down to name the files this notebook writes.
training_name = 'rf'

# Fetch the three splits from the project loader; the training split is
# requested with max_date_block_num=32.
X_train, y_train = load_datasets.train(max_date_block_num=32)
X_val, y_val = load_datasets.val()
X_test = load_datasets.test()

# Wrap every split in a DMatrix; the test split is built without labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test)

In [4]:
# The remaining cells only use dtrain / dval / dtest, so drop the raw splits.
del X_train, y_train
del X_val, y_val
del X_test

# Model Training

In [None]:
# Train an XGBoost random forest: a single boosting round that grows
# `num_parallel_tree` trees, each on a row subsample with per-node column
# subsampling.
ts = time.time()
params = {
    'max_depth': 10,
    'colsample_bynode': 0.8,
    'subsample': 0.8,
    'learning_rate': 1,  # no shrinkage on the single round
    'num_parallel_tree': 500,
    'seed': 42,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'nthread': 8
}

# Report RMSE on both the training and the validation DMatrix.
evallist = [(dtrain, 'train'), (dval, 'eval')]

# Exactly one round: more rounds would boost several forests on top of each
# other instead of training one forest.
num_boost_round = 1

# `early_stopping_rounds` is intentionally not passed: with a single boosting
# round there is no later round to stop before, so it could never trigger.
# `evals` is passed by keyword because recent XGBoost releases treat the
# arguments after `num_boost_round` as keyword-only.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evallist,
    verbose_eval=1,
)

# Elapsed wall-clock seconds, shown as the cell output.
time.time() - ts



# Make Predictions

In [None]:
# Score the test split and write the submission file.
DATA_DIR = "data"
OUTPUT_DIR = 'output'

# Create the output folder up front so the write below cannot fail on a fresh
# checkout after the (expensive) training has already finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

pred_test = model.predict(dtest)

# The sample submission supplies the frame layout; predictions are assigned
# positionally, so its rows are assumed to be in the same order as the test
# split -- TODO confirm against load_datasets.test().
pred = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
pred['item_cnt_month'] = pred_test

# Restrict the submitted values to the [0, 20] range.
pred['item_cnt_month'] = pred['item_cnt_month'].clip(lower=0, upper=20)
pred.to_csv(os.path.join(OUTPUT_DIR, training_name+'.csv'), index=False)

Public leaderboard (LB) score for this submission: 0.943856

In [None]:
# Store the validation and test predictions for later ensembling.
# Both frames go into the same HDF5 file, one key per split.
pred_val = model.predict(dval)
ensemble_path = os.path.join(OUTPUT_DIR, training_name+'.h5')
for split_key, split_pred in (('val', pred_val), ('test', pred_test)):
    split_frame = pd.DataFrame({training_name + split_key: split_pred})
    split_frame.to_hdf(ensemble_path, key=split_key, mode='a')

In [None]:
# Persist the trained booster next to the other outputs. The file name is
# derived from `training_name`, like the CSV and HDF5 artifacts, so changing
# that tag cannot leave a stale or overwritten 'rf_model.mdl' behind.
# NOTE(review): XGBoost chooses the on-disk format from the file extension
# (.json / .ubj are the documented ones); what '.mdl' produces depends on the
# installed version -- confirm before relying on the format.
model.save_model(os.path.join(OUTPUT_DIR, training_name + '_model.mdl'))