In [1]:
import pandas as pd
import numpy as np

train_df = pd.read_csv("../sales_train.csv.gz")
test_df  = pd.read_csv("../test.csv.gz")

categories = pd.read_csv('../item_categories.csv')
items = pd.read_csv('../items.csv')

In [2]:
%store -r __prepare_data

In [3]:
%%time
__prepare_data

RMSE: 0.913456
CPU times: user 37min 36s, sys: 1min 18s, total: 38min 55s
Wall time: 12min


In [11]:
X_train = train[train.date_block_num <  33][predictors]
X_valid = train[train.date_block_num == 33][predictors]
y_train = train[train.date_block_num <  33].target
y_valid = train[train.date_block_num == 33].target

In [12]:
%%time
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

model1 = RandomForestRegressor(n_estimators=64, max_features=3, n_jobs=4)
model1.fit(X_train, y_train)
ypred = model1.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, ypred))
print("RMSE: %f" % (rmse))

RMSE: 0.965149
CPU times: user 30min 16s, sys: 28.1 s, total: 30min 44s
Wall time: 8min 17s


In [25]:
%%time
model3 = RandomForestRegressor(n_estimators=128, max_features=4, n_jobs=4)
model3.fit(X_train, y_train)
ypred = model3.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, ypred))
print("RMSE: %f" % (rmse))

RMSE: 0.966271


In [18]:
predictors_old = ["target_lag_1",
              "total_monthly_item_sales_lag_1",
              "total_monthly_shop_sales_lag_1",
              "total_monthly_shop_category_sales_lag_1",
              "total_monthly_shop_supercategory_sales_lag_1",
              "total_monthly_supercategory_sales_lag_1"
             ]

X_train_old = train[train.date_block_num <  33][predictors_old]
X_valid_old = train[train.date_block_num == 33][predictors_old]
y_train_old = train[train.date_block_num <  33].target
y_valid_old = train[train.date_block_num == 33].target

In [24]:
%%time
model5 = RandomForestRegressor(n_estimators=64, max_features=3, n_jobs=4)
model5.fit(X_train_old, y_train_old)
ypred = model5.predict(X_valid_old)
rmse = np.sqrt(mean_squared_error(y_valid_old, ypred))
print("RMSE: %f" % (rmse))

RMSE: 0.979642


In [22]:
from tabulate import tabulate
headers = ["name", "score"]
values = sorted(zip(X_valid_old.columns, model5.feature_importances_), key=lambda x: x[1] * -1)
print(tabulate(values, headers, tablefmt="plain"))

name                                              score
total_monthly_item_sales_lag_1                0.42063
total_monthly_shop_category_sales_lag_1       0.191807
total_monthly_shop_sales_lag_1                0.108511
target_lag_1                                  0.105677
total_monthly_shop_supercategory_sales_lag_1  0.0914071
total_monthly_supercategory_sales_lag_1       0.0819669


# Trying out XGBoost

In [29]:
%%time
from xgboost import XGBRegressor

xmodel = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,
    n_jobs=4,
    seed=42)

xmodel.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_valid, y_valid)], 
    verbose=True, 
    early_stopping_rounds = 10)

[0]	validation_0-rmse:1.16779	validation_1-rmse:1.12782
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.12554	validation_1-rmse:1.09952
[2]	validation_0-rmse:1.09204	validation_1-rmse:1.07708
[3]	validation_0-rmse:1.06798	validation_1-rmse:1.05756
[4]	validation_0-rmse:1.04707	validation_1-rmse:1.04123
[5]	validation_0-rmse:1.02984	validation_1-rmse:1.02805
[6]	validation_0-rmse:1.01391	validation_1-rmse:1.01735
[7]	validation_0-rmse:1.00475	validation_1-rmse:1.0099
[8]	validation_0-rmse:0.99283	validation_1-rmse:1.00225
[9]	validation_0-rmse:0.981569	validation_1-rmse:0.995617
[10]	validation_0-rmse:0.975192	validation_1-rmse:0.991731
[11]	validation_0-rmse:0.967521	validation_1-rmse:0.987597
[12]	validation_0-rmse:0.96091	validation_1-rmse:0.984229
[13]	validation_0-rmse:0.955281	validation_1-rmse:0.981514
[14]	validation_0-rmse:0.950893	validation_1-

[132]	validation_0-rmse:0.910167	validation_1-rmse:0.9651
[133]	validation_0-rmse:0.910149	validation_1-rmse:0.965117
[134]	validation_0-rmse:0.910124	validation_1-rmse:0.965149
[135]	validation_0-rmse:0.9101	validation_1-rmse:0.965154
[136]	validation_0-rmse:0.91009	validation_1-rmse:0.965161
[137]	validation_0-rmse:0.910075	validation_1-rmse:0.965136
[138]	validation_0-rmse:0.910053	validation_1-rmse:0.96515
[139]	validation_0-rmse:0.909992	validation_1-rmse:0.965164
[140]	validation_0-rmse:0.909976	validation_1-rmse:0.965136
[141]	validation_0-rmse:0.909922	validation_1-rmse:0.96504
[142]	validation_0-rmse:0.909896	validation_1-rmse:0.965105
[143]	validation_0-rmse:0.909885	validation_1-rmse:0.96513
[144]	validation_0-rmse:0.909853	validation_1-rmse:0.965133
[145]	validation_0-rmse:0.909834	validation_1-rmse:0.965138
[146]	validation_0-rmse:0.909794	validation_1-rmse:0.965124
[147]	validation_0-rmse:0.909787	validation_1-rmse:0.965144
[148]	validation_0-rmse:0.909705	validation_1-rm

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=8, min_child_weight=300, missing=None, n_estimators=1000,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.8, verbosity=1)

In [30]:
ypred = xmodel.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, ypred))
print("RMSE: %f" % (rmse))

RMSE: 0.964996


In [31]:
month_34 = all_data[all_data.date_block_num == 34].fillna(0)
month_34['item_cnt_month'] = xmodel.predict(month_34[predictors])
test = pd.merge(test_df,month_34, on=["shop_id","item_id"]).fillna(0)

test = test.loc[:,['ID', 'item_cnt_month']]
test.set_index("ID", inplace=True)
test["item_cnt_month"] = test["item_cnt_month"].round().astype("int64")

test[test.item_cnt_month>20] = 20
test.to_csv("macro2_xgb.csv")