In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn import cross_validation
import xgboost as xgb

def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe

def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

In [4]:
train = pd.read_csv('train_nnew.csv', low_memory=False)
test = pd.read_csv('test_nnew.csv', low_memory=False)
features = [u'Open', u'Promo', u'SchoolHoliday', u'StateHoliday_0',
       u'StateHoliday_a', u'DayOfWeek_1', u'DayOfWeek_2', u'DayOfWeek_3',
       u'DayOfWeek_4', u'DayOfWeek_5', u'DayOfWeek_6', u'DayOfWeek_7',
       u'CompetitionDistance', u'Promo2', 'year', 'Mean_Sales', 'month', 'day',
       u'StoreType_a', u'StoreType_b', u'StoreType_c', u'StoreType_d',
       u'Assortment_a', u'Assortment_b', u'Assortment_c', u'CompetitionOpen']

In [44]:
train['year'] = train.Date.apply(lambda x: x.split('-')[0])
train['year'] = train['year'].astype(float)
train['month'] = train.Date.apply(lambda x: x.split('-')[1])
train['month'] = train['month'].astype(float)
train['day'] = train.Date.apply(lambda x: x.split('-')[2])
train['day'] = train['day'].astype(float)

In [45]:
test['year'] = test.Date.apply(lambda x: x.split('-')[0])
test['year'] = test['year'].astype(float)
test['month'] = test.Date.apply(lambda x: x.split('-')[1])
test['month'] = test['month'].astype(float)
test['day'] = test.Date.apply(lambda x: x.split('-')[2])
test['day'] = test['day'].astype(float)

In [46]:
features = [u'Open', u'Promo', u'SchoolHoliday', u'StateHoliday_0',
       u'StateHoliday_a', u'DayOfWeek_1', u'DayOfWeek_2', u'DayOfWeek_3',
       u'DayOfWeek_4', u'DayOfWeek_5', u'DayOfWeek_6', u'DayOfWeek_7',
       u'CompetitionDistance', u'Promo2', 'year', 'Mean_Sales', 'month', 'day',
       u'StoreType_a', u'StoreType_b', u'StoreType_c', u'StoreType_d',
       u'Assortment_a', u'Assortment_b', u'Assortment_c', u'CompetitionOpen']

In [51]:
params = {"objective": "reg:linear",
          "eta": 0.1,
          "max_depth": 8,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1,
          #"lambda" : 500
          }
num_trees = 300

In [52]:
print("Train a XGBoost model")
val_size = 100000
#train = train.sort(['Date'])
print(train.tail(1)['Date'])
X_train, X_test = cross_validation.train_test_split(train, test_size=0.05)
#X_train, X_test = train.head(len(train) - val_size), train.tail(val_size)
dtrain = xgb.DMatrix(X_train[features], np.log(X_train["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[features], np.log(X_test["Sales"] + 1))
dtest = xgb.DMatrix(test[features])
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=30, 
                feval=rmspe_xg)#, verbose_eval=True)

print("Validating")
train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
indices = train_probs < 0
train_probs[indices] = 0
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

Will train until train error hasn't decreased in 30 rounds.
[0]	eval-rmspe:0.909892	train-rmspe:0.910680
[1]	eval-rmspe:0.909260	train-rmspe:0.910048
[2]	eval-rmspe:0.908107	train-rmspe:0.908895
[3]	eval-rmspe:0.906195	train-rmspe:0.906982
[4]	eval-rmspe:0.903132	train-rmspe:0.903919
[5]	eval-rmspe:0.898640	train-rmspe:0.899425
[6]	eval-rmspe:0.892244	train-rmspe:0.893025
[7]	eval-rmspe:0.883604	train-rmspe:0.884384
[8]	eval-rmspe:0.872363	train-rmspe:0.873143
[9]	eval-rmspe:0.858052	train-rmspe:0.858835
[10]	eval-rmspe:0.840895	train-rmspe:0.841682
[11]	eval-rmspe:0.820573	train-rmspe:0.821379
[12]	eval-rmspe:0.797274	train-rmspe:0.798120
[13]	eval-rmspe:0.771164	train-rmspe:0.772068
[14]	eval-rmspe:0.742548	train-rmspe:0.743544
[15]	eval-rmspe:0.711831	train-rmspe:0.712942
[16]	eval-rmspe:0.679410	train-rmspe:0.680695
[17]	eval-rmspe:0.645651	train-rmspe:0.647147
[18]	eval-rmspe:0.611278	train-rmspe:0.613060
[19]	eval-rmspe:0.576773	train-rmspe:0.578907
[20]	eval-rmspe:0.542175	train

Train a XGBoost model
1017208    2013-01-01
Name: Date, dtype: object
Validating
('error', 0.12750488527963455)


[299]	eval-rmspe:0.127505	train-rmspe:0.155701


In [53]:
print("Make predictions on the test set")
test_probs = gbm.predict(xgb.DMatrix(test[features]))
indices = test_probs < 0
test_probs[indices] = 0
submission = pd.DataFrame({"Id": test["Id"], "Sales": np.exp(test_probs) - 1})
submission.to_csv("xgboost_means.csv", index=False)

Make predictions on the test set
