In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn import cross_validation
import xgboost as xgb

def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe

def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

In [3]:
train = pd.read_csv('train_nnew.csv', low_memory=False)
test = pd.read_csv('test_nnew.csv', low_memory=False)
features = [u'Open', u'Promo', u'SchoolHoliday', u'StateHoliday_0',
       u'StateHoliday_a', u'DayOfWeek_1', u'DayOfWeek_2', u'DayOfWeek_3',
       u'DayOfWeek_4', u'DayOfWeek_5', u'DayOfWeek_6', u'DayOfWeek_7',
       u'CompetitionDistance', u'Promo2', 'year', 'Mean_Sales', 'month', 'day',
       u'StoreType_a', u'StoreType_b', u'StoreType_c', u'StoreType_d',
       u'Assortment_a', u'Assortment_b', u'Assortment_c', u'CompetitionOpen']

In [4]:
train['year'] = train.Date.apply(lambda x: x.split('-')[0])
train['year'] = train['year'].astype(float)
train['month'] = train.Date.apply(lambda x: x.split('-')[1])
train['month'] = train['month'].astype(float)
train['day'] = train.Date.apply(lambda x: x.split('-')[2])
train['day'] = train['day'].astype(float)

In [5]:
test['year'] = test.Date.apply(lambda x: x.split('-')[0])
test['year'] = test['year'].astype(float)
test['month'] = test.Date.apply(lambda x: x.split('-')[1])
test['month'] = test['month'].astype(float)
test['day'] = test.Date.apply(lambda x: x.split('-')[2])
test['day'] = test['day'].astype(float)

In [92]:
train['StoreType'] = train['StoreType_a'] + 2*train['StoreType_b'] + 3*train['StoreType_c'] + 4*train['StoreType_d']

In [93]:
test['StoreType'] = test['StoreType_a'] + 2*test['StoreType_b'] + 3*test['StoreType_c'] + 4*test['StoreType_d']

In [94]:
train['Assortment'] = train['Assortment_a'] + 2*train['Assortment_b'] + 3*train['Assortment_c']

In [95]:
test['Assortment'] = test['Assortment_a'] + 2*test['Assortment_b'] + 3*test['Assortment_c']

In [97]:
features = [u'Open', u'Promo', u'SchoolHoliday', u'StateHoliday_0',
       u'StateHoliday_a', u'DayOfWeek',
       u'CompetitionDistance', u'Promo2', 'year', 'Mean_Sales', 'month', 'day',
       u'StoreType',
       u'Assortment', u'CompetitionOpen']

In [36]:
train['Date'] = train['Date'].astype('datetime64')
test['Date'] = test['Date'].astype('datetime64')

In [105]:
params = {"objective": "reg:linear",
          "eta": 0.2,
          "max_depth": 10,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1,
          #"lambda" : 100,
          #"alpha" : 1
          }
num_trees = 900

In [106]:
print("Train a XGBoost model")
val_size = 50000
#train = train.sort(['Date'])
print(train.tail(1)['Date'])
X_train, X_test = cross_validation.train_test_split(train, test_size=0.05, random_state = 1)
#X_train, X_test = train.head(len(train) - val_size), train.tail(val_size)
dtrain = xgb.DMatrix(X_train[X_train['Open'] > 0][features], np.log(X_train[X_train['Open'] > 0]["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[X_test['Open'] > 0][features], np.log(X_test[X_test['Open'] > 0]["Sales"] + 1))
dtest = xgb.DMatrix(test[features])
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, 
                feval=rmspe_xg)#, verbose_eval=True)

print("Validating")
train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
indices = train_probs < 0
train_probs[indices] = 0
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

Will train until train error hasn't decreased in 50 rounds.
[0]	eval-rmspe:0.998710	train-rmspe:0.998698
[1]	eval-rmspe:0.994794	train-rmspe:0.994774
[2]	eval-rmspe:0.984991	train-rmspe:0.984951
[3]	eval-rmspe:0.965241	train-rmspe:0.965169
[4]	eval-rmspe:0.932019	train-rmspe:0.931915
[5]	eval-rmspe:0.883720	train-rmspe:0.883633
[6]	eval-rmspe:0.821104	train-rmspe:0.821146
[7]	eval-rmspe:0.748122	train-rmspe:0.748560
[8]	eval-rmspe:0.668924	train-rmspe:0.670140
[9]	eval-rmspe:0.588143	train-rmspe:0.590191
[10]	eval-rmspe:0.510115	train-rmspe:0.513776
[11]	eval-rmspe:0.437798	train-rmspe:0.443825
[12]	eval-rmspe:0.373008	train-rmspe:0.382114
[13]	eval-rmspe:0.317278	train-rmspe:0.330348
[14]	eval-rmspe:0.270166	train-rmspe:0.287991
[15]	eval-rmspe:0.231980	train-rmspe:0.255221
[16]	eval-rmspe:0.201549	train-rmspe:0.230654
[17]	eval-rmspe:0.178381	train-rmspe:0.212945
[18]	eval-rmspe:0.160874	train-rmspe:0.200534
[19]	eval-rmspe:0.149114	train-rmspe:0.193108
[20]	eval-rmspe:0.140119	train

Train a XGBoost model
0   2015-07-31
Name: Date, dtype: datetime64[ns]
Validating
('error', 0.096955141152086158)


[899]	eval-rmspe:0.106316	train-rmspe:0.072841


In [107]:
print("Make predictions on the test set")
test_probs = gbm.predict(xgb.DMatrix(test[features]))
indices = test_probs < 0
test_probs[indices] = 0
submission = pd.DataFrame({"Id": test["Id"], "Sales": np.exp(test_probs) - 1})
submission.to_csv("xgboost_means_pos_noohe.csv", index=False)

Make predictions on the test set


In [84]:
gbm.update(dtrain, 10)

In [85]:
print("Validating")
train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
indices = train_probs < 0
train_probs[indices] = 0
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

Validating
('error', 0.098015826835446962)
