In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn import cross_validation
import xgboost as xgb

def ToWeight(y):
    """Return per-element weights 1 / y**2, with weight 0 wherever y is 0.

    The result is a float array with the same shape as `y`; zero targets are
    left at weight 0 so they never trigger a division by zero.
    """
    weights = np.zeros(y.shape, dtype=float)
    nonzero = y != 0
    weights[nonzero] = 1.0 / np.square(y[nonzero])
    return weights

def rmspe(yhat, y):
    """Root mean squared percentage error of predictions `yhat` against `y`.

    Each squared error is scaled by ToWeight(y), i.e. 1 / y**2. Entries where
    `y` is 0 get weight 0, so they add nothing to the sum but are still
    counted in the mean.
    """
    weights = ToWeight(y)
    weighted_sq_err = weights * np.square(y - yhat)
    return np.sqrt(np.mean(weighted_sq_err))

def rmspe_xg(yhat, y):
    """Custom xgboost evaluation function returning ("rmspe", score).

    `y` must expose `get_label()` (it is passed by xgb.train as `feval`).
    Both the labels and the predictions are mapped back with exp(x) - 1,
    the inverse of the log(Sales + 1) transform applied when the DMatrix
    objects are built, before the weighted error is computed.
    """
    actual = np.exp(y.get_label()) - 1
    predicted = np.exp(yhat) - 1
    weights = ToWeight(actual)
    score = np.sqrt(np.mean(weights * np.square(actual - predicted)))
    return "rmspe", score

In [3]:
# Load the train / test tables from the working directory.
train = pd.read_csv('train_nnew.csv', low_memory=False)
test = pd.read_csv('test_nnew.csv', low_memory=False)

# Initial feature list using the expanded indicator columns.
# NOTE(review): a later cell assigns `features` again with a shorter list,
# which replaces this one if the cells are run in file order.
features = [
    u'Open', u'Promo', u'SchoolHoliday',
    u'StateHoliday_0', u'StateHoliday_a',
    u'DayOfWeek_1', u'DayOfWeek_2', u'DayOfWeek_3', u'DayOfWeek_4',
    u'DayOfWeek_5', u'DayOfWeek_6', u'DayOfWeek_7',
    u'CompetitionDistance', u'Promo2',
    'year', 'Mean_Sales', 'month', 'day',
    u'StoreType_a', u'StoreType_b', u'StoreType_c', u'StoreType_d',
    u'Assortment_a', u'Assortment_b', u'Assortment_c',
    u'CompetitionOpen',
]

In [4]:
# Derive float year / month / day columns from train.Date.
# pd.to_datetime accepts both 'YYYY-MM-DD' strings and an already-converted
# datetime64 column, so this cell can be re-run after Date has been cast to
# datetime (the previous row-wise str.split raised in that case), and the
# parsing is vectorised instead of calling a Python lambda per row.
train_dates = pd.to_datetime(train['Date'])
train['year'] = train_dates.dt.year.astype(float)
train['month'] = train_dates.dt.month.astype(float)
train['day'] = train_dates.dt.day.astype(float)

In [5]:
# Derive float year / month / day columns from test.Date.
# pd.to_datetime accepts both 'YYYY-MM-DD' strings and an already-converted
# datetime64 column, so this cell can be re-run after Date has been cast to
# datetime (the previous row-wise str.split raised in that case), and the
# parsing is vectorised instead of calling a Python lambda per row.
test_dates = pd.to_datetime(test['Date'])
test['year'] = test_dates.dt.year.astype(float)
test['month'] = test_dates.dt.month.astype(float)
test['day'] = test_dates.dt.day.astype(float)

In [6]:
# Collapse the StoreType_* and Assortment_* columns (presumably 0/1
# indicators - confirm against the CSVs) into single integer-coded columns:
# a -> 1, b -> 2, c -> 3, d -> 4. Applied identically to both frames.
for frame in (train, test):
    frame['StoreType'] = (frame['StoreType_a'] + 2*frame['StoreType_b']
                          + 3*frame['StoreType_c'] + 4*frame['StoreType_d'])
    frame['Assortment'] = (frame['Assortment_a'] + 2*frame['Assortment_b']
                           + 3*frame['Assortment_c'])

In [7]:
# Feature list used for modelling: the single coded DayOfWeek / StoreType /
# Assortment columns take the place of their expanded *_x indicator columns.
features = [
    u'Open', u'Promo', u'SchoolHoliday',
    u'StateHoliday_0', u'StateHoliday_a',
    u'DayOfWeek',
    u'CompetitionDistance', u'Promo2',
    'year', 'Mean_Sales', 'month', 'day',
    u'StoreType', u'Assortment',
    u'CompetitionOpen',
]

In [8]:
# Convert the Date columns to datetime64[ns].
# pd.to_datetime is used instead of .astype('datetime64') because recent
# pandas versions reject the unit-less 'datetime64' dtype string; it is also
# a no-op on a column that is already datetime, so the cell is re-runnable.
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

In [6]:
# Booster configuration handed to xgb.train.
params = {
    'objective': 'reg:linear',   # regression on the log-transformed target
    'eta': 0.05,                 # learning rate
    'max_depth': 20,
    'subsample': 0.9,            # row sampling per tree
    'colsample_bytree': 0.9,     # column sampling per tree
    'silent': 1,
    'lambda': 1000,              # L2 regularisation
    'alpha': 1,                  # L1 regularisation
}
# Passed to xgb.train as the number of boosting rounds (upper bound when
# early stopping is enabled).
num_trees = 900

In [7]:
print("Train a XGBoost model")
val_size = 50000  # only referenced by the commented-out time-based split below
#train = train.sort(['Date'])
print(train.tail(1)['Date'])
X_train, X_test = cross_validation.train_test_split(train, test_size=0.05, random_state = 1)
#X_train, X_test = train.head(len(train) - val_size), train.tail(val_size)

# Only rows with Open > 0 are used for fitting and for validation, so the
# hold-out score is measured on the same population the model is trained on.
open_train = X_train[X_train['Open'] > 0]
open_valid = X_test[X_test['Open'] > 0]

# The target is log(Sales + 1); rmspe_xg inverts this with exp(x) - 1.
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: all feature_names must be alphanumerics". Presumably the
# installed xgboost rejects column names containing '_' (e.g. 'Mean_Sales');
# confirm the xgboost version before relying on DataFrame column names here.
dtrain = xgb.DMatrix(open_train[features], np.log(open_train["Sales"] + 1))
dvalid = xgb.DMatrix(open_valid[features], np.log(open_valid["Sales"] + 1))
dtest = xgb.DMatrix(test[features])

# xgboost applies early stopping to the LAST entry of `evals`, so the hold-out
# set must come last; with the training set last, stopping would track the
# training error instead.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, 
                feval=rmspe_xg)#, verbose_eval=True)

print("Validating")
# Score exactly the rows in dvalid. Previously every hold-out row was scored,
# including Open <= 0 rows: their zero Sales get weight 0 in rmspe but still
# count in the mean, which understated the reported error.
train_probs = gbm.predict(dvalid)
indices = train_probs < 0
train_probs[indices] = 0  # floor log-scale predictions at 0, i.e. Sales >= 0
error = rmspe(np.exp(train_probs) - 1, open_valid['Sales'].values)
print('error', error)

Train a XGBoost model
1017208    2013-01-01
Name: Date, dtype: object


ValueError: all feature_names must be alphanumerics

In [None]:
# Refit on every training row with Open > 0 (no hold-out split).
# The target is log(Sales + 1), matching the inverse applied in rmspe_xg.
train_open = train[train['Open'] > 0]
log_sales = np.log(train_open["Sales"] + 1)
dtrain = xgb.DMatrix(train_open[features], log_sales)
dtest = xgb.DMatrix(test[features])

# The training set is the only watchlist entry, so the early-stopping
# criterion below is evaluated on the training error.
watchlist = [(dtrain, 'train')]
gbm = xgb.train(
    params,
    dtrain,
    num_trees,
    evals=watchlist,
    early_stopping_rounds=50,
    feval=rmspe_xg,
)

Will train until train error hasn't decreased in 50 rounds.
[0]	train-rmspe:0.999713
[1]	train-rmspe:0.999508
[2]	train-rmspe:0.999227
[3]	train-rmspe:0.998845
[4]	train-rmspe:0.998335
[5]	train-rmspe:0.997666
[6]	train-rmspe:0.996800
[7]	train-rmspe:0.995700
[8]	train-rmspe:0.994319
[9]	train-rmspe:0.992611
[10]	train-rmspe:0.990528
[11]	train-rmspe:0.988014
[12]	train-rmspe:0.985019
[13]	train-rmspe:0.981496
[14]	train-rmspe:0.977343
[15]	train-rmspe:0.972636
[16]	train-rmspe:0.967263
[17]	train-rmspe:0.961145
[18]	train-rmspe:0.954329
[19]	train-rmspe:0.946743
[20]	train-rmspe:0.938246
[21]	train-rmspe:0.929084
[22]	train-rmspe:0.919124
[23]	train-rmspe:0.908363
[24]	train-rmspe:0.896658
[25]	train-rmspe:0.884170
[26]	train-rmspe:0.870981
[27]	train-rmspe:0.857263
[28]	train-rmspe:0.842884
[29]	train-rmspe:0.827893
[30]	train-rmspe:0.812328
[31]	train-rmspe:0.796061
[32]	train-rmspe:0.779516
[33]	train-rmspe:0.762581
[34]	train-rmspe:0.745251
[35]	train-rmspe:0.727498
[36]	train-rms

In [None]:
print("Make predictions on the test set")
test_matrix = xgb.DMatrix(test[features])
test_probs = gbm.predict(test_matrix)

# Predictions are on the log(Sales + 1) scale; floor them at 0 so the
# back-transformed sales are never negative.
indices = test_probs < 0
test_probs[indices] = 0
predicted_sales = np.exp(test_probs) - 1

# Write the Id / Sales pairs to the submission CSV.
submission = pd.DataFrame({"Id": test["Id"], "Sales": predicted_sales})
submission.to_csv("xgboost_lb_all.csv", index=False)