In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn import cross_validation
import xgboost as xgb

def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe

def rmspe_xg(yhat, y):
    # y = y.values
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

In [3]:
train = pd.read_csv('train_nnew.csv', low_memory=False)
test = pd.read_csv('test_nnew.csv', low_memory=False)
features = [u'Open', u'Promo', u'SchoolHoliday', u'StateHoliday_0',
       u'StateHoliday_a', u'DayOfWeek_1', u'DayOfWeek_2', u'DayOfWeek_3',
       u'DayOfWeek_4', u'DayOfWeek_5', u'DayOfWeek_6', u'DayOfWeek_7',
       u'CompetitionDistance', u'Promo2', 'year', 'Mean_Sales', 'month', 'day',
       u'StoreType_a', u'StoreType_b', u'StoreType_c', u'StoreType_d',
       u'Assortment_a', u'Assortment_b', u'Assortment_c', u'CompetitionOpen']

In [4]:
train['year'] = train.Date.apply(lambda x: x.split('-')[0])
train['year'] = train['year'].astype(float)
train['month'] = train.Date.apply(lambda x: x.split('-')[1])
train['month'] = train['month'].astype(float)
train['day'] = train.Date.apply(lambda x: x.split('-')[2])
train['day'] = train['day'].astype(float)

In [5]:
test['year'] = test.Date.apply(lambda x: x.split('-')[0])
test['year'] = test['year'].astype(float)
test['month'] = test.Date.apply(lambda x: x.split('-')[1])
test['month'] = test['month'].astype(float)
test['day'] = test.Date.apply(lambda x: x.split('-')[2])
test['day'] = test['day'].astype(float)

In [6]:
train['Date'] = train['Date'].astype('datetime64')
test['Date'] = test['Date'].astype('datetime64')

In [8]:
train_x = train[train['Store'] == 1]

In [52]:
params = {"objective": "reg:linear", 
          "eta": 0.2, 
          "max_depth": 7, 
          "subsample": 0.7, 
          "colsample_bytree": 0.7, 
          "silent": 1, 
          "lambda" : 20, 
          "alpha" : 1 
         } 
num_trees = 500

In [53]:
print("Train a XGBoost model")
val_size = 50000
#train = train.sort(['Date'])
print(train.tail(1)['Date'])
X_train, X_test = cross_validation.train_test_split(train_x, test_size=0.05, random_state = 1)
#X_train, X_test = train.head(len(train) - val_size), train.tail(val_size)
dtrain = xgb.DMatrix(X_train[X_train['Open'] > 0][features], np.log(X_train[X_train['Open'] > 0]["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[X_test['Open'] > 0][features], np.log(X_test[X_test['Open'] > 0]["Sales"] + 1))
dtest = xgb.DMatrix(test[test['Store'] == 1][features])
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, 
                feval=rmspe_xg)#, verbose_eval=True)

print("Validating")
train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
indices = train_probs < 0
train_probs[indices] = 0
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

Will train until train error hasn't decreased in 50 rounds.
[0]	eval-rmspe:0.998537	train-rmspe:0.998555
[1]	eval-rmspe:0.994424	train-rmspe:0.994492
[2]	eval-rmspe:0.984488	train-rmspe:0.984678
[3]	eval-rmspe:0.965069	train-rmspe:0.965495
[4]	eval-rmspe:0.932816	train-rmspe:0.933634
[5]	eval-rmspe:0.886360	train-rmspe:0.887738
[6]	eval-rmspe:0.826284	train-rmspe:0.828377
[7]	eval-rmspe:0.756416	train-rmspe:0.759324
[8]	eval-rmspe:0.680261	train-rmspe:0.684028
[9]	eval-rmspe:0.602297	train-rmspe:0.606894
[10]	eval-rmspe:0.526428	train-rmspe:0.531754
[11]	eval-rmspe:0.456031	train-rmspe:0.461912
[12]	eval-rmspe:0.394815	train-rmspe:0.400258
[13]	eval-rmspe:0.338997	train-rmspe:0.345561
[14]	eval-rmspe:0.293662	train-rmspe:0.299762
[15]	eval-rmspe:0.254022	train-rmspe:0.260370
[16]	eval-rmspe:0.223594	train-rmspe:0.228607
[17]	eval-rmspe:0.199555	train-rmspe:0.203360
[18]	eval-rmspe:0.183361	train-rmspe:0.185020
[19]	eval-rmspe:0.172439	train-rmspe:0.172113
[20]	eval-rmspe:0.162736	train

Train a XGBoost model
1017208   2013-01-01
Name: Date, dtype: datetime64[ns]
Validating
('error', 0.09582728093942805)


[498]	eval-rmspe:0.103871	train-rmspe:0.077542
[499]	eval-rmspe:0.103685	train-rmspe:0.077500


In [75]:
stores = np.unique(test.Store)

In [100]:
ans = test[['Id']]

In [60]:
test_x = test[test['Store'] == 1]

In [91]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
for store in stores:
    print 'Store num', store
    train_x = train[train['Store'] == store]
    test_x = test[test['Store'] == store]
    dtrain = xgb.DMatrix(train_x[train_x['Open'] > 0][features], np.log(train_x[train_x['Open'] > 0]["Sales"] + 1))
    dtest = xgb.DMatrix(test[test['Store'] == store][features])
    gbm = xgb.train(params, dtrain, num_trees, 
                feval=rmspe_xg)
    test_probs = gbm.predict(dtest)
    indices = test_probs < 0
    test_probs[indices] = 0
    preds = np.exp(test_probs) - 1
    ans.loc[test_x.Id-1, 'Sales'] = preds

In [101]:
for store in stores:
    print 'Store num', store
    train_x = train[train['Store'] == store]
    test_x = test[test['Store'] == store]
    rf = RandomForestRegressor(n_estimators = 150, n_jobs = -1)
    rf.fit(train_x[train_x['Open'] > 0][features], np.log(train_x[train_x['Open'] > 0]["Sales"] + 1))
    test_probs = rf.predict(test_x[features])
    indices = test_probs < 0
    test_probs[indices] = 0
    preds = np.exp(test_probs) - 1
    ans.loc[test_x.Id-1, 'Sales'] = preds

Store num 1
Store num 3
Store num 7
Store num 8
Store num 9
Store num 10
Store num 11
Store num 12
Store num 13
Store num 14
Store num 15
Store num 16
Store num 19
Store num 20
Store num 21
Store num 22
Store num 23
Store num 24
Store num 25
Store num 27
Store num 29
Store num 30
Store num 31
Store num 32
Store num 33
Store num 35
Store num 36
Store num 38
Store num 39
Store num 40
Store num 41
Store num 42
Store num 43
Store num 45
Store num 46
Store num 47
Store num 48
Store num 49
Store num 50
Store num 51
Store num 52
Store num 53
Store num 56
Store num 58
Store num 61
Store num 62
Store num 63
Store num 64
Store num 66
Store num 67
Store num 68
Store num 69
Store num 70
Store num 71
Store num 72
Store num 73
Store num 74
Store num 75
Store num 76
Store num 77
Store num 79
Store num 80
Store num 81
Store num 82
Store num 83
Store num 84
Store num 86
Store num 89
Store num 90
Store num 91
Store num 92
Store num 93
Store num 94
Store num 98
Store num 99
Store num 100
Store num 101
St

In [102]:
ans

Unnamed: 0,Id,Sales
0,1,4304.657987
1,2,7604.018290
2,3,8980.969743
3,4,8019.223512
4,5,7818.644536
5,6,5820.787738
6,7,7965.396806
7,8,8313.523756
8,9,5621.534121
9,10,5649.012960


In [103]:
ans.to_csv("rf_ind.csv", index=False)