In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn import cross_validation
import xgboost as xgb

def ToWeight(y):
    """Return the per-element RMSPE weights ``1 / y**2``.

    Parameters
    ----------
    y : array-like
        Target values (NumPy array, pandas Series, list, ...).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``y``. Entries where ``y`` is
        zero get weight 0 so they never cause a division by zero.
    """
    # Convert to float *before* squaring: squaring in a fixed-width integer
    # dtype (e.g. int32) silently wraps around for large targets and would
    # produce wrong weights.
    values = np.asarray(y, dtype=float)
    w = np.zeros(values.shape, dtype=float)
    nonzero = values != 0
    w[nonzero] = 1. / np.square(values[nonzero])
    return w

def rmspe(yhat, y):
    """Root mean squared percentage error of predictions ``yhat`` against ``y``.

    Each squared error is scaled by ``ToWeight(y)`` (``1 / y**2``, or 0 where
    ``y`` is zero) and the scaled values are averaged before taking the
    square root. Note that zero targets contribute nothing to the sum but
    are still counted by the mean.
    """
    squared_error = (y - yhat) ** 2
    return np.sqrt(np.mean(ToWeight(y) * squared_error))

def rmspe_xg(yhat, y):
    """RMSPE wrapper returning a ``(name, value)`` pair.

    ``y`` must expose ``get_label()`` (presumably an ``xgboost.DMatrix`` --
    confirm against the caller). Both the labels and the predictions are
    mapped through ``exp(x) - 1`` before scoring, so they are expected to be
    on a log(1 + x) scale.
    """
    actual = np.exp(y.get_label()) - 1
    predicted = np.exp(yhat) - 1
    # Same weighted-error formula as rmspe(); reuse it instead of repeating it.
    return "rmspe", rmspe(predicted, actual)

In [3]:
# Load the prepared training and test tables.
# low_memory=False makes pandas read each file in one pass instead of in
# chunks, which avoids mixed-dtype columns.
#
# The feature list that used to be declared here was never used: `features`
# is reassigned further down, before the first model is fitted, so only that
# later definition is kept.
train = pd.read_csv('train_nnew.csv', low_memory=False)
test = pd.read_csv('test_nnew.csv', low_memory=False)

In [4]:
# Derive numeric year / month / day columns from train['Date'].
# Parsing once with pd.to_datetime and reading the .dt accessors is vectorised
# (no per-row Python lambda) and also works if the Date column has already
# been converted to datetime, so this cell can safely be re-run.
train_dates = pd.to_datetime(train['Date'])
train['year'] = train_dates.dt.year.astype(float)
train['month'] = train_dates.dt.month.astype(float)
train['day'] = train_dates.dt.day.astype(float)

In [5]:
# Derive numeric year / month / day columns from test['Date'].
# Parsing once with pd.to_datetime and reading the .dt accessors is vectorised
# (no per-row Python lambda) and also works if the Date column has already
# been converted to datetime, so this cell can safely be re-run.
test_dates = pd.to_datetime(test['Date'])
test['year'] = test_dates.dt.year.astype(float)
test['month'] = test_dates.dt.month.astype(float)
test['day'] = test_dates.dt.day.astype(float)

In [6]:
# Fold the StoreType_* and Assortment_* columns (presumably 0/1 indicators --
# confirm against the preprocessing step) into one integer-coded column each:
# a -> 1, b -> 2, c -> 3, d -> 4. The same encoding is applied to both frames.
for frame in (train, test):
    frame['StoreType'] = (frame['StoreType_a']
                          + 2 * frame['StoreType_b']
                          + 3 * frame['StoreType_c']
                          + 4 * frame['StoreType_d'])
    frame['Assortment'] = (frame['Assortment_a']
                           + 2 * frame['Assortment_b']
                           + 3 * frame['Assortment_c'])

In [7]:
# Columns passed to the regressors for fitting and prediction.
features = [
    # open / promotion / holiday flags
    u'Open',
    u'Promo',
    u'SchoolHoliday',
    u'StateHoliday_0',
    u'StateHoliday_a',
    u'DayOfWeek',
    # store-level and calendar columns
    u'CompetitionDistance',
    u'Promo2',
    'year',
    'Mean_Sales',
    'month',
    'day',
    # integer-coded replacements for the StoreType_* / Assortment_* columns
    u'StoreType',
    u'Assortment',
    u'CompetitionOpen',
]

In [8]:
# Convert the Date columns to real datetimes.
# pd.to_datetime is used instead of .astype('datetime64'): the unit-less
# 'datetime64' cast is rejected by current pandas releases, while
# pd.to_datetime is the supported parser and is a no-op on columns that are
# already datetime.
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

In [9]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [10]:
# Hold out 5% of the training rows, fit a random forest and an extra-trees
# regressor on the remaining 95%, and report each model's RMSPE on the
# hold-out rows. The printed "acc" value is an error (lower is better).
X_train, X_test = cross_validation.train_test_split(train, test_size=0.05, random_state = 1)
rf = RandomForestRegressor(n_estimators = 100, n_jobs = -1, random_state = 1)
xt = ExtraTreesRegressor(n_estimators = 100, n_jobs = -1, random_state = 1)
for label, model in (('rf', rf), ('xt', xt)):
    # Single-argument print(...) produces the same text under the Python 2
    # print statement and the Python 3 print function.
    print(label)
    model.fit(X_train[features], X_train['Sales'])
    holdout_score = rmspe(model.predict(X_test[features]), X_test['Sales'])
    print('acc = %s' % (holdout_score,))

rf
acc = 0.000805109792152
xt
acc = 0.000378444497897




In [11]:
# Predict Sales for the test table with the extra-trees model. `xt` is the
# model fitted on the 95% training split above; it is not refitted on the
# full training data before predicting.
preds = xt.predict(test[features])

In [12]:
# Build the two-column submission table (Id, Sales) and write it out without
# the pandas index.
submission = test[["Id"]].copy()
submission["Sales"] = preds
submission.to_csv("xtt_n100.csv", index=False)