In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn import tree

## Load the training set and fit a decision tree on it

In [2]:
# Load the training data; StateHoliday is forced to str so it is read with a
# single dtype, and Date is parsed so the .dt accessors below work.
train = pd.read_csv("../data/train.csv", dtype={'StateHoliday': str}, parse_dates=["Date"])

# Keep only rows with positive sales. The explicit .copy() makes `train` an
# independent frame, so the column assignments below do not write into a
# slice of the original (which triggers SettingWithCopyWarning).
train = train.loc[train.Sales > 0].copy()

# Calendar features derived from Date (not part of `columns` below).
train['month'] = train['Date'].dt.month
train['day'] = train['Date'].dt.day

# Features shared by the tree model and the per-group medians.
columns = ['Store', 'DayOfWeek', 'Promo']

# A fixed random_state makes tie-breaking between equally good splits
# reproducible across runs.
dt = tree.DecisionTreeRegressor(random_state=0)
model = dt.fit(train[columns], train['Sales'])

## Compute median sales per group and prepare the solution file

In [4]:
# Median sales for every combination of the grouping columns seen in training.
medians = train.groupby(columns)['Sales'].median().reset_index()

test = pd.read_csv("../data/test.csv", dtype={'StateHoliday': str}, parse_dates=["Date"])
# Rows with a missing Open flag are treated as open (Open = 1).
test.loc[test.Open.isnull(), 'Open'] = 1
test['month'] = test['Date'].dt.month
test['day'] = test['Date'].dt.day

# A left join keeps every test row; the assert guards against duplicated keys
# in `medians` silently multiplying rows.
test2 = pd.merge(test, medians, on=columns, how='left')
assert len(test2) == len(test), "merge must not add or drop test rows"

# Rows flagged Open == 0 are forced to zero sales.
test2.loc[test2.Open == 0, 'Sales'] = 0

# Any remaining NaN means the row's group never occurred in training. Without
# a fallback, astype(int) below raises on the non-finite values. Fall back to
# the store's overall median, then to the global median for unseen stores.
missing = test2['Sales'].isnull()
if missing.any():
    store_medians = train.groupby('Store')['Sales'].median()
    test2.loc[missing, 'Sales'] = test2.loc[missing, 'Store'].map(store_medians)
    test2['Sales'] = test2['Sales'].fillna(train['Sales'].median())

test2[['Id', 'Sales']].astype(int).to_csv('../data/solution.csv', index=False)

## Validate the decision tree on a hold-out split (RMSPE)

In [3]:
# Split the training data into a fitting part and a hold-out part. The fixed
# random_state makes the split, and therefore the reported score, reproducible.
(cv_train, cv_test) = cross_validation.train_test_split(train, random_state=0)

# Fit a fresh tree on the fitting part only; seeded for the same reason.
dt = tree.DecisionTreeRegressor(random_state=0)
model = dt.fit(cv_train[columns], cv_train['Sales'])
y = model.predict(cv_test[columns])

# Root mean squared percentage error: sqrt(mean(((actual - predicted) / actual)^2)).
# The division is safe because `train` was filtered to Sales > 0.
spe = ((cv_test['Sales'] - y) / cv_test['Sales']) ** 2.0
rmspe = np.sqrt(spe.sum() / len(spe))
print("RMSPE=%f" % rmspe)

RMSPE=0.189173
