Import Modules
========

In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Load Data
=====

In [127]:
data = pd.read_csv('postProcessed.csv')
data.head()

  data = self._reader.read(nrows)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,DateTime,Year,Month,Day,WeekNumber
0,1,5,2015-07-31,5263,555,1,1,0,1,2015-07-31 00:00:00,2015,7,31,31
1,2,5,2015-07-31,6064,625,1,1,0,1,2015-07-31 00:00:00,2015,7,31,31
2,3,5,2015-07-31,8314,821,1,1,0,1,2015-07-31 00:00:00,2015,7,31,31
3,4,5,2015-07-31,13995,1498,1,1,0,1,2015-07-31 00:00:00,2015,7,31,31
4,5,5,2015-07-31,4822,559,1,1,0,1,2015-07-31 00:00:00,2015,7,31,31


In [128]:
data.describe()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,Year,Month,Day,WeekNumber
count,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0
mean,558.429727,3.998341,5773.818972,633.145946,0.830107,0.381515,0.178647,2013.832292,5.846762,15.70279,23.615515
std,321.908651,1.997391,3849.926175,464.411734,0.375539,0.485759,0.383056,0.777396,3.326097,8.787638,14.433381
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2013.0,1.0,1.0,1.0
25%,280.0,2.0,3727.0,405.0,1.0,0.0,0.0,2013.0,3.0,8.0,11.0
50%,558.0,4.0,5744.0,609.0,1.0,0.0,0.0,2014.0,6.0,16.0,22.0
75%,838.0,6.0,7856.0,837.0,1.0,1.0,0.0,2014.0,8.0,23.0,35.0
max,1115.0,7.0,41551.0,7388.0,1.0,1.0,1.0,2015.0,12.0,31.0,52.0


In [129]:
data.dtypes

Store             int64
DayOfWeek         int64
Date             object
Sales             int64
Customers         int64
Open              int64
Promo             int64
StateHoliday     object
SchoolHoliday     int64
DateTime         object
Year              int64
Month             int64
Day               int64
WeekNumber        int64
dtype: object

## Fix Variables

In [130]:
from sklearn.preprocessing import LabelEncoder
data.ix[:, ['StateHoliday']] = data.ix[:, ['StateHoliday']].astype(str)

le = LabelEncoder().fit(np.unique(data['StateHoliday']))

data.ix[:, 'StateHoliday'] = le.transform(data.ix[:, 'StateHoliday'])

In [None]:
data.dtypes

## Split the data

In [134]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data.drop(['Sales', 'Date', 'DateTime'], axis=1), 
                                                    data['Sales'], 
                                                    test_size=0.25, 
                                                    random_state=8675309)

## Run some feature selection to see variable importance

In [8]:
from sklearn.feature_selection import SelectKBest, f_regression

kb = SelectKBest(f_regression, k=5).fit(X_train, y_train)
for x in zip(X_train.columns, kb.get_support()):
    print(x)

('Store', False)
('DayOfWeek', True)
('Customers', True)
('Open', True)
('Promo', True)
('StateHoliday', True)
('SchoolHoliday', False)
('Year', False)
('Month', False)
('Day', False)
('WeekNumber', False)


From fiddling with $k$ here, we get the following variable importance (in order of descending importance)

1. Customers
- Open
- DayOfWeek
- Promo
- StateHoliday
- SchoolHoliday
- WeekNumber
- Month
- Year
- Day
- Store

## Fit a tree (using 5 most important variables)

In [11]:
from sklearn.tree import DecisionTreeRegressor

tree_one = DecisionTreeRegressor().fit(kb.transform(X_train), y_train)

In [12]:
fitted_tree_one = tree_one.predict(kb.transform(X_test))

In [13]:
from sklearn.metrics import mean_squared_error
from math import sqrt

sqrt(mean_squared_error(fitted_tree_one, y_test))



1457.4939705563581

## Fit a tree (using all variables)

In [14]:
tree_two = DecisionTreeRegressor().fit(X_train, y_train)

In [15]:
fitted_tree_two = tree_two.predict(X_test)

In [16]:
sqrt(mean_squared_error(fitted_tree_two, y_test))

1115.8512238500925

So we can see all variables are better than the best 5. Good to know

## Test Data doesn't have customers, last tree without using custs

In [131]:
tree_three = DecisionTreeRegressor().fit(X_train.drop('Customers', axis=1), y_train)

## Predicting and submitting, to get a submission in before the deadline

In [210]:
testing_data = pd.read_csv('test.csv', index_col=0)

testing_data['DateTime'] = pd.to_datetime(testing_data['Date'])
testing_data['Year'] = testing_data['DateTime'].apply(lambda x: x.year)
testing_data['Month'] = testing_data['DateTime'].apply(lambda x: x.month)
testing_data['Day'] = testing_data['DateTime'].apply(lambda x: x.day)
testing_data['WeekNumber'] = testing_data['DateTime'].apply(lambda x: x.isocalendar()[1])

testing_data.ix[:, 'StateHoliday'] = le.transform(testing_data.ix[:, 'StateHoliday'])
testing_data.ix[(testing_data['Open'] != 1) & (testing_data['Open'] != 0), 'Open'] = int(1)

testing_data.head()

Unnamed: 0_level_0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,DateTime,Year,Month,Day,WeekNumber
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,4,2015-09-17,1,1,0,0,2015-09-17,2015,9,17,38
2,3,4,2015-09-17,1,1,0,0,2015-09-17,2015,9,17,38
3,7,4,2015-09-17,1,1,0,0,2015-09-17,2015,9,17,38
4,8,4,2015-09-17,1,1,0,0,2015-09-17,2015,9,17,38
5,9,4,2015-09-17,1,1,0,0,2015-09-17,2015,9,17,38


In [140]:
testing_data['sales_prediction'] = tree_three.predict(testing_data.drop(['Date', 'DateTime'], axis=1))

In [144]:
testing_data['sales_prediction'].to_csv('firstSubmission.csv', header=['Sales'])

# # Trying some other methods

In [46]:
import statsmodels.api as sm

In [154]:
from sknn.mlp import Regressor, Layer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler


In [150]:

#convert to numpy array
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)






In [195]:
#Neural Network
pipeline = Pipeline([
        ('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('neural network', Regressor(
    layers=[
        Layer("Rectifier", units=3),
        Layer("Linear")],
    learning_rate=0.02,
    n_iter=1))])
#notes - smaller units are better; a layer linear is better than softmax, a rectifier linear is better than sigmoid

In [196]:
pipeline.fit(X_train, y_train)

  "got %s" % (estimator, X.dtype))
  X *= self.scale_
  X += self.min_


Pipeline(steps=[('min/max scaler', MinMaxScaler(copy=True, feature_range=(0.0, 1.0))), ('neural network', Regressor(batch_size=1, debug=False, dropout_rate=None, f_stable=0.001,
     hidden0=<sknn.nn.Layer `Rectifier`: name='hidden0', units=3, frozen=False>,
     layers=[<sknn.nn.Layer `Rectifier`: name='hid... valid_set=None, valid_size=0.0,

In [197]:
NN_two = pipeline.predict(X_test)


  X *= self.scale_
  X += self.min_


In [198]:
#RSME for a Neural Net
from sklearn.metrics import mean_squared_error
from math import sqrt

sqrt(mean_squared_error(NN_two, y_test))


3865.8275363804696

In [200]:
#Linear Support Vector Regression
from sklearn.svm import LinearSVR
clf = LinearSVR()
clf.fit(X_train, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

In [201]:
#RSME for a LinearSVR
LinearSVR_two = clf.predict(X_test)
sqrt(mean_squared_error(LinearSVR_two, y_test))


1540.9747031386044

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [204]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
AB = AdaBoostRegressor(DecisionTreeRegressor(),n_estimators = 100, random_state=8675309)


In [205]:
AB.fit(X_train, y_train)


AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, random_state=None,
           splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=100,
         random_state=8675309)

In [207]:
#AdaBoost Regressor
AB_two = AB.predict(X_test)
sqrt(mean_squared_error(AB_two, y_test)) 

789.796713080243

In [212]:
#get the pandas dataframes back 
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data.drop(['Sales', 'Date', 'DateTime'], axis=1), 
                                                    data['Sales'], 
                                                    test_size=0.25, 
                                                    random_state=8675309)

In [213]:

AB_three = AdaBoostRegressor(DecisionTreeRegressor(),n_estimators = 100, random_state=8675309).fit(X_train.drop('Customers', axis=1), y_train)

In [214]:
testing_data['sales_prediction'] = AB_three.predict(testing_data.drop(['Date', 'DateTime'], axis=1))

In [215]:
testing_data['sales_prediction'].to_csv('firstSubmission.csv', header=['Sales'])

In [208]:
#GradientBoosting
from sklearn.ensemble import GradientBoostingRegressor
GB = GradientBoostingRegressor(n_estimators = 300, random_state=8675309, loss='lad')
#Least absolute deviation ('lad'): A robust loss function for regression. The initial model is given by the median of the target values.


In [209]:
#Gradient Boosting Regressor
GB.fit(X_train, y_train)
GB_two = GB.predict(X_test)
sqrt(mean_squared_error(LinearSVR_two, y_test))

1540.9747031386044