Imports
====

In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Load Data
=====

In [146]:
new_data = pd.read_csv('store.csv')
data = pd.read_csv('postProcessed.csv')


## Fix Variables

In [147]:
data.ix[:, 'StateHoliday'] = data.ix[:, 'StateHoliday'].astype(str)
new_data.ix[:, ['Promo2SinceWeek', 
                'Promo2SinceYear', 
                'PromoInterval',
                'CompetitionDistance']] = new_data.ix[:, ['Promo2SinceWeek', 
                                                          'Promo2SinceYear', 
                                                          'PromoInterval',
                                                          'CompetitionDistance']].fillna(0)

In [148]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

enc = LabelEncoder()
enc.fit(np.unique(data['StateHoliday']))

data.ix[:, ['StateHoliday']] = enc.transform(data['StateHoliday'])


enc = LabelEncoder()
enc.fit(np.unique(new_data['StoreType']))
new_data.ix[:, ['StoreType']] = enc.transform(new_data['StoreType'])


enc = LabelEncoder()
enc.fit(np.unique(new_data['Assortment']))
new_data.ix[:, ['Assortment']] = enc.transform(new_data['Assortment'])



In [149]:
from sklearn.feature_extraction import DictVectorizer

#  ohe = OneHotEncoder().fit([list(np.unique(data.ix[:,'StateHoliday'])), 
#                             list(np.unique(new_data.ix[:, 'StoreType'])), 
#                             list(np.unique(new_data.ix[:, 'Assortment']))])
dv = DictVectorizer()    
test = dv.fit(new_data.ix[:, ['StoreType', 'Assortment']].to_dict(orient='records'))

#test.transform(new_data.ix[:, ['StoreType', 'Assortment']])

In [150]:


joined = data.merge(new_data, how='outer', on='Store')
joined.dtypes

Store                          int64
DayOfWeek                      int64
Date                          object
Sales                          int64
Customers                      int64
Open                           int64
Promo                          int64
StateHoliday                   int64
SchoolHoliday                  int64
DateTime                      object
Year                           int64
Month                          int64
Day                            int64
WeekNumber                     int64
StoreType                      int64
Assortment                     int64
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
dtype: object

In [151]:
joined.describe()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekNumber,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear
count,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,693861.0,693861.0,1017209.0,1017209.0,1017209.0
mean,558.429727,3.998341,5773.818972,633.145946,0.830107,0.381515,0.045163,0.178647,2013.832292,5.846762,15.70279,23.615515,1.207467,0.935141,5415.982074,7.222866,2008.690228,0.500564,11.647665,1007.010608
std,321.908651,1.997391,3849.926175,464.411734,0.375539,0.485759,0.283656,0.383056,0.777396,3.326097,8.787638,14.433381,1.365376,0.993801,7710.252708,3.211832,5.992644,0.5,15.323928,1005.87693
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1900.0,0.0,0.0,0.0
25%,280.0,2.0,3727.0,405.0,1.0,0.0,0.0,0.0,2013.0,3.0,8.0,11.0,0.0,0.0,700.0,4.0,2006.0,0.0,0.0,0.0
50%,558.0,4.0,5744.0,609.0,1.0,0.0,0.0,0.0,2014.0,6.0,16.0,22.0,0.0,0.0,2320.0,8.0,2010.0,1.0,1.0,2009.0
75%,838.0,6.0,7856.0,837.0,1.0,1.0,0.0,0.0,2014.0,8.0,23.0,35.0,3.0,2.0,6880.0,10.0,2013.0,1.0,22.0,2012.0
max,1115.0,7.0,41551.0,7388.0,1.0,1.0,3.0,1.0,2015.0,12.0,31.0,52.0,3.0,2.0,75860.0,12.0,2015.0,1.0,50.0,2015.0


## Split data

In [168]:
from sklearn.cross_validation import train_test_split

mask = joined['Sales'] > 0 

response = joined.ix[mask, ['Sales']]

columns = ['Store', 
           'DayOfWeek', 
           'Open', 
           'Promo', 
           'WeekNumber']



X_train, X_test, y_train, y_test = train_test_split(joined.ix[mask,columns], response, test_size=0.25, random_state=8675309)

## Fit Tree

In [169]:
from sklearn.tree import DecisionTreeRegressor

explore_tree = DecisionTreeRegressor().fit(X_train, y_train)
pred = explore_tree.predict(X_test)


In [170]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(pred, y_test))

print(rmse)

1511.06340951


## Make submission

In [112]:
testing_data = pd.read_csv('test.csv', index_col=0)

testing_data['DateTime'] = pd.to_datetime(testing_data['Date'])
testing_data['Year'] = testing_data['DateTime'].apply(lambda x: x.year)
testing_data['Month'] = testing_data['DateTime'].apply(lambda x: x.month)
testing_data['Day'] = testing_data['DateTime'].apply(lambda x: x.day)
testing_data['WeekNumber'] = testing_data['DateTime'].apply(lambda x: x.isocalendar()[1])
testing_data.ix[:, 'StateHoliday'] = testing_data.ix[:, 'StateHoliday'].astype(str)

enc = LabelEncoder()
enc.fit(np.unique(testing_data['StateHoliday']))

testing_data.ix[:, 'StateHoliday'] = enc.transform(testing_data.ix[:, 'StateHoliday'])
testing_data.ix[(testing_data['Open'] != 1) & (testing_data['Open'] != 0), 'Open'] = int(1)

big_test = testing_data.merge(new_data, how='inner', on='Store')

# can't figure out how to get the index to work
big_test.index = big_test.index + 1

In [113]:
columns = ['Store',  
           'DayOfWeek',
           'Open', 
           'Promo', 
           'StateHoliday', 
           'SchoolHoliday',
           'Year', 
           'Month', 
           'Day', 
           'WeekNumber',
           'CompetitionDistance']

big_test['sales_prediction'] = explore_tree.predict(big_test.ix[:, columns])


In [114]:
big_test['sales_prediction'].to_csv('bigDataSubmission.csv', header=['Sales'])

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,DateTime,Year,Month,...,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,sales_prediction
1,1,4,2015-09-17,1,1,0,0,2015-09-17,2015,9,...,2,0,1270,9,2008,0,0,0,0,3740
2,1,3,2015-09-16,1,1,0,0,2015-09-16,2015,9,...,2,0,1270,9,2008,0,0,0,0,4654
3,1,2,2015-09-15,1,1,0,0,2015-09-15,2015,9,...,2,0,1270,9,2008,0,0,0,0,4381
4,1,1,2015-09-14,1,1,0,0,2015-09-14,2015,9,...,2,0,1270,9,2008,0,0,0,0,5402
5,1,7,2015-09-13,0,0,0,0,2015-09-13,2015,9,...,2,0,1270,9,2008,0,0,0,0,0
