In [37]:
import pandas as pd
import numpy as np

## (1) Select input(explanatory) variables to predict sales.

- DayOfWeek : Mon-Sun can affect sales.
- Date : Date can affect sales. I changed this value into DatetimeIndex.
- Open : When it is 0, the sales become 0.
- Promo : Conducting this can affect increasing sales.
- StateHoliday_0, StateHoliday_a, StateHoliday_b, StateHoliday_c : I used one-hot-encoding as there are 4 types. When I used unique function I found out that 0 value is separated into two(integer 0 and string ‘0’). So I had to merge the values.
- PromoInterval_0, PromoInterval_Feb,May,Aug,Nov, PromoInterval_Jan,Apr,Jul,Oct, PromoInterval_Mar,Jun,Sept,Dec : When I used unique function, only four values (NaN and the other three) existed. Also I checked that if Promo2 is 0 then, Promo2Interval is always NaN. So I did use one-hot-encoding.
- CompetitionOpen_Closed, CompetitionOpen_Not known, CompetitionOpen_Open : I made a new column describing whether the competition store is open or not comparing CompetitionOpenSince column in store.csv and Date column in train.csv.
- SalePerCustomer : To obtain meaning feature, I divided sales into customers.

In [38]:
# Reading csv files
train_file = pd.read_csv("/Users/jeewonkim/Desktop/train.csv")
store_file = pd.read_csv("/Users/jeewonkim/Desktop/store.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [39]:
combined = pd.merge(train_file, store_file, how = 'inner', on = 'Store')

In [40]:
combined.dtypes

Store                          int64
DayOfWeek                      int64
Date                          object
Sales                          int64
Customers                      int64
Open                           int64
Promo                          int64
StateHoliday                  object
SchoolHoliday                  int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
dtype: object

## Feature Engineering

In [41]:
combined['StateHoliday'].unique()

array(['0', 'a', 'b', 'c', 0], dtype=object)

In [42]:
combined['StateHoliday'].value_counts()

0    855087
0    131072
a     20260
b      6690
c      4100
Name: StateHoliday, dtype: int64

In [43]:
combined_dummies = pd.get_dummies(combined, columns = ['StateHoliday'])
combined_dummies.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'StateHoliday_0',
       'StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c'],
      dtype='object')

In [44]:
combined_dummies['StateHoliday_0_real']= combined_dummies.iloc[:,17:18].values + combined_dummies.iloc[:,18:19].values
combined_dummies.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,StoreType,Assortment,...,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,StateHoliday_0,StateHoliday_0.1,StateHoliday_a,StateHoliday_b,StateHoliday_c,StateHoliday_0_real
0,1,5,2015-07-31,5263,555,1,1,1,c,a,...,0,,,,0,1,0,0,0,1
1,1,4,2015-07-30,5020,546,1,1,1,c,a,...,0,,,,0,1,0,0,0,1
2,1,3,2015-07-29,4782,523,1,1,1,c,a,...,0,,,,0,1,0,0,0,1
3,1,2,2015-07-28,5011,560,1,1,1,c,a,...,0,,,,0,1,0,0,0,1
4,1,1,2015-07-27,6102,612,1,1,1,c,a,...,0,,,,0,1,0,0,0,1


## (2)

(2)-1 Include variables generated from Promo2 and Promo2Interval

In [45]:
combined_dummies.dtypes

Store                          int64
DayOfWeek                      int64
Date                          object
Sales                          int64
Customers                      int64
Open                           int64
Promo                          int64
SchoolHoliday                  int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
StateHoliday_0                 uint8
StateHoliday_0                 uint8
StateHoliday_a                 uint8
StateHoliday_b                 uint8
StateHoliday_c                 uint8
StateHoliday_0_real            uint8
dtype: object

In [46]:
combined_dummies.query('PromoInterval.isnull() & Promo2 == 0', engine='python').shape

(508031, 23)

In [47]:
combined_dummies.query('PromoInterval.isnull()', engine='python').shape

(508031, 23)

In [48]:
combined_dummies.query('Promo2.isnull()', engine='python').shape

(0, 23)

When Promo2 is 0, PromoInterval is always 0. From this I filled null value as 0.

In [49]:
combined_dummies['PromoInterval'] = combined_dummies['PromoInterval'].replace(np.NaN,0)

In [50]:
combined_dummies['PromoInterval']

0                         0
1                         0
2                         0
3                         0
4                         0
                 ...       
1017204    Mar,Jun,Sept,Dec
1017205    Mar,Jun,Sept,Dec
1017206    Mar,Jun,Sept,Dec
1017207    Mar,Jun,Sept,Dec
1017208    Mar,Jun,Sept,Dec
Name: PromoInterval, Length: 1017209, dtype: object

In [55]:
combined_dummied = pd.get_dummies(combined_dummies, columns = ['PromoInterval'])

In [56]:
combined_dummied

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,StoreType,Assortment,...,StateHoliday_0,StateHoliday_0.1,StateHoliday_a,StateHoliday_b,StateHoliday_c,StateHoliday_0_real,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec"
0,1,5,2015-07-31,5263,555,1,1,1,c,a,...,0,1,0,0,0,1,1,0,0,0
1,1,4,2015-07-30,5020,546,1,1,1,c,a,...,0,1,0,0,0,1,1,0,0,0
2,1,3,2015-07-29,4782,523,1,1,1,c,a,...,0,1,0,0,0,1,1,0,0,0
3,1,2,2015-07-28,5011,560,1,1,1,c,a,...,0,1,0,0,0,1,1,0,0,0
4,1,1,2015-07-27,6102,612,1,1,1,c,a,...,0,1,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1115,6,2013-01-05,4771,339,1,0,1,d,c,...,0,1,0,0,0,1,0,0,0,1
1017205,1115,5,2013-01-04,4540,326,1,0,1,d,c,...,0,1,0,0,0,1,0,0,0,1
1017206,1115,4,2013-01-03,4297,300,1,0,1,d,c,...,0,1,0,0,0,1,0,0,0,1
1017207,1115,3,2013-01-02,3697,305,1,0,1,d,c,...,0,1,0,0,0,1,0,0,0,1


(2)-2 Include variable(s) from CompetitionDistance

In [57]:
combined_dummied.query('CompetitionDistance.isnull()', engine='python').shape

(2642, 26)

I replaced null values with average of CompetitionDistance

In [58]:
combined_dummied['CompetitionDistance'] = combined_dummied['CompetitionDistance'].replace(np.NaN,np.mean(combined_dummied['CompetitionDistance']))

In [59]:
combined_dummied.query('CompetitionDistance.isnull()', engine='python').shape

(0, 26)

In [60]:
from sklearn.preprocessing import StandardScaler

store_df = pd.DataFrame(combined_dummied, columns = combined_dummied.columns)
scaler = StandardScaler()
scaler.fit(store_df[['CompetitionDistance']])
store_df['CompetitionDistance'] = scaler.transform(store_df[['CompetitionDistance']])

In [61]:
combined_dummied['CompetitionDistance'] = store_df['CompetitionDistance']

In [62]:
combined_dummied['CompetitionDistance']

0         -0.539900
1         -0.539900
2         -0.539900
3         -0.539900
4         -0.539900
             ...   
1017204   -0.010394
1017205   -0.010394
1017206   -0.010394
1017207   -0.010394
1017208   -0.010394
Name: CompetitionDistance, Length: 1017209, dtype: float64

(2)-3. Include variable(s) from CompetitionOpenSince[Month/Year]

In [63]:
combined_dummied['CompetitionOpenSinceMonth'].unique()

array([ 9., 11., 12.,  4., 10.,  8., nan,  3.,  6.,  5.,  1.,  2.,  7.])

In [64]:
combined_dummied['CompetitionOpen'] = "Not known"

In [65]:
combined_dummied['Year'] = pd.to_datetime(combined_dummied['Date']).dt.year
combined_dummied['Month'] = pd.to_datetime(combined_dummied['Date']).dt.month

In [80]:
combined_dummied

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,StoreType,Assortment,...,StateHoliday_b,StateHoliday_c,StateHoliday_0_real,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",CompetitionOpen,Year,Month
0,1,5,2015-07-31,5263,555,1,1,1,c,a,...,0,0,1,1,0,0,0,Not known,2015,7
1,1,4,2015-07-30,5020,546,1,1,1,c,a,...,0,0,1,1,0,0,0,Not known,2015,7
2,1,3,2015-07-29,4782,523,1,1,1,c,a,...,0,0,1,1,0,0,0,Not known,2015,7
3,1,2,2015-07-28,5011,560,1,1,1,c,a,...,0,0,1,1,0,0,0,Not known,2015,7
4,1,1,2015-07-27,6102,612,1,1,1,c,a,...,0,0,1,1,0,0,0,Not known,2015,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1115,6,2013-01-05,4771,339,1,0,1,d,c,...,0,0,1,0,0,0,1,Not known,2013,1
1017205,1115,5,2013-01-04,4540,326,1,0,1,d,c,...,0,0,1,0,0,0,1,Not known,2013,1
1017206,1115,4,2013-01-03,4297,300,1,0,1,d,c,...,0,0,1,0,0,0,1,Not known,2013,1
1017207,1115,3,2013-01-02,3697,305,1,0,1,d,c,...,0,0,1,0,0,0,1,Not known,2013,1


In [67]:
combined_dummied['Year'].unique()

array([2015, 2014, 2013])

In [68]:
combined_dummied['Month'].unique()

array([ 7,  6,  5,  4,  3,  2,  1, 12, 11, 10,  9,  8])

In [69]:
combined_dummied['CompetitionOpenSinceMonth'].unique()

array([ 9., 11., 12.,  4., 10.,  8., nan,  3.,  6.,  5.,  1.,  2.,  7.])

In [70]:
combined_dummied['CompetitionOpenSinceYear'].unique()

array([2008., 2007., 2006., 2009., 2015., 2013., 2014., 2000., 2011.,
         nan, 2010., 2005., 1999., 2003., 2012., 2004., 2002., 1961.,
       1995., 2001., 1990., 1994., 1900., 1998.])

In [71]:
# %time

# for i in range(0, combined_dummied['CompetitionOpenSinceYear'].size):
#     if combined_dummied['CompetitionOpenSinceYear'][i] == 0.0:
#         pass
#     else:
#         if combined_dummied['CompetitionOpenSinceYear'][i] > combined_dummied['Year'][i]:
#             combined_dummied['CompetitionOpen'][i] = "Closed"
#         elif combined_dummied['CompetitionOpenSinceYear'][i] < combined_dummied['Year'][i]:
#             combined_dummied['CompetitionOpen'][i] = "Open"
#         elif combined_dummied['CompetitionOpenSinceYear'][i] == combined_dummied['Year'][i]:
#             if combined_dummied['CompetitionOpenSinceMonth'][i] > combined_dummied['Month'][i]:
#                 combined_dummied['CompetitionOpen'][i] = "Closed"
#             elif combined_dummied['CompetitionOpenSinceMonth'][i] < combined_dummied['Month'][i]:
#                 combined_dummied['CompetitionOpen'][i] = "Open"
#             else:
#                 pass

As it took so long time to get the result, I separately saved a filed called temp_combined.csv to 'CompetitionOpen' feature quickly.

In [74]:
temp_combined = pd.read_csv("/Users/jeewonkim/Desktop/temp_combined.csv")

In [86]:
combined_dummied['CompetitionOpen'] = temp_combined['CompetitionOpen']

In [87]:
combined_dummied

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,StoreType,Assortment,...,StateHoliday_b,StateHoliday_c,StateHoliday_0_real,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",CompetitionOpen,Year,Month
0,1,5,2015-07-31,5263,555,1,1,1,c,a,...,0,0,1,1,0,0,0,Open,2015,7
1,1,4,2015-07-30,5020,546,1,1,1,c,a,...,0,0,1,1,0,0,0,Open,2015,7
2,1,3,2015-07-29,4782,523,1,1,1,c,a,...,0,0,1,1,0,0,0,Open,2015,7
3,1,2,2015-07-28,5011,560,1,1,1,c,a,...,0,0,1,1,0,0,0,Open,2015,7
4,1,1,2015-07-27,6102,612,1,1,1,c,a,...,0,0,1,1,0,0,0,Open,2015,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1115,6,2013-01-05,4771,339,1,0,1,d,c,...,0,0,1,0,0,0,1,Not known,2013,1
1017205,1115,5,2013-01-04,4540,326,1,0,1,d,c,...,0,0,1,0,0,0,1,Not known,2013,1
1017206,1115,4,2013-01-03,4297,300,1,0,1,d,c,...,0,0,1,0,0,0,1,Not known,2013,1
1017207,1115,3,2013-01-02,3697,305,1,0,1,d,c,...,0,0,1,0,0,0,1,Not known,2013,1


In [88]:
combined_dummied2 = pd.get_dummies(combined_dummied, columns = ['CompetitionOpen'])

In [90]:
combined_dummied2

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,StoreType,Assortment,...,StateHoliday_0_real,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",Year,Month,CompetitionOpen_Closed,CompetitionOpen_Not known,CompetitionOpen_Open
0,1,5,2015-07-31,5263,555,1,1,1,c,a,...,1,1,0,0,0,2015,7,0,0,1
1,1,4,2015-07-30,5020,546,1,1,1,c,a,...,1,1,0,0,0,2015,7,0,0,1
2,1,3,2015-07-29,4782,523,1,1,1,c,a,...,1,1,0,0,0,2015,7,0,0,1
3,1,2,2015-07-28,5011,560,1,1,1,c,a,...,1,1,0,0,0,2015,7,0,0,1
4,1,1,2015-07-27,6102,612,1,1,1,c,a,...,1,1,0,0,0,2015,7,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1115,6,2013-01-05,4771,339,1,0,1,d,c,...,1,0,0,0,1,2013,1,0,1,0
1017205,1115,5,2013-01-04,4540,326,1,0,1,d,c,...,1,0,0,0,1,2013,1,0,1,0
1017206,1115,4,2013-01-03,4297,300,1,0,1,d,c,...,1,0,0,0,1,2013,1,0,1,0
1017207,1115,3,2013-01-02,3697,305,1,0,1,d,c,...,1,0,0,0,1,2013,1,0,1,0


(2)-4. Include new variable(s) from historical sales or number of customers (Sales per customer)

In [91]:
combined_dummied2['SalePerCustomer'] = combined_dummied2['Sales']/combined_dummied2['Customers']

In [93]:
combined_dummied2.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,StoreType,Assortment,...,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",Year,Month,CompetitionOpen_Closed,CompetitionOpen_Not known,CompetitionOpen_Open,SalePerCustomer
0,1,5,2015-07-31,5263,555,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,9.482883
1,1,4,2015-07-30,5020,546,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,9.194139
2,1,3,2015-07-29,4782,523,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,9.143403
3,1,2,2015-07-28,5011,560,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,8.948214
4,1,1,2015-07-27,6102,612,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,9.970588


'Date' preprocessing converting date into unix values

In [94]:
pd.to_datetime(combined_dummied2['Date'])
combined_dummied2['Date'] = pd.DatetimeIndex(combined_dummied2['Date']).astype(np.int32)

combined_dummied2[['Date']].head(2)

Unnamed: 0,Date
0,1438300800000000000
1,1438214400000000000


In [98]:
combined_dummied2

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,StoreType,Assortment,...,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",Year,Month,CompetitionOpen_Closed,CompetitionOpen_Not known,CompetitionOpen_Open,SalePerCustomer
0,1,5,1438300800000000000,5263,555,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,9.482883
1,1,4,1438214400000000000,5020,546,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,9.194139
2,1,3,1438128000000000000,4782,523,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,9.143403
3,1,2,1438041600000000000,5011,560,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,8.948214
4,1,1,1437955200000000000,6102,612,1,1,1,c,a,...,1,0,0,0,2015,7,0,0,1,9.970588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1115,6,1357344000000000000,4771,339,1,0,1,d,c,...,0,0,0,1,2013,1,0,1,0,14.073746
1017205,1115,5,1357257600000000000,4540,326,1,0,1,d,c,...,0,0,0,1,2013,1,0,1,0,13.926380
1017206,1115,4,1357171200000000000,4297,300,1,0,1,d,c,...,0,0,0,1,2013,1,0,1,0,14.323333
1017207,1115,3,1357084800000000000,3697,305,1,0,1,d,c,...,0,0,0,1,2013,1,0,1,0,12.121311


In [96]:
combined_dummied2.dtypes

Store                               int64
DayOfWeek                           int64
Date                                int64
Sales                               int64
Customers                           int64
Open                                int64
Promo                               int64
SchoolHoliday                       int64
StoreType                          object
Assortment                         object
CompetitionDistance               float64
CompetitionOpenSinceMonth         float64
CompetitionOpenSinceYear          float64
Promo2                              int64
Promo2SinceWeek                   float64
Promo2SinceYear                   float64
StateHoliday_0                      uint8
StateHoliday_0                      uint8
StateHoliday_a                      uint8
StateHoliday_b                      uint8
StateHoliday_c                      uint8
StateHoliday_0_real                 uint8
PromoInterval_0                     uint8
PromoInterval_Feb,May,Aug,Nov     

Final csv

In [101]:
final_data = combined_dummied2[['Year','DayOfWeek','Date','Open','Promo','StateHoliday_0_real','StateHoliday_a','StateHoliday_b','StateHoliday_c','PromoInterval_0','PromoInterval_Feb,May,Aug,Nov','PromoInterval_Jan,Apr,Jul,Oct','PromoInterval_Mar,Jun,Sept,Dec','CompetitionOpen_Closed','CompetitionOpen_Not known','CompetitionOpen_Open','SalePerCustomer','Sales']]
final_data

Unnamed: 0,Year,DayOfWeek,Date,Open,Promo,StateHoliday_0_real,StateHoliday_a,StateHoliday_b,StateHoliday_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",CompetitionOpen_Closed,CompetitionOpen_Not known,CompetitionOpen_Open,SalePerCustomer,Sales
0,2015,5,1438300800000000000,1,1,1,0,0,0,1,0,0,0,0,0,1,9.482883,5263
1,2015,4,1438214400000000000,1,1,1,0,0,0,1,0,0,0,0,0,1,9.194139,5020
2,2015,3,1438128000000000000,1,1,1,0,0,0,1,0,0,0,0,0,1,9.143403,4782
3,2015,2,1438041600000000000,1,1,1,0,0,0,1,0,0,0,0,0,1,8.948214,5011
4,2015,1,1437955200000000000,1,1,1,0,0,0,1,0,0,0,0,0,1,9.970588,6102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,2013,6,1357344000000000000,1,0,1,0,0,0,0,0,0,1,0,1,0,14.073746,4771
1017205,2013,5,1357257600000000000,1,0,1,0,0,0,0,0,0,1,0,1,0,13.926380,4540
1017206,2013,4,1357171200000000000,1,0,1,0,0,0,0,0,0,1,0,1,0,14.323333,4297
1017207,2013,3,1357084800000000000,1,0,1,0,0,0,0,0,0,1,0,1,0,12.121311,3697


## (3) Split data

In [203]:
# train : 2013 2014
# test : 2015

%time

train_2013_2014 = pd.DataFrame(columns = final_data.columns)
test_2015 = pd.DataFrame(columns = final_data.columns)

for i in range(0,final_data['Year'].size):
    if final_data.loc[i]['Year'] == 2013 or 2014:
        pass
        train_2013_2014 = train_2013_2014.append(final_data.loc[i])
    else:
        test_2015 = test_2015.append(final_data.loc[i])

As it took so long time to get the result, I saved two files.

In [204]:
train_2013_2014.to_csv('/Users/jeewonkim/Desktop/train_2013_2014.csv',index=False)
test_2015.to_csv('/Users/jeewonkim/Desktop/test_2015.csv',index=False)

In [159]:
train_2013_2014 = pd.read_csv('/Users/jeewonkim/Desktop/train_2013_2014.csv')
test_2015 = pd.read_csv('/Users/jeewonkim/Desktop/test_2015.csv')

In [161]:
train_2013_2014.columns

Index(['Year', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday_0_real',
       'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c', 'PromoInterval_0',
       'PromoInterval_Feb,May,Aug,Nov', 'PromoInterval_Jan,Apr,Jul,Oct',
       'PromoInterval_Mar,Jun,Sept,Dec', 'CompetitionOpen_Closed',
       'CompetitionOpen_Not known', 'CompetitionOpen_Open', 'SalePerCustomer',
       'Sales'],
      dtype='object')

## (4) Lasso : Hyperparameter search with cross validation (only using training set)

In [195]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge

In [189]:
X_trainval = train_2013_2014[['DayOfWeek','Date','Open','Promo','StateHoliday_0_real','StateHoliday_a','StateHoliday_b','StateHoliday_c','PromoInterval_0','PromoInterval_Feb,May,Aug,Nov','PromoInterval_Jan,Apr,Jul,Oct','PromoInterval_Mar,Jun,Sept,Dec','CompetitionOpen_Closed','CompetitionOpen_Not known','CompetitionOpen_Open','SalePerCustomer']].values
y_trainval = train_2013_2014[['Sales']].values
X_test = test_2015[['DayOfWeek','Date','Open','Promo','StateHoliday_0_real','StateHoliday_a','StateHoliday_b','StateHoliday_c','PromoInterval_0','PromoInterval_Feb,May,Aug,Nov','PromoInterval_Jan,Apr,Jul,Oct','PromoInterval_Mar,Jun,Sept,Dec','CompetitionOpen_Closed','CompetitionOpen_Not known','CompetitionOpen_Open','SalePerCustomer']].values
y_test = test_2015[['Sales']].values

In [190]:
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [192]:
# Lasso

best_score_r2 = 0
best_score_mse = 0
best_score_mae = 0


for alpha in np.logspace(-4, 1, 30):
    scores_val_r2 = []
    scores_val_mse = []
    scores_val_mae = []
    
    for train_idx, val_idx in kfold.split(X_trainval, y_trainval):
        X_train = X_trainval[train_idx]
        y_train = y_trainval[train_idx]
        X_valid = X_trainval[val_idx]
        y_valid = y_trainval[val_idx]
        
        lasso = Lasso(alpha = alpha, random_state=0, max_iter=1000)
        lasso.fit(X_train, y_train)
        
        y_valid_hat = lasso.predict(X_valid)
        
        scores_val_r2.append(r2_score(y_valid, y_valid_hat))
        scores_val_mse.append(mean_squared_error(y_valid, y_valid_hat))
        scores_val_mae.append(mean_absolute_error(y_valid, y_valid_hat))

    
#     Getting the cross-validation score
    mean_score_r2 = np.mean(scores_val_r2)
    mean_score_mse = np.mean(scores_val_mse)
    mean_score_mae = np.mean(scores_val_mae)
    
    if mean_score_r2 > best_score_r2:
        best_score_r2 = mean_score_r2
        best_parameters_r2 = {'alpha' : alpha}
    
    if mean_score_mse > best_score_mse:
        best_score_mse = mean_score_mse
        best_parameters_mse = {'alpha' : alpha}
        
    if mean_score_mae > best_score_mae:
        best_score_mae = mean_score_mae
        best_parameters_mae = {'alpha' : alpha}
        
        
print("Best score on validation set of r2: {:.7f}".format(best_score_r2))
print("Best hyperparameters of r2: {}".format(best_parameters_r2))
print("Best score on validation set of mse: {:.7f}".format(best_score_mse))
print("Best hyperparameters of mse: {}".format(best_parameters_mse))
print("Best score on validation set of mae: {:.7f}".format(best_score_mae))
print("Best hyperparameters of mae: {}".format(best_parameters_mae))

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best score on validation set of r2: 0.5457030
Best hyperparameters of r2: {'alpha': 0.002395026619987486}
Best score on validation set of mse: 6753958.4964015
Best hyperparameters of mse: {'alpha': 10.0}
Best score on validation set of mae: 1771.1432615
Best hyperparameters of mae: {'alpha': 0.0001}


  model = cd_fast.enet_coordinate_descent(


## (5) - Lasso : Performance comparison on the test set and draw conclusions.

In [193]:
# Lasso

# r2
lasso = Lasso(**best_parameters_r2, random_state = 0, max_iter = 1000)
lasso.fit(X_trainval, y_trainval)

y_test_hat = lasso.predict(X_test)
test_score = r2_score(y_test, y_test_hat)
print("Test set score with best hyperparameters of r2: {:.7f}".format(test_score))


# MSE

lasso = Lasso(**best_parameters_mse, random_state = 0, max_iter = 1000)
lasso.fit(X_trainval, y_trainval)

y_test_hat = lasso.predict(X_test)
test_score = mean_squared_error(y_test, y_test_hat)
print("Test set score with best hyperparameters of MSE: {:.7f}".format(test_score))


# MAE

lasso = Lasso(**best_parameters_mae, random_state = 0, max_iter = 1000)
lasso.fit(X_trainval, y_trainval)

y_test_hat = lasso.predict(X_test)
test_score = mean_absolute_error(y_test, y_test_hat)
print("Test set score with best hyperparameters of MAE: {:.7f}".format(test_score))


  model = cd_fast.enet_coordinate_descent(


Test set score with best hyperparameters of r2: 0.5632825
Test set score with best hyperparameters of MSE: 6460487.4002959
Test set score with best hyperparameters of MAE: 1791.5491156


## (4) Ridge : Hyperparameter search with cross validation (only using training set)

In [197]:
# Ridge

best_score_r2 = 0
best_score_mse = 0
best_score_mae = 0


for alpha in np.logspace(-4, 1, 30):
    scores_val_r2 = []
    scores_val_mse = []
    scores_val_mae = []
    
    for train_idx, val_idx in kfold.split(X_trainval, y_trainval):
        X_train = X_trainval[train_idx]
        y_train = y_trainval[train_idx]
        X_valid = X_trainval[val_idx]
        y_valid = y_trainval[val_idx]
        
        ridge = Ridge(alpha = alpha, random_state=0, max_iter=1000)
        ridge.fit(X_train, y_train)
        
        y_valid_hat = ridge.predict(X_valid)
        
        scores_val_r2.append(r2_score(y_valid, y_valid_hat))
        scores_val_mse.append(mean_squared_error(y_valid, y_valid_hat))
        scores_val_mae.append(mean_absolute_error(y_valid, y_valid_hat))

    
#     Getting the cross-validation score
    mean_score_r2 = np.mean(scores_val_r2)
    mean_score_mse = np.mean(scores_val_mse)
    mean_score_mae = np.mean(scores_val_mae)
    
    if mean_score_r2 > best_score_r2:
        best_score_r2 = mean_score_r2
        best_parameters_r2 = {'alpha' : alpha}
    
    if mean_score_mse > best_score_mse:
        best_score_mse = mean_score_mse
        best_parameters_mse = {'alpha' : alpha}
        
    if mean_score_mae > best_score_mae:
        best_score_mae = mean_score_mae
        best_parameters_mae = {'alpha' : alpha}
        
        
print("Best score on validation set of r2: {:.7f}".format(best_score_r2))
print("Best hyperparameters of r2: {}".format(best_parameters_r2))
print("Best score on validation set of mse: {:.7f}".format(best_score_mse))
print("Best hyperparameters of mse: {}".format(best_parameters_mse))
print("Best score on validation set of mae: {:.7f}".format(best_score_mae))
print("Best hyperparameters of mae: {}".format(best_parameters_mae))

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


Best score on validation set of r2: 0.5457030
Best hyperparameters of r2: {'alpha': 3.039195382313201}
Best score on validation set of mse: 6741893.4496684
Best hyperparameters of mse: {'alpha': 10.0}
Best score on validation set of mae: 1771.1433418
Best hyperparameters of mae: {'alpha': 0.0001}


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


## (5) - Ridge : Performance comparison on the test set and draw conclusions.

In [202]:
# Lasso

# r2
ridge = Ridge(**best_parameters_r2, random_state = 0, max_iter = 1000)
ridge.fit(X_trainval, y_trainval)

y_test_hat = ridge.predict(X_test)
test_score = r2_score(y_test, y_test_hat)
print("Test set score with best hyperparameters of r2: {:.7f}".format(test_score))


# MSE

ridge = Ridge(**best_parameters_mse, random_state = 0, max_iter = 1000)
ridge.fit(X_trainval, y_trainval)

y_test_hat = ridge.predict(X_test)
test_score = mean_squared_error(y_test, y_test_hat)
print("Test set score with best hyperparameters of MSE: {:.7f}".format(test_score))


# MAE

ridge = Ridge(**best_parameters_mae, random_state = 0, max_iter = 1000)
ridge.fit(X_trainval, y_trainval)

y_test_hat = ridge.predict(X_test)
test_score = mean_absolute_error(y_test, y_test_hat)
print("Test set score with best hyperparameters of MAE: {:.7f}".format(test_score))


Test set score with best hyperparameters of r2: 0.5632827
Test set score with best hyperparameters of MSE: 6440303.3306402
Test set score with best hyperparameters of MAE: 1791.5489921


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
