<a href="https://colab.research.google.com/github/laurence-lin/Kaggle_competition/blob/master/Rossman_Store_Sales_Forecast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math

import matplotlib.pyplot as plt
import seaborn as sns

%tensorflow_version 1.9
import tensorflow as tf
from tensorflow import keras
import lightgbm as lgb
import sklearn
from sklearn.model_selection import train_test_split

import os
import gc

import warnings
warnings.simplefilter('ignore')
      

`%tensorflow_version` only switches the major version: `1.x` or `2.x`.
You set: `1.9`. This will be interpreted as: `1.x`.


TensorFlow 1.x selected.


In [2]:
from google.colab import files


%cd /root/
!mkdir .kaggle
%cd .kaggle
files.upload()
!kaggle competitions download -c rossmann-store-sales
!unzip test.csv.zip
!unzip train.csv.zip

/root
/root/.kaggle


Saving kaggle.json to kaggle.json
Downloading sample_submission.csv to /root/.kaggle
  0% 0.00/310k [00:00<?, ?B/s]
100% 310k/310k [00:00<00:00, 42.8MB/s]
Downloading test.csv.zip to /root/.kaggle
  0% 0.00/192k [00:00<?, ?B/s]
100% 192k/192k [00:00<00:00, 59.4MB/s]
Downloading train.csv.zip to /root/.kaggle
 75% 5.00M/6.71M [00:00<00:00, 39.4MB/s]
100% 6.71M/6.71M [00:00<00:00, 41.6MB/s]
Downloading store.csv to /root/.kaggle
  0% 0.00/44.0k [00:00<?, ?B/s]
100% 44.0k/44.0k [00:00<00:00, 37.9MB/s]
Archive:  test.csv.zip
  inflating: test.csv                
Archive:  train.csv.zip
  inflating: train.csv               


In [64]:
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}

train = pd.read_csv('train.csv', parse_dates = [2], dtype = types)
test = pd.read_csv('test.csv', parse_dates = [3], dtype = types)
store = pd.read_csv('store.csv')

gc.collect()

826

In [14]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
Store            1017209 non-null int64
DayOfWeek        1017209 non-null int64
Date             1017209 non-null datetime64[ns]
Sales            1017209 non-null int64
Customers        1017209 non-null int64
Open             1017209 non-null int64
Promo            1017209 non-null int64
StateHoliday     1017209 non-null object
SchoolHoliday    1017209 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(6), object(1)
memory usage: 69.8+ MB
None


In [15]:
print(store.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
Store                        1115 non-null int64
StoreType                    1115 non-null object
Assortment                   1115 non-null object
CompetitionDistance          1112 non-null float64
CompetitionOpenSinceMonth    761 non-null float64
CompetitionOpenSinceYear     761 non-null float64
Promo2                       1115 non-null int64
Promo2SinceWeek              571 non-null float64
Promo2SinceYear              571 non-null float64
PromoInterval                571 non-null object
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB
None


There are some data types in files that should be convert:  
object type, datetime features, and some features from float to int.

In [0]:
print(store)

   Store StoreType  ... Promo2SinceYear    PromoInterval
0      1         c  ...             NaN              NaN
1      2         a  ...          2010.0  Jan,Apr,Jul,Oct
2      3         a  ...          2011.0  Jan,Apr,Jul,Oct
3      4         c  ...             NaN              NaN
4      5         a  ...             NaN              NaN

[5 rows x 10 columns]
0     9.0
1    11.0
2    12.0
3     9.0
4     4.0
Name: CompetitionOpenSinceMonth, dtype: float64


In [0]:
def rmspe(y_true, y_pred):
  return np.sqrt(np.mean(( 1 - (y_true/y_pred) )**2))

def rmspe_lgb(y_true, y_pred):
  y_pred = np.expm1(y_pred)
  y_true = np.expm1(y_true)
  return 'rmspe', rmspe(y_true, y_pred)

def build_feature(features, data):
  '''
  Feature engineering
  features: final feature that would be used for training
  data: input data matrix
  '''
  # 1. Fill NaN
  # fill NaN values if there is any
  data.fillna(0, inplace = True)
  features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

  # 2. Encode categorical features
  # Map categorical features to numerical
  mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
  data['StoreType'].replace(mappings, inplace = True)
  data['Assortment'].replace(mappings, inplace = True)
  data['StateHoliday'].replace(mappings, inplace = True)
  features.extend(['StoreType', 'Assortment', 'StateHoliday'])

  # 3. Create new features
  # Add timestamp features
  data['Year'] = data['Date'].dt.year
  data['Month'] = data['Date'].dt.month
  data['Day'] = data.Date.dt.day
  data['DayOfWeek'] = data.Date.dt.dayofweek
  data['WeekOfYear'] = data.Date.dt.weekofyear
  features.extend(['Year', 'Month', 'Day', 'DayOfWeek', 'WeekOfYear'])

  # The total months passed since the competition store opened
  data['CompetitionOpen'] = 12*(data['Year'] - data['CompetitionOpenSinceYear']) +\
                             (data['Month'] - data['CompetitionOpenSinceMonth'])
  # Total months passed since the Promo2 activity started
  data['PromoOpen'] = 12*(data['Year'] - data['Promo2SinceYear']) + \
                       (data['WeekOfYear'] - data['Promo2SinceWeek'])/4
  data['PromoOpen'] = data['PromoOpen'].apply(lambda x: x if x > 0 else 0)
  data.loc[data['Promo2SinceYear'] == 0, 'PromoOpen'] = 0
  features.append('CompetitionOpen')

  # Add feature: whether the date is within the 'Month when the Promo2 occuring'
  month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', \
               8:'Aug', 9:'Sept', 10:'Opt', 11:'Nov', 12:'Dec'}
  data['MonthStr'] = data.Month.map(month2str) # string month for each date
  data.loc[data['PromoInterval'] == 0, 'PromoInterval'] = ''
  data['IsPromoMonth'] = 0
  for interval in data['PromoInterval'].unique():
    if interval != '':
      for month in interval.split(','):
        data.loc[(data.MonthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

  features.append('IsPromoMonth')

  return features, data



Data description:
Totally 942 days for whole dataset, 48 days to predict.

In these 942 days, up to 1115 stores in each day(some store may )

Training dataset: time line 2013.03.01-2015.07.31

Testing dataset: time line 2015-08-01 to 2015-09-17

Stores: 1115 different stores, each store contains 9 features for the store itself

Target: predict daily sales 6 weeks in advance

In [65]:
# Feature engineering
print('Assume store open, if not provided')
train['Open'].fillna(1, inplace = True)
test['Open'].fillna(1, inplace = True)

# Consider only open store for training.
print('Consider only open store for training, closed store is filtered')
train = train[train.Open == 1]
print('Consider only Sales >0, to simplify calculation of RMSPE')
train = train[train.Sales > 0]

# Use merge to concat store to train & test set, while 'on' could fit the rows with 'Store'
# merge() method, the two merge dataframe should have same column name and value to fit the merging
train = pd.merge(train, store, on = 'Store', how = 'left')
test = pd.merge(test, store, on = 'Store', how = 'left')
print('Train + Store shape:', train.shape)
print('Test + Store shape:', test.shape)

print('Build feature engineering')
features = []
features, train = build_feature(features, train)
_, test = build_feature([], test)
train.loc[:, ['Assortment', 'StateHoliday']] = train.loc[:, ['Assortment', 'StateHoliday']].astype(int)
test.loc[:, ['Assortment', 'StateHoliday']] = test.loc[:, ['Assortment', 'StateHoliday']].astype(int)

x_train, x_valid = train_test_split(train, test_size = 0.012, random_state = 10)
y_train = np.log1p(x_train.Sales)
y_valid = np.log1p(x_valid.Sales)

print('Training data processed')


Assume store open, if not provided
Consider only open store for training, closed store is filtered
Consider only Sales >0, to simplify calculation of RMSPE
Train + Store shape: (844338, 18)
Test + Store shape: (41088, 17)
Build feature engineering
Training data processed


In [66]:
print(x_train[features].info())
print(x_valid[features].info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 834205 entries, 167746 to 345353
Data columns (total 15 columns):
Store                  834205 non-null int64
CompetitionDistance    834205 non-null float64
Promo                  834205 non-null int64
Promo2                 834205 non-null int64
SchoolHoliday          834205 non-null float64
StoreType              834205 non-null int64
Assortment             834205 non-null int64
StateHoliday           834205 non-null int64
Year                   834205 non-null int64
Month                  834205 non-null int64
Day                    834205 non-null int64
DayOfWeek              834205 non-null int64
WeekOfYear             834205 non-null int64
CompetitionOpen        834205 non-null float64
IsPromoMonth           834205 non-null int64
dtypes: float64(3), int64(12)
memory usage: 101.8 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10133 entries, 688461 to 822858
Data columns (total 15 columns):
Store                  10133 n

In [68]:
# LightGBM
params = {
    'boosting_type':'gbdt',
    'max_depth':10,
    'num_leaves':30,
    'n_estimators':300,
    'early_stopping_rounds':100,
    'learning_rate':0.05,
    'subsample':0.9, # subsampling for next tree 
    'colsample_bytree':0.7, # subsample columns for next tree
    'random_state':10,
}

booster = lgb.LGBMRegressor(**params)
print('Start training...')
booster.fit(x_train[features], y_train,
            eval_set = (x_valid[features], y_valid),
            eval_metric = 'rmse', 
            verbose = 50)


Start training...
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmse: 0.332367	valid_0's l2: 0.110468
[100]	valid_0's rmse: 0.303745	valid_0's l2: 0.0922611
[150]	valid_0's rmse: 0.281634	valid_0's l2: 0.0793177
[200]	valid_0's rmse: 0.264841	valid_0's l2: 0.0701408
[250]	valid_0's rmse: 0.248781	valid_0's l2: 0.0618919
[300]	valid_0's rmse: 0.234492	valid_0's l2: 0.0549866
Did not meet early stopping. Best iteration is:
[300]	valid_0's rmse: 0.234492	valid_0's l2: 0.0549866


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
              early_stopping_rounds=100, importance_type='split',
              learning_rate=0.05, max_depth=10, min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=300,
              n_jobs=-1, num_leaves=30, objective=None, random_state=10,
              reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.9,
              subsample_for_bin=200000, subsample_freq=0)

In [69]:
# Validating and make predictions on test data
print('Validating')
yhat = booster.predict(x_valid[features])
error = rmspe(x_valid.Sales.values, np.expm1(yhat))
print('RMSPE on valid sets: {:.6f}'.format(error))

print('Make predictions on test set')
prediction = booster.predict(test[features])
result = pd.DataFrame({'Id':test['Id'], 'Sales':np.expm1(prediction)})

result.to_csv('submission.csv', index = False)


Validating
RMSPE on valid sets: 0.242789
Make predictions on test set


In [73]:
result.to_csv('submission.csv', index = False)
!kaggle competitions submit -c rossmann-store-sales -f submission.csv -m "Message"

100% 954k/954k [00:01<00:00, 584kB/s]
Successfully submitted to Rossmann Store Sales