In [1]:
# Import libraries
import joblib
import numpy as np
import pandas as pd
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Get data
train_path = '../data/train.csv'
store_path = '../data/store.csv'

train = pd.read_csv(train_path)
store = pd.read_csv(store_path)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Drop records with Open==0 and Customers==0
# No customers or when closed would mean 0 sales

train = train.drop(train[train['Open']==0].index)
train = train.drop(train[train['Customers']==0].index)

In [4]:
# Clean fields

train['StateHoliday'] = train['StateHoliday'].replace(0, '0')

In [5]:
# Ensure datatypes

train['Store'] = train['Store'].astype(int)
train['DayOfWeek'] = train['DayOfWeek'].astype(int)
#train['Date'] = train['Date'].apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d'))
train['Sales'] = train['Sales'].astype(float)
train['Customers'] = train['Customers'].astype(int)
train['Open'] = train['Open'].astype(int)
train['Promo'] = train['Promo'].astype(int)
train['StateHoliday'] = train['StateHoliday'].astype(str)
train['SchoolHoliday'] = train['SchoolHoliday'].astype(int)

In [6]:
# Clean change datatype of 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'

store['CompetitionOpenSinceMonth'] = store['CompetitionOpenSinceMonth'].fillna(0)
store['CompetitionOpenSinceYear'] = store['CompetitionOpenSinceYear'].fillna(0)

store['CompetitionOpenSinceMonth'] = store['CompetitionOpenSinceMonth'].astype(int).astype(str)
store['CompetitionOpenSinceYear'] = store['CompetitionOpenSinceYear'].astype(int).astype(str)

store['CompetitionOpenSinceMonth'] = store['CompetitionOpenSinceMonth'].str.pad(width=2, side='left', fillchar='0')

store['CompetitionOpenSinceMonth'] = store['CompetitionOpenSinceMonth'].replace('00', '01')
store['CompetitionOpenSinceYear'] = store['CompetitionOpenSinceYear'].replace('0', '1800')

store['CompetitionOpenSince'] = store['CompetitionOpenSinceMonth'] + '_' + store['CompetitionOpenSinceYear']

store['CompetitionOpenSince'] = store['CompetitionOpenSince'].apply(lambda x: dt.datetime.strptime(x, '%m_%Y'))

In [7]:
# Clean change datatype of 'Promo2SinceMonth', 'Promo2SinceYear'

store['Promo2SinceWeek'] = store['Promo2SinceWeek'].fillna(0)
store['Promo2SinceYear'] = store['Promo2SinceYear'].fillna(0)

store['Promo2SinceWeek'] = store['Promo2SinceWeek'].astype(int).astype(str)
store['Promo2SinceYear'] = store['Promo2SinceYear'].astype(int).astype(str)

store['Promo2SinceWeek'] = store['Promo2SinceWeek'].str.pad(width=2, side='left', fillchar='0')

store['Promo2SinceWeek'] = store['Promo2SinceWeek'].replace('00', '01')
store['Promo2SinceYear'] = store['Promo2SinceYear'].replace('0', '1800')

store['Promo2Since'] = store['Promo2SinceWeek'] + '_' + store['Promo2SinceYear'] + ' SUN'

store['Promo2Since'] = store['Promo2Since'].apply(lambda x: dt.datetime.strptime(x, '%U_%Y %a'))

In [8]:
# Clean 'PromoInterval'. Replace all Nan to 'None'
store['PromoInterval'] = store['PromoInterval'].fillna('None')

In [9]:
# Clean 'CompetitionDistance'. Replace all Nan to 0
store['CompetitionDistance'] = store['CompetitionDistance'].fillna(0)

In [11]:
# Get dummies for store data
store = pd.get_dummies(store, columns=['StoreType', 'Assortment'])

In [13]:
# Save processed data
store.to_csv('../data/processed_store_data.csv')

In [14]:
# Merge train and store datasets

data = pd.merge(train, store, on='Store', how='left')

In [23]:
# Create 'DaysSinceCompetitionOpen' feature

data['DaysSinceCompetitionOpen'] = data.apply(lambda x: 0 if x['CompetitionOpenSince'].year<=dt.date(1800, 12, 31).year else (0 if (x['Date'] <= x['CompetitionOpenSince']) else (x['Date']-x['CompetitionOpenSince']).days), axis=1)

In [24]:
# Create 'DaysSincePromo2' feature

data['DaysSincePromo2'] = data.apply(lambda x: 0 if x['Promo2Since'].year<=dt.date(1800, 12, 31).year else (0 if (x['Date'] <= x['Promo2Since']) else (x['Date']-x['Promo2Since']).days), axis=1)

In [25]:
# Select all fields needed for modelling

for_model = data[['Store', 'DayOfWeek', 'Sales', 
                  'Customers', 'Promo', 'StateHoliday', 
                  'SchoolHoliday', 'StoreType', 'Assortment',
                  'CompetitionDistance', 'Promo2', 'DaysSinceCompetitionOpen',
                  'DaysSincePromo2']]

In [26]:
# Select all fields needed for modelling

for_model = pd.get_dummies(for_model, 
                           columns=['StateHoliday'])

In [27]:
# Split dataset to features and target

X = for_model.drop(['Store', 'Sales'], axis=1)
y = for_model[['Sales']]

In [31]:
# Split dataset to train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
# Model using Random Forest Regression

forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
print(forest_reg.score(X_train, y_train))
forest_pred = forest_reg.predict(X_test)
print(forest_reg.score(X_test, y_test))
print(mean_squared_error(y_test, forest_pred))

In [None]:
# save model

joblib.dump(forest_reg, '../model/compressed_rf_sales.pkl', compress=5)

In [37]:
# test model loading
import joblib

pkl = joblib.load('../model/compressed_rf_sales.pkl')
pkl_pred = pkl.predict(X_test[:1])
# print(mean_squared_error(y_test, pkl_pred))

In [38]:
pkl_pred

array([7329.75])

In [39]:
y_test[:1]

Unnamed: 0,Sales
341000,7577.0
