In [1]:
#import libraries
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
import category_encoders as ce
import xgboost as xgb

#prepare the data
def my_data_preparation(train_data):
    #merge training and store data
    all_data = pd.merge(train_data, store_data, how = 'left', on = 'Store')
    #convert 'Date' to datetime format
    all_data['Date'] = pd.to_datetime(train_data['Date'])
    #extract year, month, day of month, and week of year
    all_data['Year'] = all_data['Date'].dt.year
    all_data['Month'] = all_data['Date'].dt.month
    all_data['Day'] = all_data['Date'].dt.day
    all_data['WeekOfYear'] = all_data['Date'].dt.isocalendar().week.astype(np.int64)
    #fix type errors in the 'StateHoliday' column
    all_data['StateHoliday'] = all_data['StateHoliday'].replace(0.0, '0')
    #dummy encode all categorical data
    all_data = pd.get_dummies(all_data, columns = ['StoreType', 'Assortment', 'StateHoliday'])
    #select only columns with numerical data
    all_data = all_data.select_dtypes(include=np.number)
    #drop columns with too many nans and drop the 'Customers' column
    all_data = all_data.drop(columns=['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear', 'Customers'])
    return all_data

#ask user for the address of the holdout file
holdout_path = input("Enter the address for the holdout file:")
#list of features
features =  ['Store', 'DayOfWeek', 'WeekOfYear', 'Month', 'Open', 'Promo', 'SchoolHoliday', 
             'CompetitionDistance', 'Promo2', 
             'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d',
             'Assortment_a', 'Assortment_b', 'Assortment_c',
             'StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']
#list of features and target, the target is 'Sales'
features_and_target = list(features)
features_and_target.append('Sales')
#load data files
store_data = pd.read_csv("./data/store.csv")
train_data = pd.read_csv("./data/train.csv")
holdout_data_0 = pd.read_csv(holdout_path)
all_holdout_data = my_data_preparation(holdout_data_0)
all_holdout_data = all_holdout_data.dropna(axis = 0, subset = features_and_target)
all_holdout_data = all_holdout_data.loc[all_holdout_data['Sales'] != 0.0]
sales_holdout = all_holdout_data['Sales']
all_holdout_data = all_holdout_data.drop(columns=['Sales'])
#prepare the data
all_data = my_data_preparation(train_data)
#drop all rows with nans in the relevant columns
numeric_data = all_data.dropna(axis = 0, subset = features_and_target)
#drop all rows with vanishing sales
numeric_data = numeric_data.loc[numeric_data['Sales'] != 0.0]
data = numeric_data
#specify target and features
y = data.loc[:, 'Sales']
X = data.loc[:, features]
#set up the target encoder
#we target encode only the 'Store' column
ce_te = ce.TargetEncoder(cols = 'Store')
#find transform for target encoding
ce_te.fit(X, y)
#transform features
X = ce_te.transform(X)
#split into test and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#specify features for the holdout data
holdout_data = all_holdout_data.loc[:, features]
#transform the holdout data
X_holdout = ce_te.transform(holdout_data)

#XGBoost regression
#set up xgb regressor
xgbr = xgb.XGBRegressor(max_depth=250,learning_rate=0.2,n_estimators=150,n_jobs=10, 
                        colsample_bytree=0.4,subsample=0.7,reg_alpha=10,reg_lambda=20)
#fit the training data
xgbr.fit(X_train, y_train)
#xgb prediction on the test data
predict_test = xgbr.predict(X_test)
#xgb prediction on the test data
predict_train = xgbr.predict(X_train)

#xgb prediction for the holdout data
predict_holdout = xgbr.predict(X_holdout)
#compute xgb holdout RMSPE
diff = 1 - predict_holdout/sales_holdout
RMSPE = sqrt(np.dot(diff,diff)/len(diff))
print(f'RMSPE: {RMSPE}')

Enter the address for the holdout file: ./data/train.csv


  exec(code_obj, self.user_global_ns, self.user_ns)


RMSPE: 0.14563386596189978
