In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel("Walmart_clusters.xlsx")

In [3]:
df0 = df.loc[(df['cluster_sklearn'] == 0)]
df1 = df.loc[(df['cluster_sklearn'] == 1)]
df2 = df.loc[(df['cluster_sklearn'] == 2)]

In [4]:
def dataframe_preparator(dataframe):
    dataframe = dataframe.drop([
        'Date', 'Store', 'mm', 'dd', 'yyyy', 'week_absolute', 'cluster_sklearn'
    ],
                               axis=1)
    divider = len(dataframe) / len(dataframe.week_relative.unique())
    dataframe = dataframe.groupby('week_relative', as_index=False).sum()
    columns = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
    for j in columns:
        dataframe[j] = dataframe[j] / divider
    return dataframe

In [5]:
df0 = dataframe_preparator(df0)
df1 = dataframe_preparator(df1)
df2 = dataframe_preparator(df2)

**------------------------------------------------------------------------------------**

In [6]:
def self_train_test_splitter(df):
    X = df.drop('Weekly_Sales', axis=1)
    X_train = X.loc[(X['week_relative'] < 105)]
    X_test = X.loc[(X['week_relative'] >= 105)]
    y = df.Weekly_Sales.values
    y_train = y[:len(X_train)]
    y_test = y[:len(X_test)]
    return X, X_train, X_test, y, y_train, y_test

In [7]:
X0, X0_train, X0_test, y0, y0_train, y0_test = self_train_test_splitter(df0)
X1, X1_train, X1_test, y1, y1_train, y1_test = self_train_test_splitter(df1)
X2, X2_train, X2_test, y2, y2_train, y2_test = self_train_test_splitter(df2)

In [8]:
def name_of_global_obj(obj):
    for objname, oid in globals().items():
        if oid is obj:
            objname += '.pickle'
            return objname

In [9]:
def pickler(obj, objname):
    with open(objname, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
variable_list = [
    df0, X0, X0_train, X0_test, y0, y0_train, y0_test, df1, X1, X1_train,
    X1_test, y1, y1_train, y1_test, df2, X2, X2_train, X2_test, y2, y2_train,
    y2_test
]
for i in variable_list:
    pickler(i, name_of_global_obj(i))

**------------------------------------------------------------------------------------**

In [11]:
def complex_modeller (X_train, X_test, y_train, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    print("MAPE on train :", round(100 * mean_absolute_percentage_error(y_train, y_pred_train), 3),"%")
    print("MAPE on test  :", round(100 * mean_absolute_percentage_error(y_test, y_pred_test), 3),"%")
    print("MSE on train  :", "{:.3e}".format(mean_squared_error(y_train, y_pred_train)))
    print("MSE on test   :", "{:.3e}".format(mean_squared_error(y_test, y_pred_test)))  
    print("R2 on train   :", round(r2_score(y_train, y_pred_train),3))
    print("R2 on test    :", round(r2_score(y_test, y_pred_test),3))

In [12]:
def total_modeller (X, X_train, X_test, y, y_train, y_test):
    model = LinearRegression()
    model.fit(X, y)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    print("MAPE on train :", round(100 * mean_absolute_percentage_error(y_train, y_pred_train), 3),"%")
    print("MAPE on test  :", round(100 * mean_absolute_percentage_error(y_test, y_pred_test), 3),"%")
    print("MSE on train  :", "{:.3e}".format(mean_squared_error(y_train, y_pred_train)))
    print("MSE on test   :", "{:.3e}".format(mean_squared_error(y_test, y_pred_test)))    
    print("R2 on train   :", round(r2_score(y_train, y_pred_train),3))
    print("R2 on test    :", round(r2_score(y_test, y_pred_test),3))

In [19]:
def simple_modeller(X, y):
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    print("MAPE :", round(100 * mean_absolute_percentage_error(y, y_pred), 3),"%")
    print("MSE  :", "{:.3e}".format(mean_squared_error(y, y_pred)))
    print("R2   :", round(r2_score(y, y_pred), 3))

In [14]:
print("complex modeller used")
print("fit: 2010-2011, train: 2010-2011, test: 2012")
print("\nResults for cluster 0")
complex_modeller(X0_train, X0_test, y0_train, y0_test)
print("\nResults for cluster 1")
complex_modeller(X1_train, X1_test, y1_train, y1_test)
print("\nResults for cluster 2")
complex_modeller(X2_train, X2_test, y2_train, y2_test)

complex modeller used
fit: 2010-2011, train: 2010-2011, test: 2012

Results for cluster 0
MAPE on train : 3.627 %
MAPE on test  : 12.978 %
MSE on train  : 2.337e+11
MSE on test   : 1.699e+12
R2 on train   : 0.636
R2 on test    : -1.748

Results for cluster 1
MAPE on train : 6.816 %
MAPE on test  : 16.054 %
MSE on train  : 3.188e+12
MSE on test   : 6.592e+12
R2 on train   : 0.137
R2 on test    : -0.684

Results for cluster 2
MAPE on train : 4.846 %
MAPE on test  : 8.017 %
MSE on train  : 1.769e+12
MSE on test   : 9.633e+12
R2 on train   : 0.777
R2 on test    : -0.168


In [15]:
print("total modeller used")
print("fit: 2010-2012, train: 2010-2011, test: 2012")
print("\nResults for cluster 0")
total_modeller(X0, X0_train, X0_test, y0, y0_train, y0_test)
print("\nResults for cluster 1")
total_modeller(X1, X1_train, X1_test, y1, y1_train, y1_test)
print("\nResults for cluster 2")
total_modeller(X2, X2_train, X2_test, y2, y2_train, y2_test)

total modeller used
fit: 2010-2012, train: 2010-2011, test: 2012

Results for cluster 0
MAPE on train : 3.586 %
MAPE on test  : 4.384 %
MSE on train  : 2.793e+11
MSE on test   : 5.770e+11
R2 on train   : 0.565
R2 on test    : 0.067

Results for cluster 1
MAPE on train : 6.351 %
MAPE on test  : 5.962 %
MSE on train  : 3.452e+12
MSE on test   : 4.115e+12
R2 on train   : 0.066
R2 on test    : -0.051

Results for cluster 2
MAPE on train : 4.831 %
MAPE on test  : 6.943 %
MSE on train  : 1.799e+12
MSE on test   : 9.547e+12
R2 on train   : 0.773
R2 on test    : -0.157


In [20]:
print("simple modeller used")
print("fit: 2010-2012, no train set, no test set")
print("\nResults for cluster 0")
simple_modeller(X0, y0)
print("\nResults for cluster 1")
simple_modeller(X1, y1)
print("\nResults for cluster 2")
simple_modeller(X2, y2)

simple modeller used
fit: 2010-2012, no train set, no test set

Results for cluster 0
MAPE : 3.56 %
MSE  : 2.368e+11
R2   : 0.492

Results for cluster 1
MAPE : 5.538 %
MSE  : 2.481e+12
R2   : 0.071

Results for cluster 2
MAPE : 4.63 %
MSE  : 1.587e+12
R2   : 0.716
