# Regression 

This notebook is mainly for basic regressions and evaluations

In [2]:
import pandas as pd
import numpy as np

import warnings

warnings.simplefilter("ignore")
pd.options.display.max_columns = None

In [3]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn import neighbors
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer

from keras.models import Sequential
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape, Concatenate
from keras.callbacks import ModelCheckpoint

import pickle

Using TensorFlow backend.


In [28]:
TRAIN_RATIO = 0.9

SHUFFLE_DATA = True
ONE_HOT_AS_INPUT = False # in addition to state holidays, use one hot for other categories

In [30]:
xgb_model_dump = 'models/xgb_model.pickle'
rf_model_dump = 'models/rf_model.pickle'
nn_model_dump = 'models/nn_model.pickle'

# Read training data

In [36]:
processed_path = 'processed/'
processed_file = 'processed_data.csv'
X = pd.read_csv(processed_path + processed_file)

In [37]:
y = X.pop('Sales')

In [39]:
if ONE_HOT_AS_INPUT:
    print("Using one-hot encoding as input for model")
    cols_for_one_hot = ['Store', 'DayOfWeek', 'weekofyear']
    one_hot_df = X[cols_for_one_hot]
    X = X.drop(columns = cols_for_one_hot)
    
    enc = OneHotEncoder(sparse=False)
    enc.fit(one_hot_df)
    one_hot_df = enc.transform(one_hot_df)
    X = X.to_numpy()
    X = pd.concat([X, one_hot_df], axis=1)

Using one-hot encoding as input for model


KeyError: "None of [Index(['Store', 'DayOfWeek', 'weekofyear'], dtype='object')] are in the [columns]"

In [46]:
# Use last 10% data as validation data
num_record = len(X)
train_size = int(TRAIN_RATIO * num_record)
X_train = X[:train_size]
y_train = y[:train_size]
X_val = X[train_size:]
y_val = y[train_size:]

In [47]:
if SHUFFLE_DATA:
    print("Using shuffled data")
    sh = np.arange(X_train.shape[0])
    np.random.shuffle(sh)
    X_train = X_train.iloc[sh]
    y_train = y_train.iloc[sh]

Using shuffled data


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

# Build models

In [12]:
class Model(object):
    def evaluate(self, X_val, y_val):
        assert (min(y_val) > 0)
        guessed_sales = self.guess(X_val)
        relative_err = np.absolute((y_val - guessed_sales) / y_val)
        result = np.sum(relative_err) / len(y_val)
        return result

    def rsqr_score(self, X_val, y_val):
        assert (min(y_val) > 0)
        guessed_sales = self.guess(X_val)
        SS_Residual = sum((y_val-guessed_sales)**2)
        SS_Total = sum((y_val-np.mean(y_val))**2)
        r_squared = 1 - (float(SS_Residual))/SS_Total
        adjusted_r_squared = 1 - (1-r_squared)*(len(y_val)-1)/(len(y_val)-X_val.shape[1]-1)
        return adjusted_r_squared

In [13]:
class LinearModel(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.clf = linear_model.LinearRegression()
        self.clf.fit(X_train, np.log(y_train))
        print("Result on validation set is: ", self.evaluate(X_val, y_val))
        print("R squared on validation set is: ", self.rsqr_score(X_val, y_val))

    def guess(self, features):
        return np.exp(self.clf.predict(features))


In [14]:
class RidgeRegression(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.clf = linear_model.Ridge(alpha=1.0)
        self.clf.fit(X_train, np.log(y_train))
        print("MAPE on validation set is: ", self.evaluate(X_val, y_val))
        print("R squared on validation set is: ", self.rsqr_score(X_val, y_val))

    def guess(self, features):
        return np.exp(self.clf.predict(features))

In [15]:
class RF(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.clf = RandomForestRegressor(n_estimators=200,
                                         verbose=True,
                                         max_depth=35,
                                         min_samples_split=2,
                                         min_samples_leaf=1
                                         )
        self.clf.fit(X_train, np.log(y_train))
        print("MAPE on validation set is: ", self.evaluate(X_val, y_val))
        print("R squared on validation set is: ", self.rsqr_score(X_val, y_val))

    def guess(self, features):
        return np.exp(self.clf.predict(features))


In [16]:
class SVM(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.X_train = X_train
        self.y_train = y_train
        self.__normalize_data()
        self.clf = SVR(kernel='linear', degree=3, gamma='auto', coef0=0.0, tol=0.001,
                       C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1)

        self.clf.fit(self.X_train, np.log(self.y_train))
        print("MAPE on validation set is: ", self.evaluate(X_val, y_val))
        print("R squared on validation set is: ", self.rsqr_score(X_val, y_val))

    def __normalize_data(self):
        self.scaler = StandardScaler()
        self.X_train = self.scaler.fit_transform(self.X_train)

    def guess(self, feature):
        return np.exp(self.clf.predict(feature))

In [17]:
class XGB(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        dtrain = xgb.DMatrix(X_train, label=np.log(y_train))
        evallist = [(dtrain, 'train')]
        params = {'nthread': -1,
                  'max_depth': 7,
                  'eta': 0.02,
                  'silent': 1,
                  'objective': 'reg:squarederror',
                  'colsample_bytree': 0.7,
                  'subsample': 0.7}
        num_round = 3000
        self.bst = xgb.train(params,
                             dtrain,
                             num_round,
                             evallist,
                             verbose_eval = 50,
                             early_stopping_rounds = 50)
        print("MAPE on validation set is: ", self.evaluate(X_val, y_val))
        print("R squared on validation set is: ", self.rsqr_score(X_val, y_val))

    def guess(self, features):
        dtest = xgb.DMatrix(features)
        return np.exp(self.bst.predict(dtest))

In [18]:
class KNN(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)
        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
        self.clf.fit(self.normalizer.transform(X_train), np.log(y_train))
        print("MAPE on validation set is: ", self.evaluate(self.normalizer.transform(X_val), y_val))
        print("R squared on validation set is: ", self.rsqr_score(self.normalizer.transform(X_val), y_val))

    def guess(self, feature):
        return np.exp(self.clf.predict(self.normalizer.transform(feature)))

In [52]:
class NN(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.epochs = 10
        self.checkpointer = ModelCheckpoint(filepath="models/best_model_weights.hdf5", verbose=1, save_best_only=True)
        self.max_log_y = max(np.max(np.log(y_train)), np.max(np.log(y_val)))
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def __build_keras_model(self):
        self.model = Sequential()
        self.model.add(Dense(1000, kernel_initializer="uniform", input_dim=1181))
        self.model.add(Activation('relu'))
        self.model.add(Dense(500, kernel_initializer="uniform"))
        self.model.add(Activation('relu'))
        self.model.add(Dense(1))
        self.model.add(Activation('sigmoid'))

        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def _val_for_fit(self, val):
        val = np.log(val) / self.max_log_y
        return val

    def _val_for_pred(self, val):
        return np.exp(val * self.max_log_y)

    def fit(self, X_train, y_train, X_val, y_val):
        self.model.fit(X_train, self._val_for_fit(y_train),
                       validation_data=(X_val, self._val_for_fit(y_val)),
                       epochs=self.epochs, batch_size=128,
                       # callbacks=[self.checkpointer],
                       )
        # self.model.load_weights('models/best_model_weights.hdf5')
        print("MAPE on validation set is: ", self.evaluate(X_val, y_val))
        print("R squared on validation set is: ", self.rsqr_score(X_val, y_val))

    def guess(self, features):
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)


# Training

In [41]:
print("Fitting Linear Model...")
LinearModel(X_train, y_train, X_val, y_val)

Fitting Linear Model...
Result on validation set is:  0.3025476419720422
R squared on validation set is:  0.12065762984475714


<__main__.LinearModel at 0x7fa760972410>

In [49]:
print("Fitting Ridge Model...")
ridge_model = RidgeRegression(X_train, y_train, X_val, y_val)

Fitting Ridge Model...
MAPE on validation set is:  0.14558901746469502
R squared on validation set is:  0.7905570253579708


In [63]:
print("Fitting KNN...")
KNN(X_train, y_train, X_val, y_val)

Fitting KNN...
MAPE on validation set is:  0.19840082150007332
R squared on validation set is:  0.5062692270813305


<__main__.KNN at 0x7fa7207bff50>

In [None]:
print("Fitting SVM...")
SVM(X_train, y_train, X_val, y_val)

Fitting SVM...


In [20]:
print("Fitting Random Forest Model...")
rf_model = RF(X_train, y_train, X_val, y_val)

Fitting Random Forest Model...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  7.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    3.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


MAPE on validation set is:  0.15194456273543347
R squared on validation set is:  0.7049265011524837


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    2.9s finished


In [47]:
print("Fitting XGB Model...")
xgb_model = XGB(X_train, y_train, X_val, y_val)

Fitting XGB Model...
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:8.09899
Will train until train-rmse hasn't improved in 50 rounds.
[50]	train-rmse:2.97030
[100]	train-rmse:1.13397
[150]	train-rmse:0.53058
[200]	train-rmse:0.37779
[250]	train-rmse:0.34390
[300]	train-rmse:0.33204
[350]	train-rmse:0.32451
[400]	train-rmse:0.31565
[450]	train-rmse:0.30715
[500]	train-rmse:0.29979
[550]	train-rmse:0.29390
[600]	train-rmse:0.28630
[650]	train-rmse:0.28104
[700]	train-rmse:0.27486
[750]	train-rmse:0.26927
[800]	train-rmse:0.26436
[850]	train-rmse:0.26021
[900]	train-rmse:0.25626
[950]	train-rmse:0.25089
[1000]	train-rmse:0.24656
[1050]	train-rmse:0.24272
[1100]	train-rmse:0.23930
[1150]	train-rmse:0.23561
[1200]	train-rmse:0.23224
[1250]	train

<__main__.XGB at 0x7fa7533bd1d0>

In [53]:
print("Fitting NN...")
nn_model = NN(X_train, y_train, X_val, y_val)

Fitting NN...
Train on 759904 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
MAPE on validation set is:  5.826428694123841
R squared on validation set is:  -127.27416796877921


# Save models

In [21]:
with open(rf_model_dump, 'wb') as f:
    pickle.dump(rf_model, f, -1)

In [None]:
with open(xgb_model_dump, 'wb') as f:
    pickle.dump(xgb_model, f, -1)

In [27]:
with open(nn_model_dump, 'wb') as f:
    pickle.dump(nn_model, f, -1)