
# Explore embedding feature


In [2]:
import pandas as pd
import numpy as np

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below — consider a narrower filter.
warnings.simplefilter("ignore")
# Lift the column cap so wide DataFrames are displayed in full.
pd.options.display.max_columns = None

In [None]:
from sklearn import linear_model

from keras.models import Sequential
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.layers.embeddings import Embedding
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

import pickle

Using TensorFlow backend.


In [None]:
# Fraction of rows, counted from the start of the data, used for training;
# the remaining tail is the validation set.
TRAIN_RATIO = 0.9

# When True, the training rows are permuted after the train/validation split.
SHUFFLE_DATA = False
# When True, the trained embedding matrices are pickled to disk after training.
SAVED_EMBEDDING = True
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
EMBEDDING_AS_INPUT = True

In [None]:
# Pickle holding the list of embedding weight matrices extracted from the Keras model.
saved_embeddings_fname = "models/embeddings.pickle"
# Pickle holding the list of fitted LabelEncoders, one per encoded column.
label_encoder_pickle = 'models/les.pickle'
# Pickle holding the ridge estimator fitted on the embedded features.
embedded_ridge_model = 'models/embedded_ridge_model.pickle'

# Read input data

In [5]:
# Load the pre-processed table, then split off the regression target:
# `pop` removes the 'Sales' column from X and returns it as y.
processed_path = 'processed/'
processed_file = 'processed_data.csv'
X = pd.read_csv(f"{processed_path}{processed_file}")
y = X.pop('Sales')

In [1]:
def consolidate_holiday(df):
    """Collapse the one-hot holiday flags into a single integer ``holiday`` column.

    Codes: 0 = no flag set, 1 = public, 2 = easter, 3 = christmas. When a row
    has several flags set, the highest code wins (christmas over easter over
    public), because the codes are assigned in ascending order.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``holiday_public``, ``holiday_no``,
        ``holiday_easter`` and ``holiday_christmas``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the four flag columns and with the ``holiday``
        column holding the consolidated code.

    Raises
    ------
    KeyError
        If any of the four flag columns is missing.
    """
    flag_columns = ['holiday_public', 'holiday_no', 'holiday_easter', 'holiday_christmas']
    # `drop` returns a new frame, so the assignments below never touch `df`.
    consolidated = df.drop(columns=flag_columns)
    consolidated['holiday'] = 0

    coded_flags = ['holiday_public', 'holiday_easter', 'holiday_christmas']
    for code, flag in enumerate(coded_flags, start=1):
        # A plain boolean array avoids any index re-alignment between the two frames.
        is_set = (df[flag] == 1).to_numpy()
        consolidated.loc[is_set, 'holiday'] = code

    return consolidated

In [None]:
X = consolidate_holiday(X)

In [None]:
# Keep only the seven model inputs, in the column order that the positional
# indexing further down relies on, then take a NumPy copy of the values.
X = X.loc[
    :,
    ['Store', 'year', 'weekofyear', 'DayOfWeek', 'holiday', 'Promo', 'SchoolHoliday'],
]
X_np = X.to_numpy()

In [10]:
# Number of distinct values per feature column, one per line.
for n_unique in X.nunique():
    print(n_unique)

1115
3
52
7
4
2
2


In [12]:
# Only the first 5 columns need encoding; each one gets its own LabelEncoder
# and the column in X_np is overwritten in place with the encoded codes.
# The encoders are collected in `les`, in column order.
les = []
for i in range(5):
    le = LabelEncoder().fit(X_np[:, i])
    X_np[:, i] = le.transform(X_np[:, i])
    les.append(le)

In [13]:
# Persist the fitted encoders so that new rows can be encoded the same way later.
with open(label_encoder_pickle, 'wb') as f:
    pickle.dump(les, f, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Classes learned by the most recently fitted encoder (the last entry of `les`).
les[-1].classes_

array(['0', '1', '2', '3'], dtype=object)

# Split data

In [17]:
# Positional hold-out split: the first TRAIN_RATIO share of the rows is the
# training set and the remaining tail is the validation set.
num_record = len(X_np)
train_size = int(TRAIN_RATIO * num_record)
X_train, X_val = X_np[:train_size], X_np[train_size:]
y_train, y_val = y.iloc[:train_size], y.iloc[train_size:]

In [16]:
# Optionally permute the training rows; features and targets share one permutation.
if SHUFFLE_DATA:
    print("Using shuffled data")
    sh = np.arange(X_train.shape[0])
    np.random.shuffle(sh)
    X_train = X_train[sh]
    # `sh` holds row positions, so select positionally. Plain `y_train[sh]` is a
    # label lookup on a Series and is only correct while the index is the default 0..n-1.
    y_train = y_train.iloc[sh]

Using shuffled data


# Build models

In [26]:
class Model(object):
    """Base class providing validation metrics for sales predictors.

    Subclasses implement ``guess(features)``, which returns predicted sales on
    the original (not log-transformed) scale; the metrics below rely on it.
    """

    def guess(self, features):
        """Return predicted sales for ``features``; must be overridden."""
        raise NotImplementedError("subclasses of Model must implement guess()")

    def evaluate(self, X_val, y_val):
        """Return the mean absolute percentage error on the validation set.

        Parameters
        ----------
        X_val : array-like
            Validation features, passed straight to ``guess``.
        y_val : array-like
            True sales; every value must be strictly positive because each
            error is divided by the true value.

        Returns
        -------
        float
            ``mean(|y_val - guess(X_val)| / y_val)``.
        """
        # Vectorised reductions instead of the Python builtins, which would
        # iterate over the validation set one element at a time.
        assert np.min(y_val) > 0
        guessed_sales = self.guess(X_val)
        relative_err = np.absolute((y_val - guessed_sales) / y_val)
        result = np.sum(relative_err) / len(y_val)
        return result

    def rsqr_score(self, X_val, y_val):
        """Return the adjusted R squared on the validation set.

        The adjustment uses ``len(y_val)`` observations and ``X_val.shape[1]``
        predictors, so ``X_val`` must be two-dimensional.
        """
        assert np.min(y_val) > 0
        guessed_sales = self.guess(X_val)
        ss_residual = np.sum((y_val - guessed_sales) ** 2)
        ss_total = np.sum((y_val - np.mean(y_val)) ** 2)
        r_squared = 1 - float(ss_residual) / ss_total

        n_obs = len(y_val)
        n_predictors = X_val.shape[1]
        adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (
            n_obs - n_predictors - 1
        )
        return adjusted_r_squared

In [82]:
class RidgeRegression(Model):
    """Ridge regression (alpha=1.0) fitted on the natural log of sales.

    Constructing an instance fits the estimator and prints the MAPE and the
    adjusted R squared on the validation set.
    """

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        log_sales = np.log(y_train)
        self.clf = linear_model.Ridge(alpha=1.0)
        self.clf.fit(X_train, log_sales)

        validation_mape = self.evaluate(X_val, y_val)
        print("MAPE on validation set is: ", validation_mape)
        validation_r2 = self.rsqr_score(X_val, y_val)
        print("R squared on validation set is: ", validation_r2)

    def guess(self, features):
        """Predict sales: the estimator outputs log-sales, so exponentiate."""
        log_prediction = self.clf.predict(features)
        return np.exp(log_prediction)

    def model(self):
        """Return ``(coefficients, intercept, fitted estimator)``."""
        return self.clf.coef_, self.clf.intercept_, self.clf

In [153]:
X.columns

Index(['Store', 'year', 'weekofyear', 'DayOfWeek', 'holiday', 'Promo',
       'SchoolHoliday'],
      dtype='object')

In [154]:
def split_features(X):
    """Split the feature matrix into seven single-column arrays.

    The columns are taken by position along the last axis, in the order
    Store, year, weekofyear, DayOfWeek, holiday, Promo, SchoolHoliday.
    Indexing with a one-element list keeps the trailing axis, so a
    ``(n, 7)`` input yields seven ``(n, 1)`` arrays.
    """
    n_features = 7
    return [X[..., [column]] for column in range(n_features)]

In [155]:
class NN_with_entity_embedding(Model):
    """Keras network that learns entity embeddings for the categorical features.

    Store, year, week of year, day of week and holiday each go through their own
    Embedding layer; Promo and SchoolHoliday each go through a Dense(1) layer.
    The seven outputs are concatenated and fed through two ReLU hidden layers
    (1000 and 500 units) into a single sigmoid output.

    The network is built and trained inside ``__init__``. The trained Keras
    model is stored in the ``model`` attribute (an attribute here, whereas
    ``RidgeRegression.model`` is a method).
    """

    def __init__(self, X_train, y_train, X_val, y_val):
        """Build the network and train it for ``self.epochs`` epochs."""
        super().__init__()
        self.epochs = 1
        #         self.checkpoint = ModelCheckpoint(filepath='best_model_weights.hdf5', verbose=1, save_best_only=True)
        # Scale factor that maps the log target into the sigmoid's output range.
        # NOTE(review): computed with log1p here, while _val_for_fit and
        # _val_for_pred use log/exp — confirm the mismatch is intended.
        self.max_log_y = max(np.max(np.log1p(y_train)), np.max(np.log1p(y_val)))
        # self.__build_keras_model()
        self.__build_keras_model_for_ridge_embedding()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        """Split the feature matrix into the list of per-feature inputs the network takes."""
        X_list = split_features(X)
        return X_list

    def __build_keras_model_for_ridge_embedding(self):
        """Create and compile the Keras model and store it in ``self.model``.

        The embedding layers are named so their weights can be looked up by
        name after training. The input cardinalities (1115, 3, 52, 7, 4) are
        hard-coded; presumably they match the number of distinct label-encoded
        values per column — verify against the data.
        """
        input_store = Input(shape=(1,))
        output_store = Embedding(1115, 500, name="store_embedding")(input_store)
        output_store = Reshape(target_shape=(500,))(output_store)

        input_year = Input(shape=(1,))
        output_year = Embedding(3, 2, name="year_embedding")(input_year)
        output_year = Reshape(target_shape=(2,))(output_year)

        input_week = Input(shape=(1,))
        output_week = Embedding(52, 24, name="weekofyear")(input_week)
        output_week = Reshape(target_shape=(24,))(output_week)

        input_dow = Input(shape=(1,))
        output_dow = Embedding(7, 6, name="dow_embedding")(input_dow)
        output_dow = Reshape(target_shape=(6,))(output_dow)

        input_holiday = Input(shape=(1,))
        output_holiday = Embedding(4, 2, name="holiday_embedding")(input_holiday)
        output_holiday = Reshape(target_shape=(2,))(output_holiday)

        # The two remaining features are not embedded; each passes through a Dense(1) layer.
        input_promo = Input(shape=(1,))
        output_promo = Dense(1)(input_promo)

        input_SchoolHoliday = Input(shape=(1,))
        output_SchoolHoliday = Dense(1)(input_SchoolHoliday)

        # Input order must match the list returned by split_features.
        input_model = [
            input_store,
            input_year,
            input_week,
            input_dow,
            input_holiday,
            input_promo,
            input_SchoolHoliday,
        ]
        output_embeddings = [
            output_store,
            output_year,
            output_week,
            output_dow,
            output_holiday,
            output_promo,
            output_SchoolHoliday,
        ]

        output_model = Concatenate()(
            output_embeddings
        )  # join the per-feature outputs into one vector
        output_model = Dense(1000, kernel_initializer="uniform")(output_model)
        output_model = Activation("relu")(output_model)
        output_model = Dense(500, kernel_initializer="uniform")(output_model)
        output_model = Activation("relu")(output_model)
        output_model = Dense(1)(output_model)
        output_model = Activation("sigmoid")(output_model)  # Because the target is rescaled to [0,1]

        self.model = KerasModel(inputs=input_model, outputs=output_model)

        # self.model.compile(loss='mean_absolute_error', optimizer='adam')
        self.model.compile(
            loss=tf.keras.losses.MeanAbsolutePercentageError(), optimizer="sgd"
        )

    def _val_for_fit(self, val):
        """Transform sales to the training target: log(val) divided by ``max_log_y``."""
        return np.log(val) / self.max_log_y

    def _val_for_pred(self, val):
        """Invert ``_val_for_fit``: map a network output back to the sales scale."""
        return np.exp(val * self.max_log_y)

    def fit(self, X_train, y_train, X_val, y_val):
        """Train on the transformed targets, then print the validation score from ``evaluate``."""
        self.model.fit(
            self.preprocessing(X_train),
            self._val_for_fit(y_train),
            validation_data=(self.preprocessing(X_val), self._val_for_fit(y_val)),
            epochs=self.epochs,
            batch_size=128,
            # callbacks=[self.checkpoint]
        )
        # self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        """Predict sales on the original scale for a feature matrix."""
        features = self.preprocessing(features)
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)


# Model training

In [168]:
# Constructing the network also trains it; keep the trained instance in a list.
print("Fitting NN_with_entity_embedding just for one epoch")
models = [NN_with_entity_embedding(X_train, y_train, X_val, y_val)]

Fitting NN_with_entity_embedding...
Train on 759904 samples, validate on 84434 samples
Epoch 1/1
Result on validation data:  0.20744442814690786


In [192]:
# Pull the weight matrix out of each named embedding layer and pickle them as
# one list. The list order (store, year, week, day of week, holiday) is the
# order in which the embeddings are indexed when they are loaded again.
if SAVED_EMBEDDING:
    model = models[0].model
    (
        store_embedding,
        year_embedding,
        weekofyear,
        dow_embedding,
        holiday_embedding,
    ) = [
        model.get_layer(layer_name).get_weights()[0]
        for layer_name in (
            'store_embedding',
            'year_embedding',
            'weekofyear',
            'dow_embedding',
            'holiday_embedding',
        )
    ]

    with open(saved_embeddings_fname, 'wb') as f:
        pickle.dump(
            [store_embedding, year_embedding, weekofyear, dow_embedding, holiday_embedding],
            f,
            protocol=pickle.HIGHEST_PROTOCOL,
        )

In [19]:
X.columns

Index(['Store', 'year', 'weekofyear', 'DayOfWeek', 'holiday', 'Promo',
       'SchoolHoliday'],
      dtype='object')

# Add embedding features for Ridge regression

In [20]:
def embed_features(X, saved_embeddings_fname):
    """Replace label-encoded categorical columns with their learned embedding vectors.

    Columns 0-4 are each looked up in the embedding matrix with the same
    position in the pickled list; every other column is kept as a single
    integer feature. The pieces are concatenated in the original column order.

    Parameters
    ----------
    X : array-like, 2-D
        One row per record. Every value must be convertible to ``int``, and the
        codes in columns 0-4 must be valid row indices of the matching matrix.
    saved_embeddings_fname : str
        Path to a pickle holding a list of embedding matrices.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per record and one column per embedded value
        or passthrough feature. An empty input gives an empty 1-D array.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files this
    # notebook wrote itself.
    with open(saved_embeddings_fname, "rb") as f_embeddings:
        embeddings = pickle.load(f_embeddings)

    # first number is the column number of the data
    # second number is the position of the matrix in the pickled list
    index_embedding_mapping = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}

    X = np.asarray(X)
    if len(X) == 0:
        return np.array([])

    # One integer conversion for the whole matrix (also handles object arrays).
    codes = X.astype(int)

    # Build one block per column with a vectorised lookup rather than
    # assembling a Python list per record.
    blocks = []
    for i in range(codes.shape[1]):
        column = codes[:, i]
        if i in index_embedding_mapping:
            weights = np.asarray(embeddings[index_embedding_mapping[i]])
            blocks.append(weights[column])
        else:
            blocks.append(column.reshape(-1, 1))

    return np.hstack(blocks)

In [21]:
X_embedded = embed_features(X_np, saved_embeddings_fname)

In [22]:
# Reuse the same train/validation boundary for the embedded feature matrix.
X_train_embedded, X_val_embedded = X_embedded[:train_size], X_embedded[train_size:]

In [203]:
# Baseline: ridge regression on the raw feature matrix, without embeddings.
# The constructor fits the model and prints both validation metrics.
print("Fitting Ridge Model without embedding")
ridge_model = RidgeRegression(X_train, y_train, X_val, y_val)

Fitting Ridge Model without embedding
MAPE on validation set is:  0.3025903570648116
R squared on validation set is:  0.12080269757253848


In [84]:
# Same ridge setup, now on the embedded features.
# NOTE: this rebinds `ridge_model`; from here on it refers to the embedded model.
print("Fitting Ridge Model with embedding")
ridge_model = RidgeRegression(X_train_embedded, y_train, X_val_embedded, y_val)

Fitting Ridge Model with embedding
MAPE on validation set is:  0.14669741102180572
R squared on validation set is:  0.788096684990228


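Simply using the embedding features significantly improves the score: validation MAPE drops from about 0.30 to 0.15, and adjusted R squared rises from about 0.12 to 0.79.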

In [86]:
# Unpack the fitted ridge: coefficient vector, intercept, and the estimator itself.
# NOTE: this rebinds the global `model`, which an earlier cell used for the Keras network.
coef, intercept, model = ridge_model.model()

In [87]:
# Persist the fitted ridge estimator for later reuse.
with open(embedded_ridge_model, 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
coef.shape

(536,)

In [47]:
# Coefficient of the SchoolHoliday flag: the last feature, passed through without embedding.
coef[-1:]

array([0.03361238])

In [46]:
# Coefficient of the Promo flag: the second-to-last feature, passed through without embedding.
coef[-2:-1]

array([0.32545082])

In [43]:
# Coefficients of the two holiday-embedding dimensions.
# These weight embedding axes, not individual holiday categories.
coef[-4:-2]

array([1.60582696, 0.65275221])

In [45]:
# Coefficients of the six day-of-week embedding dimensions.
# These weight embedding axes, not individual weekdays.
coef[-10:-4]

array([-0.10039099,  0.04165739, -0.00163901,  0.39482345,  0.46062902,
        0.98530572])

In [31]:
intercept

8.61187205595797

# Make a prediction

In [64]:
# Two rows that are identical except for Promo (0 vs 1), so the predictions
# isolate the promotion effect. 'holiday' is given as a string.
prediction_data = pd.DataFrame(
    {
        'Store': [1, 1],
        'year': [2015, 2015],
        'weekofyear': [31, 31],
        'DayOfWeek': [5, 5],
        'holiday': ['0', '0'],
        'Promo': [0, 1],
        'SchoolHoliday': [0, 0],
    }
)

In [65]:
prediction_data

Unnamed: 0,Store,year,weekofyear,DayOfWeek,holiday,Promo,SchoolHoliday
0,1,2015,31,5,0,0,0
1,1,2015,31,5,0,1,0


In [66]:
pred_np = prediction_data.to_numpy()

In [67]:
pred_np

array([[1, 2015, 31, 5, '0', 0, 0],
       [1, 2015, 31, 5, '0', 1, 0]], dtype=object)

In [68]:
# Load the pickled list of LabelEncoders; this rebinds the global `les`.
# NOTE: pickle.load can execute arbitrary code — only load files written by this notebook.
with open(label_encoder_pickle, 'rb') as f:
    les = pickle.load(f)

In [69]:
for i in range(5):
    print(les[i].classes_)

[1 2 3 ... 1113 1114 1115]
[2013 2014 2015]
[1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
 52]
[1 2 3 4 5 6 7]
['0' '1' '2' '3']


In [70]:
# Only the first 5 columns need encoding; each is transformed in place with
# the encoder at the same position in `les`.
for i in range(5):
    encoded_column = les[i].transform(pred_np[:, i])
    pred_np[:, i] = encoded_column

In [72]:
pred_embedded = embed_features(pred_np, saved_embeddings_fname)

In [78]:
# guess() exponentiates the ridge output, so the values are on the sales scale;
# astype(int) truncates them to whole numbers.
prediction_data['sales_prediction']= ridge_model.guess(pred_embedded).astype(int)

In [79]:
prediction_data

Unnamed: 0,Store,year,weekofyear,DayOfWeek,holiday,Promo,SchoolHoliday,sales_prediction
0,1,2015,31,5,0,0,0,4366
1,1,2015,31,5,0,1,0,6045
