# Baseline of non-promo sales

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle
import warnings

# Notebook-wide settings: silence every warning and let pandas render all
# columns of wide frames instead of eliding the middle ones.
warnings.simplefilter(action="ignore")
pd.set_option("display.max_columns", None)

In [2]:
import matplotlib.pyplot as plt

# Read data

In [37]:
# Pickled artifacts read further down in this notebook: the embedding tables,
# the label encoders and the ridge model all live in the same directory.
models_dir = "models/"
saved_embeddings_fname = models_dir + "embeddings.pickle"
label_encoder_pickle = models_dir + "les.pickle"
embedded_ridge_model = models_dir + "embedded_ridge_model.pickle"

In [38]:
# Directory and file name of the processed data set, kept separate so either
# part can be changed on its own.
processed_path = "processed/"
processed_file = "processed_data.csv"

# Load the whole CSV into the working frame used by every later cell.
df = pd.read_csv(f"{processed_path}{processed_file}")

# Baseline: set Promo = 0

In [39]:
# Overwrite the Promo flag with 0 on every row, so the features built below
# describe each day as if no promotion had been running.
df['Promo'] = 0

# Preprocessing

In [40]:
def consolidate_holiday(df):
    """Collapse the separate holiday flag columns into one ``holiday`` column.

    The resulting column holds string codes: ``'0'`` when none of the three
    flags below is set, ``'1'`` where ``holiday_public == 1``, ``'2'`` where
    ``holiday_easter == 1`` and ``'3'`` where ``holiday_christmas == 1``.
    If several flags are set on the same row, the rule applied last wins
    (Christmas over Easter over public).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``holiday_public``, ``holiday_no``,
        ``holiday_easter`` and ``holiday_christmas``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the four ``holiday_*`` input columns and with the
        ``holiday`` column added. The frame passed in is not modified.

    Raises
    ------
    KeyError
        If any of the four ``holiday_*`` columns is missing from ``df``.
    """
    flag_codes = (
        ('holiday_public', '1'),
        ('holiday_easter', '2'),
        ('holiday_christmas', '3'),
    )

    # ``drop`` returns a new frame; all writes below go to that frame so the
    # caller's DataFrame never gets a stray ``holiday`` column.
    out = df.drop(columns=['holiday_public', 'holiday_no', 'holiday_easter', 'holiday_christmas'])
    out['holiday'] = '0'

    for flag_col, code in flag_codes:
        # Positional boolean mask taken from the untouched input frame.
        mask = (df[flag_col] == 1).to_numpy()
        out.loc[mask, 'holiday'] = code

    return out

In [41]:
# Replace the holiday flag columns by the single 'holiday' code column.
# NOTE: this cell cannot be run twice on the same frame - after the first run
# the holiday_* columns are gone and the function raises KeyError.
df = consolidate_holiday(df)

In [42]:
# Model inputs. The column order matters: the first five columns are the ones
# that get label-encoded and embedded below, the last two are passed through.
feature_columns = [
    'Store', 'year', 'weekofyear', 'DayOfWeek', 'holiday',
    'Promo', 'SchoolHoliday',
]
X = df[feature_columns]
X_np = X.to_numpy()

In [43]:
# Restore the pickled label encoders; les[i] is the encoder applied to
# feature column i. Only load pickle files from a trusted source.
with open(label_encoder_pickle, mode='rb') as encoder_file:
    les = pickle.load(encoder_file)

In [44]:
# Only the first 5 columns need to be encoded; each is overwritten in place
# with the codes produced by its own encoder.
for col_idx in range(5):
    encoder = les[col_idx]
    X_np[:, col_idx] = encoder.transform(X_np[:, col_idx])

In [45]:
def embed_features(X, saved_embeddings_fname):
    """Replace the encoded categorical columns of ``X`` by embedding vectors.

    Parameters
    ----------
    X : iterable of records
        Each record is a sequence of feature values that ``int()`` accepts.
        Values in columns 0-4 are used as row indices into the matching
        embedding table; values in any further column are kept as ints.
    saved_embeddings_fname : str
        Path of a pickle file holding the embedding tables.
        ``embeddings[k][code]`` must return an object with ``.tolist()``.

    Returns
    -------
    numpy.ndarray
        One row per record: the embedding vectors of columns 0-4 followed by
        the remaining columns, concatenated in column order.
    """
    # The handle is closed as soon as the tables are loaded.
    # NOTE: unpickling can execute arbitrary code - only use trusted files.
    with open(saved_embeddings_fname, "rb") as f_embeddings:
        embeddings = pickle.load(f_embeddings)

    # key: column number in the data, value: index of the embedding table
    index_embedding_mapping = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}
    X_embedded = []

    for record in X:
        embedded_features = []
        for i, feat in enumerate(record):
            feat = int(feat)
            embedding_index = index_embedding_mapping.get(i)
            if embedding_index is None:
                # Column has no embedding table: keep the integer value.
                embedded_features.append(feat)
            else:
                embedded_features.extend(embeddings[embedding_index][feat].tolist())

        X_embedded.append(embedded_features)

    return np.array(X_embedded)

In [46]:
# Turn the label-encoded feature matrix into the embedded matrix that is fed
# to the model below.
pred_embedded = embed_features(X_np, saved_embeddings_fname)

# Prediction

In [47]:
# Restore the pickled model that is applied to the embedded features.
# Only load pickle files from a trusted source.
with open(embedded_ridge_model, mode='rb') as model_file:
    ridge_model = pickle.load(model_file)

In [59]:
# The model output is exponentiated and truncated to whole numbers
# (presumably the model was fitted on log-sales - TODO confirm).
raw_prediction = ridge_model.predict(pred_embedded)
df['prediction'] = np.exp(raw_prediction).astype(int)

# Baseline = prediction limited, row by row, to the range [0, actual Sales].
df['baseline'] = df['prediction'].clip(lower=0, upper=df['Sales'])

In [60]:
# Uplift = actual sales minus the baseline; since the baseline is capped at
# Sales above, this is never negative.
df['uplift'] = df['Sales'] - df['baseline']

# Plot one store

In [61]:
# Keep only the rows of store 1 for the single-store view of this section.
test_store = df.loc[df['Store']==1]

In [62]:
# Show the full frame with the prediction, baseline and uplift columns
# (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,Store,DayOfWeek,Sales,Promo,SchoolHoliday,year,weekofyear,holiday,sales_prediction,baseline,uplift,prediction
0,1097,2,5961,0,1,2013,1,1,7801,5961,0,7801
1,85,2,4220,0,1,2013,1,1,5666,4220,0,5666
2,259,2,6851,0,1,2013,1,1,9103,6851,0,9103
3,262,2,17267,0,1,2013,1,1,15929,15929,1338,15929
4,274,2,3102,0,1,2013,1,1,2919,2919,183,2919
...,...,...,...,...,...,...,...,...,...,...,...,...
844333,745,5,8363,0,1,2015,31,0,6552,6552,1811,6552
844334,746,5,9082,0,1,2015,31,0,5546,5546,3536,5546
844335,747,5,10708,0,1,2015,31,0,7347,7347,3361,7347
844336,741,5,11253,0,1,2015,31,0,7195,7195,4058,7195
