# Supermarket Sales Dataset

Description:
* 1000 rows, 6 parent and 7 child features

Parent features:
* Branch, City, Customer type, Gender, Product line, Unit price

Child features:
* Quantity, Tax 5%, Total, Payment, plus Month, Day and Hour (derived from the raw Date and Time columns)

Source: https://www.kaggle.com/aungpyaeap/supermarket-sales

In [1]:
import os
import sys

# Make the project-local packages imported further down (model, hyperopt, visualisation, ...)
# importable. The checkout location can be overridden with the GRAB_PROJECT_ROOT environment
# variable; the original hard-coded location remains the fallback so existing setups keep working.
PROJECT_ROOT = os.environ.get("GRAB_PROJECT_ROOT", "C:/Users/notes/grab")

# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the supermarket sales table into a DataFrame.
# The path is relative, so it resolves against the notebook's current working directory.
sm = pd.read_csv("../data/supermarket.csv")

In [4]:
# "Parent" features: five categorical attributes plus the numeric unit price.
parent_columns = ['Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Unit price']
parent = sm[parent_columns]

# Preview the first rows of the selection.
parent.head()

Unnamed: 0,Branch,City,Customer type,Gender,Product line,Unit price
0,A,Yangon,Member,Female,Health and beauty,74.69
1,C,Naypyitaw,Normal,Female,Electronic accessories,15.28
2,A,Yangon,Normal,Male,Home and lifestyle,46.33
3,A,Yangon,Member,Male,Health and beauty,58.22
4,A,Yangon,Normal,Male,Sports and travel,86.31


In [5]:
# "Child" features: three numeric columns, the raw date/time strings and the payment method.
child_columns = ['Quantity', 'Tax 5%', 'Total', 'Date', 'Time', 'Payment']
child = sm[child_columns]

# Preview the first rows of the selection.
child.head()

Unnamed: 0,Quantity,Tax 5%,Total,Date,Time,Payment
0,7,26.1415,548.9715,1/5/2019,13:08,Ewallet
1,5,3.82,80.22,3/8/2019,10:29,Cash
2,7,16.2155,340.5255,3/3/2019,13:23,Credit card
3,8,23.288,489.048,1/27/2019,20:33,Ewallet
4,7,30.2085,634.3785,2/8/2019,10:37,Ewallet


In [6]:
from sklearn.preprocessing import LabelEncoder

# Categorical parent columns; each one is label-encoded to integer codes with its own encoder.
categorical_features = ['Branch', 'City', 'Customer type', 'Gender', 'Product line']

t_parent = []
for feature in categorical_features:
    le = LabelEncoder()
    t_parent.append(le.fit_transform(parent[feature]))

# Stack the per-feature code vectors as COLUMNS, giving shape (n_rows, n_categorical).
# The previous `np.stack([t_parent], axis=-1).reshape((1000, 5))` produced a (5, 1000, 1) array
# and reshaped it in C order, so every output row held five consecutive values of a single
# feature instead of one value per feature -- rows of X no longer lined up with rows of y.
# Stacking the list directly keeps each sample on its own row and drops the hard-coded 1000.
t_parent = np.stack(t_parent, axis=-1)
assert t_parent.shape == (len(parent), len(categorical_features))

# Append the numeric unit price as the last column and cast the whole matrix to float32.
X = np.append(t_parent, parent[['Unit price']], axis=1).astype(np.float32)

In [7]:
# Encode the payment method as integer codes, shaped as a single column.
le = LabelEncoder()
t_child = le.fit_transform(child['Payment']).reshape(-1, 1)

# Split the date and time strings into numeric month / day / hour parts.
parsed_date = pd.to_datetime(child['Date'])
child_month = parsed_date.dt.month
child_day = parsed_date.dt.day
child_hour = pd.to_datetime(child['Time']).dt.hour
child_dt = pd.concat([child_month, child_day, child_hour], axis=1).values

# Target matrix, column order: Quantity, Tax 5%, Total, payment code, month, day, hour.
numeric_targets = child[['Quantity', 'Tax 5%', 'Total']]
y = np.concatenate([numeric_targets, t_child, child_dt], axis=1).astype(np.float32)

# Model Evaluation

In [61]:
from model.gmm import GMM
from model.cgan import CGAN
from model.bayesnn import BayesNN
from model.mcdropout import MCDropout
from model.deepensemble import DeepEnsemble
from hyperopt.hyperopt import (hyperopt,
                               hyperopt_log,
                               hyperparam_plot,
                               performance_plot)
from visualisation.visualisation import (plot_surface,
                                         plot_prob_fixed,
                                         plot_violin,
                                         plot_prob_violin,
                                         plot_binned_violin)
from data.synthetic import plot_data

# Model: Gaussian Mixture Density Network

In [None]:
# Fixed settings handed to hyperopt together with the model class.
params = dict(x_features=6, y_features=7, epochs=20000)

# Hyperparameters to search. Each value is a two-element list,
# presumably [lower, upper] bounds -- confirm against hyperopt.
opt_params = dict(n_components=[1, 100], n_hidden=[1, 100])

# Run the search for the GMM model (30 trials).
gmm, best_params, best_vals, experiment, exp_model = hyperopt(
    GMM, params, opt_params, X, y, trials=30, val_split=0.8
)

# Predict on the full input matrix, then plot the first two target columns
# (observed first, predicted second) against columns 4-5 of X.
y_hat = gmm.predict(X)
for targets in (y, y_hat):
    plot_data(X[:, 4:6], targets[:, :2])

[INFO 07-14 07:52:01] ax.modelbridge.dispatch_utils: Using Bayesian Optimization generation strategy: GenerationStrategy(name='Sobol+GPEI', steps=[Sobol for 5 trials, GPEI for subsequent trials]). Iterations after 5 will take longer to generate due to  model-fitting.
[INFO 07-14 07:52:01] ax.service.managed_loop: Started full optimization with 30 steps.
[INFO 07-14 07:52:01] ax.service.managed_loop: Running optimization trial 1...
[INFO 07-14 07:53:28] ax.service.managed_loop: Running optimization trial 2...
[INFO 07-14 07:55:00] ax.service.managed_loop: Running optimization trial 3...
[INFO 07-14 07:57:22] ax.service.managed_loop: Running optimization trial 4...
[INFO 07-14 08:01:29] ax.service.managed_loop: Running optimization trial 5...
[INFO 07-14 08:06:19] ax.service.managed_loop: Running optimization trial 6...
[INFO 07-14 08:08:46] ax.service.managed_loop: Running optimization trial 7...
[INFO 07-14 08:12:15] ax.service.managed_loop: Running optimization trial 8...
[INFO 07-14 

In [None]:
# Print the hyperparameter values stored on the returned GMM model.
for attr in ("n_components", "n_hidden"):
    print(f"{attr}: {getattr(gmm, attr)}")

# Show the search log and plot the two searched hyperparameters against each other.
display(hyperopt_log(experiment))
hyperparam_plot(exp_model, "n_components", "n_hidden")

In [None]:
from performance.performance import prob_overlap, kl, js
# Compare observed targets with the GMM predictions via kl (both argument orders) and js.
# Best-effort: a failure is printed instead of raised so the remaining cells can still run.
try:
    p, q = prob_overlap(y, y_hat, bins=5)
    for label, metric, args in (("fKL:", kl, (p, q)), ("rKL:", kl, (q, p)), ("JS:", js, (p, q))):
        print(label, metric(*args))
except Exception as err:
    print(f"Exception: {err}")

# Model: Conditional Generative Adversarial Network

In [None]:
# Fixed settings handed to hyperopt together with the model class.
params = dict(x_features=6, y_features=7, epochs=20000)

# Hyperparameters to search. Each value is a two-element list,
# presumably [lower, upper] bounds -- confirm against hyperopt.
opt_params = dict(
    latent_dim=[1, 100],
    g_hidden=[1, 100],
    d_hidden=[1, 100],
    label_smooth=[0.0, 1.0],
    d_dropout=[0.0, 0.9],
    gp_weight=[0.0, 1.0],
    ds_weight=[0.0, 1.0],
)

# Run the search for the CGAN model (30 trials).
cgan, best_params, best_vals, experiment, exp_model = hyperopt(
    CGAN, params, opt_params, X, y, trials=30, val_split=0.8
)

# Predict on the full input matrix, then plot the first two target columns
# (observed first, predicted second) against columns 4-5 of X.
y_hat = cgan.predict(X)
for targets in (y, y_hat):
    plot_data(X[:, 4:6], targets[:, :2])

In [None]:
# Print the hyperparameter values stored on the returned CGAN model.
cgan_attrs = (
    "latent_dim",
    "g_hidden",
    "d_hidden",
    "label_smooth",
    "d_dropout",
    "gp_weight",
    "ds_weight",
)
for attr in cgan_attrs:
    print(f"{attr}: {getattr(cgan, attr)}")

# Show the search log, then plot the searched hyperparameters in pairs.
display(hyperopt_log(experiment))
for first, second in (("g_hidden", "d_hidden"),
                      ("label_smooth", "d_dropout"),
                      ("gp_weight", "ds_weight")):
    hyperparam_plot(exp_model, first, second)

In [None]:
from performance.performance import prob_overlap, kl, js
# Compare observed targets with the CGAN predictions via kl (both argument orders) and js.
# Best-effort: a failure is printed instead of raised so the remaining cells can still run.
try:
    p, q = prob_overlap(y, y_hat, bins=5)
    for label, metric, args in (("fKL:", kl, (p, q)), ("rKL:", kl, (q, p)), ("JS:", js, (p, q))):
        print(label, metric(*args))
except Exception as err:
    print(f"Exception: {err}")

# Model: Bayesian Neural Network

In [None]:
from model.bayesnn import BayesNN
from hyperopt.hyperopt import hyperopt
from hyperopt.hyperopt import hyperopt_log, hyperparam_plot, performance_plot

# Fixed settings handed to hyperopt together with the model class.
params = dict(x_features=6, y_features=7, epochs=2000)

# Hyperparameters to search. Each value is a two-element list,
# presumably [lower, upper] bounds -- confirm against hyperopt.
opt_params = dict(n_hidden=[1, 100], n_layers=[1, 10])

# Run the search for the BayesNN model (30 trials).
bayesnn, best_params, best_vals, experiment, exp_model = hyperopt(
    BayesNN, params, opt_params, X, y, trials=30, val_split=0.8
)

# Predict on the full input matrix, then plot the first two target columns
# (observed first, predicted second) against columns 4-5 of X.
y_hat = bayesnn.predict(X)
for targets in (y, y_hat):
    plot_data(X[:, 4:6], targets[:, :2])

In [None]:
# Print the hyperparameter values stored on the returned BayesNN model.
for attr in ("n_hidden", "n_layers"):
    print(f"{attr}: {getattr(bayesnn, attr)}")

# Show the search log and plot the two searched hyperparameters against each other.
display(hyperopt_log(experiment))
hyperparam_plot(exp_model, "n_hidden", "n_layers")

In [None]:
from performance.performance import prob_overlap, kl, js
# Compare observed targets with the BayesNN predictions via kl (both argument orders) and js.
# Best-effort: a failure is printed instead of raised so the remaining cells can still run.
try:
    p, q = prob_overlap(y, y_hat, bins=5)
    for label, metric, args in (("fKL:", kl, (p, q)), ("rKL:", kl, (q, p)), ("JS:", js, (p, q))):
        print(label, metric(*args))
except Exception as err:
    print(f"Exception: {err}")

# Model: Monte Carlo Dropout

In [None]:
from model.mcdropout import MCDropout
from hyperopt.hyperopt import hyperopt
from hyperopt.hyperopt import hyperopt_log, hyperparam_plot, performance_plot

# Fixed settings handed to hyperopt together with the model class.
params = dict(x_features=6, y_features=7, epochs=2000)

# Hyperparameters to search. Each value is a two-element list,
# presumably [lower, upper] bounds -- confirm against hyperopt.
opt_params = dict(n_hidden=[1, 100], dropout=[0.1, 0.9])

# Run the search for the MCDropout model (30 trials).
mcdropout, best_params, best_vals, experiment, exp_model = hyperopt(
    MCDropout, params, opt_params, X, y, trials=30, val_split=0.8
)

# NOTE(review): y_hat from the return_std=True call is overwritten right away by a second
# predict call. If predict is stochastic, y_std no longer belongs to the y_hat used below --
# confirm whether the second call is intentional.
y_hat, y_std = mcdropout.predict(X, return_std=True)
y_hat = mcdropout.predict(X)

# Plot the first two target columns (observed first, predicted second) against columns 4-5 of X.
for targets in (y, y_hat):
    plot_data(X[:, 4:6], targets[:, :2])

In [None]:
# Print the hyperparameter values stored on the returned MCDropout model.
for attr in ("n_hidden", "dropout"):
    print(f"{attr}: {getattr(mcdropout, attr)}")

# Show the search log and plot the two searched hyperparameters against each other.
display(hyperopt_log(experiment))
hyperparam_plot(exp_model, "n_hidden", "dropout")

In [None]:
from performance.performance import prob_overlap, kl, js
# Compare observed targets with the MCDropout predictions via kl (both argument orders) and js.
# Best-effort: a failure is printed instead of raised so the remaining cells can still run.
try:
    p, q = prob_overlap(y, y_hat, bins=5)
    for label, metric, args in (("fKL:", kl, (p, q)), ("rKL:", kl, (q, p)), ("JS:", js, (p, q))):
        print(label, metric(*args))
except Exception as err:
    print(f"Exception: {err}")

# Model: Deep Ensemble

In [None]:
from model.deepensemble import DeepEnsemble
from hyperopt.hyperopt import hyperopt
from hyperopt.hyperopt import hyperopt_log, hyperparam_plot, performance_plot

# Fixed settings handed to hyperopt together with the model class.
params = dict(x_features=6, y_features=7, epochs=2000)

# Hyperparameters to search. Each value is a two-element list,
# presumably [lower, upper] bounds -- confirm against hyperopt.
opt_params = dict(n_hidden=[1, 100], dropout=[0.1, 0.5])

# Run the search for the DeepEnsemble model (30 trials).
deepensemble, best_params, best_vals, experiment, exp_model = hyperopt(
    DeepEnsemble, params, opt_params, X, y, trials=30, val_split=0.8
)

# Predict on the full input matrix, also requesting the second return value (y_std).
y_hat, y_std = deepensemble.predict(X, return_std=True)

# Plot the first two target columns (observed first, predicted second) against columns 4-5 of X.
for targets in (y, y_hat):
    plot_data(X[:, 4:6], targets[:, :2])

In [None]:
# Print the hyperparameter values stored on the returned DeepEnsemble model.
for attr in ("n_hidden", "dropout"):
    print(f"{attr}: {getattr(deepensemble, attr)}")

# Show the search log and plot the two searched hyperparameters against each other.
display(hyperopt_log(experiment))
hyperparam_plot(exp_model, "n_hidden", "dropout")

In [None]:
from performance.performance import prob_overlap, kl, js
# Compare observed targets with the DeepEnsemble predictions via kl (both argument orders) and js.
# Best-effort: a failure is printed instead of raised so the remaining cells can still run.
try:
    p, q = prob_overlap(y, y_hat, bins=5)
    for label, metric, args in (("fKL:", kl, (p, q)), ("rKL:", kl, (q, p)), ("JS:", js, (p, q))):
        print(label, metric(*args))
except Exception as err:
    print(f"Exception: {err}")