# Supermarket Sales Dataset

Description:
* 1000 rows, 6 parent and 6 child features

Parent features:
* Branch, City, Customer type, Gender, Product line, Unit price

Child features:
* Quantity, Tax 5%, Total, Date, Time, Payment

Source: https://www.kaggle.com/aungpyaeap/supermarket-sales

In [1]:
import sys
# Extend the module search path so later imports can be resolved from this
# directory.
# NOTE(review): absolute, machine-specific path -- presumably the root of the
# project-local packages imported further down; confirm and make portable.
project_root = "C:/Users/notes/grab"
sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the supermarket sales table; the path is resolved relative to the
# current working directory.
data_path = "../data/supermarket.csv"
sm = pd.read_csv(data_path)

In [4]:
# Select the six "parent" feature columns and preview the first rows.
parent_columns = ['Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Unit price']
parent = sm.loc[:, parent_columns]
parent.head()

Unnamed: 0,Branch,City,Customer type,Gender,Product line,Unit price
0,A,Yangon,Member,Female,Health and beauty,74.69
1,C,Naypyitaw,Normal,Female,Electronic accessories,15.28
2,A,Yangon,Normal,Male,Home and lifestyle,46.33
3,A,Yangon,Member,Male,Health and beauty,58.22
4,A,Yangon,Normal,Male,Sports and travel,86.31


In [5]:
# Select the six "child" feature columns and preview the first rows.
child_columns = ['Quantity', 'Tax 5%', 'Total', 'Date', 'Time', 'Payment']
child = sm.loc[:, child_columns]
child.head()

Unnamed: 0,Quantity,Tax 5%,Total,Date,Time,Payment
0,7,26.1415,548.9715,1/5/2019,13:08,Ewallet
1,5,3.82,80.22,3/8/2019,10:29,Cash
2,7,16.2155,340.5255,3/3/2019,13:23,Credit card
3,8,23.288,489.048,1/27/2019,20:33,Ewallet
4,7,30.2085,634.3785,2/8/2019,10:37,Ewallet


In [6]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical parent columns.
# The encoder is left at its default (sparse) output and densified with
# .toarray() instead of passing `sparse=False`: that keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4 (renamed `sparse_output`), so this form
# runs unchanged on both older and newer releases.
enc = OneHotEncoder()
t_parent = enc.fit_transform(
    parent[['Branch', 'City', 'Customer type', 'Gender', 'Product line']]
).toarray()

# Model input matrix: the one-hot columns followed by the numeric unit price.
X = np.hstack([t_parent, parent[['Unit price']].to_numpy()])

In [7]:
# One-hot encode the payment method.
# Densify with .toarray() rather than `sparse=False`: that keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed `sparse_output`),
# so this form runs on both older and newer releases.
enc = OneHotEncoder()
t_child = enc.fit_transform(child[['Payment']]).toarray()

# Parse each column once, with explicit formats: the previewed rows show dates
# as month/day/year (e.g. 1/27/2019) and times as 24-hour HH:MM (e.g. 13:08).
child_date = pd.to_datetime(child['Date'], format="%m/%d/%Y")
child_time = pd.to_datetime(child['Time'], format="%H:%M")
child_month = child_date.dt.month
child_day = child_date.dt.day
child_hour = child_time.dt.hour
child_dt = pd.concat([child_month, child_day, child_hour], axis=1).values

# Model target matrix: numeric columns, one-hot payment, then month/day/hour.
y = np.concatenate([child[['Quantity', 'Tax 5%', 'Total']], t_child, child_dt], axis=1)

# Modelling

In [None]:
from model.gmm import GMM
from hyperopt.hyperopt import hyperopt
from hyperopt.hyperopt import hyperopt_log, hyperparam_plot, performance_plot
from visualisation.visualisation import (plot_surface,
                                         plot_prob_fixed,
                                         plot_violin,
                                         plot_prob_violin,
                                         plot_binned_violin)

In [None]:
# Fixed model settings.
# The feature counts are read from the encoded arrays instead of being
# hard-coded: after one-hot encoding, X and y each have more columns than the
# raw parent/child feature counts.
# NOTE(review): assumes GMM's "x_features"/"y_features" are the column counts
# of X and y -- confirm against model.gmm.
params = {
    "x_features": X.shape[1],
    "y_features": y.shape[1],
    "epochs": 10000,
}
# Hyper-parameters to tune; presumably [lower, upper] search bounds -- verify
# against hyperopt.hyperopt.
opt_params = {
    'n_components': [1, 200],
    'n_hidden': [1, 200],
}

gmm, best_params, best_vals, experiment, exp_model = hyperopt(
    GMM,
    params,
    opt_params,
    X, y,
    trials=30,
    val_split=0.8)

y_hat = gmm.predict(X)
# NOTE(review): plot_data is not imported by any visible cell of this notebook,
# so this call raises NameError unless it is defined elsewhere -- import it or
# replace it with the intended plotting helper.
plot_data(X, y_hat)

In [None]:
# Report the hyper-parameters stored on the returned model, one per line,
# then plot the two tuned hyper-parameters from the experiment model.
summary = (
    f"n_components: {gmm.n_components}",
    f"n_hidden: {gmm.n_hidden}",
)
print(*summary, sep="\n")
hyperparam_plot(exp_model, "n_components", "n_hidden")

In [None]:
# Compare the actual and fitted densities of y across bins of X:
#   * actual: bin X and draw a violin plot of the y values in each bin;
#   * fitted: feed the mean of each X bin to the model and draw a violin plot
#     of the y values it samples.
n_bins = 5
plot_binned_violin(gmm, X, y, bins=n_bins)

In [None]:
from performance.performance import prob_overlap, kl, js

# Build the two binned distributions from y and y_hat, then print the
# forward KL, reverse KL and JS values. Each metric is computed immediately
# before it is printed, in the order listed.
p, q = prob_overlap(y, y_hat, bins=10)
for label, metric, operands in (
    ("fKL:", kl, (p, q)),
    ("rKL:", kl, (q, p)),
    ("JS:", js, (p, q)),
):
    print(label, metric(*operands))