## Generate Datasets

_Last Updated_: 2021/06/20

### Authors
* Kevin Chu (kevin@velexi.com)

### Overview
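This Jupyter notebook generates a synthetic tomato attribute and sales dataset, fits a classifier that predicts `was_sold` from the generated attributes, and saves the fitted model and the train/test splits to disk.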

### Data Fields

#### Tomato Attributes
* color
    * R-G value (floating point values between -1.0 and 1.0)
    * Y-B value (floating point values between -1.0 and 1.0)    
* is_heirloom: bool
* diameter: centimeters
* weight: grams
* number_of_blemishes: count (non-negative integer)

#### Sales Attributes
* zipcode
* sale_date: POSIX timestamp (seconds)
* number_of_competitor_stands

#### Outcomes
* was_sold: bool
* was_returned: bool
* customer_rating: 1 (worst), 2, 3, 4, 5 (best)

### Data Model
* TODO

### User Parameters

#### Dataset parameters
* num_records: number of records to generate

#### Data model parameters
* model_params: dictionary mapping each `customer_rating` (1 to 5) to the parameters used for records with that rating: the `P_sold` and `P_returned` probabilities and the tomato and sales attribute distribution parameters

### Preparations 

In [1]:
# --- User Parameters

# Imports
import datetime

# Dataset parameters
num_records = 1000


# Data model parameters
def _rating_params(
    *,
    P_sold,
    P_returned,
    RG_mu,
    diameter_mu,
    weight_mu,
    weight_sigma,
    blemishes_avg,
    stands_min,
    stands_max,
    RG_sigma=0.1,
    YB_mu=0.5,
    YB_sigma=0.5,
    diameter_sigma=2,
    zipcodes=("94301", "94302", "94303", "94304", "94305", "94306", "94309"),
    sale_date=None,
):
    """Build the parameter dictionary for a single customer rating.

    The required keyword arguments are the values that differ between
    ratings; the defaulted ones are the values every rating currently shares.

    Returns a new dictionary (with its own ``zipcodes`` list) whose keys are
    the parameter names looked up by the dataset generation cells.
    """
    if sale_date is None:
        # All ratings share the same sale date.
        sale_date = datetime.datetime(2021, 6, 1).timestamp()

    return {
        "P_sold": P_sold,
        "P_returned": P_returned,
        "RG_mu": RG_mu,
        "RG_sigma": RG_sigma,
        "YB_mu": YB_mu,
        "YB_sigma": YB_sigma,
        "diameter_mu": diameter_mu,
        "diameter_sigma": diameter_sigma,
        "weight_mu": weight_mu,
        "weight_sigma": weight_sigma,
        "number_of_blemishes_avg": blemishes_avg,
        "zipcodes": list(zipcodes),
        "sale_date": sale_date,
        "number_of_competitor_stands_min": stands_min,
        "number_of_competitor_stands_max": stands_max,
    }


# Keyed by customer rating, listed from best (5) to worst (1).
model_params = {
    5: _rating_params(
        P_sold=0.9, P_returned=0.1, RG_mu=0.9,
        diameter_mu=9, weight_mu=400, weight_sigma=50,
        blemishes_avg=1, stands_min=1, stands_max=4,
    ),
    4: _rating_params(
        P_sold=0.8, P_returned=0.2, RG_mu=0.7,
        diameter_mu=8, weight_mu=350, weight_sigma=50,
        blemishes_avg=1, stands_min=1, stands_max=5,
    ),
    3: _rating_params(
        P_sold=0.7, P_returned=0.2, RG_mu=0.4,
        diameter_mu=7, weight_mu=300, weight_sigma=50,
        blemishes_avg=2, stands_min=2, stands_max=6,
    ),
    2: _rating_params(
        P_sold=0.6, P_returned=0.4, RG_mu=0.2,
        diameter_mu=7, weight_mu=300, weight_sigma=100,
        blemishes_avg=3, stands_min=2, stands_max=7,
    ),
    1: _rating_params(
        P_sold=0.4, P_returned=0.6, RG_mu=0.0,
        diameter_mu=5, weight_mu=250, weight_sigma=100,
        blemishes_avg=4, stands_min=4, stands_max=10,
    ),
}

In [2]:
# --- Imports

# Standard library
import math
import os
import pickle

# External packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import scipy.stats

### Generate Dataset

In [3]:
# --- Preparations

# Seed NumPy's global random state so that "Restart Kernel -> Run All"
# regenerates the same records. Per their documentation, scipy.stats
# sampling and scikit-learn estimators/splitters fall back to this global
# state when no explicit random_state is passed, so one seed here covers
# the later cells as well.
random_seed = 20210620
np.random.seed(random_seed)

# Initialize dataset: one column per outcome, tomato attribute, and sales
# attribute. Everything except customer_rating is filled in by later cells.
columns = ["customer_rating", "was_sold", "was_returned",
           "R-G", "Y-B", "diameter", "weight", "number_of_blemishes",
           "zipcode", "sale_date", "number_of_competitor_stands"]
dataset = pd.DataFrame(columns=columns)

# --- Generate empty records containing only customer rating

# customer_rating: drawn uniformly from 1..5. Assigning the array to the
# empty frame also creates the num_records rows.
dataset["customer_rating"] = np.random.choice([1, 2, 3, 4, 5], num_records)

In [4]:
# --- Generate attributes for each record

# Truncation bounds (lower, upper) for the truncated normal draws.
color_bounds = (-1, 1)       # R-G and Y-B values
diameter_bounds = (0, 20)    # centimeters
# The upper bound used to be 20, copied from the diameter bound. With weight
# means of 250-400 g that confined every sampled weight to [0, 20]. 1000 g is
# a generous cap well above every configured mean; adjust if needed.
weight_bounds = (0, 1000)    # grams


def _draw_truncated_normal(mu, sigma, lower, upper):
    """Draw one sample from a normal(mu, sigma) truncated to [lower, upper].

    scipy.stats.truncnorm expects its clip points in standard-deviation units
    relative to the mean, hence the rescaling of `lower` and `upper`.
    """
    a = (lower - mu) / sigma
    b = (upper - mu) / sigma
    return scipy.stats.truncnorm.rvs(a, b, loc=mu, scale=sigma)


for index, record in dataset.iterrows():
    # Look up the parameters for this record's customer rating
    params = model_params.get(record["customer_rating"])
    if params is None:
        # No parameters for this rating: leave the record's attributes empty
        continue

    # Tomato attributes
    record["R-G"] = _draw_truncated_normal(
        params["RG_mu"], params["RG_sigma"], *color_bounds)
    record["Y-B"] = _draw_truncated_normal(
        params["YB_mu"], params["YB_sigma"], *color_bounds)
    record["diameter"] = _draw_truncated_normal(
        params["diameter_mu"], params["diameter_sigma"], *diameter_bounds)
    record["weight"] = _draw_truncated_normal(
        params["weight_mu"], params["weight_sigma"], *weight_bounds)
    record["number_of_blemishes"] = np.random.poisson(params["number_of_blemishes_avg"])

    # Sales attributes
    record["zipcode"] = np.random.choice(params["zipcodes"])
    record["sale_date"] = params["sale_date"]
    # Uniform over the inclusive range [min, max]
    record["number_of_competitor_stands"] = \
        np.random.choice(range(params["number_of_competitor_stands_min"],
                               params["number_of_competitor_stands_max"]+1))

    # Update record in dataset. iterrows() yields a copy of the row together
    # with its index *label*, so write it back by label.
    dataset.loc[index] = record

In [5]:
# --- Generate sales outcomes

# For each customer rating, draw was_sold / was_returned as Bernoulli
# outcomes with that rating's P_sold / P_returned.
# NOTE(review): was_returned is drawn independently of was_sold, so a record
# can end up returned without having been sold -- confirm this is intended.
for rating, rating_params in model_params.items():
    has_rating = dataset["customer_rating"] == rating
    # Counting via the mask avoids a KeyError when no record has this rating,
    # and a dedicated name keeps the user parameter `num_records` intact.
    num_rated = int(has_rating.sum())

    # A uniform draw on [0, 1) is below P with probability P, so `<` yields
    # True with the configured probability. The previous `>` comparison
    # produced True with probability 1 - P.
    dataset.loc[has_rating, "was_sold"] = \
        np.random.random_sample(num_rated) < rating_params["P_sold"]
    dataset.loc[has_rating, "was_returned"] = \
        np.random.random_sample(num_rated) < rating_params["P_returned"]

### Build model

In [6]:
# --- Prepare data

# Columns fed to the model and the single column it predicts
input_features = ["R-G", "Y-B", "diameter", "weight", "number_of_blemishes",
                  "zipcode", "sale_date", "number_of_competitor_stands"]
output = ["was_sold"]


def _features_and_labels(frame):
    """Return (X, y) arrays for `frame`.

    X holds the input feature columns; y is the output column cast to bool
    and flattened to one dimension.
    """
    features = frame[input_features].to_numpy()
    labels = frame[output].astype("bool").to_numpy().ravel()
    return features, labels


# Split data (default train_test_split proportions)
training_data, testing_data = train_test_split(dataset)

# Training arrays, then testing arrays
X_train, Y_train = _features_and_labels(training_data)
X_test, Y_test = _features_and_labels(testing_data)

In [7]:
# --- Build model

# Fit a random forest classifier (default hyperparameters) on the training
# split. The LogisticRegression instance that used to be created here was
# overwritten before it was ever used, so it has been dropped.
model = RandomForestClassifier()
model.fit(X_train, Y_train)

RandomForestClassifier()

In [8]:
# model.predict_proba(X_test)

In [9]:
# --- Save model as pickle file

model_filename = os.path.join("assets", "model.pkl")

# Make sure the target directory exists before writing into it.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

# Write to model_filename. The file used to be opened as "model.pkl", which
# put the pickle in the working directory rather than under assets/.
with open(model_filename, "wb") as file_:
    pickle.dump(model, file_)

In [10]:
# --- Test pickle file loading

# Read from the same path variable the save cell defines, rather than a
# second hard-coded copy of it.
# Only unpickle files produced by this notebook: pickle.load can execute
# arbitrary code from an untrusted file.
with open(model_filename, "rb") as file_:
    m = pickle.load(file_)

# Sanity check that the round trip produced the same kind of estimator.
assert isinstance(m, type(model)), \
    f"expected {type(model).__name__}, loaded {type(m).__name__}"

# m.predict_proba(X_test)

In [11]:
# --- Save datasets

data_dir = "data"

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists first.
os.makedirs(data_dir, exist_ok=True)

# Write the train/test splits without the DataFrame index column.
training_data.to_csv(os.path.join(data_dir, "train_data.csv"), index=False)
testing_data.to_csv(os.path.join(data_dir, "test_data.csv"), index=False)