In [None]:
!pip install -q eccd_datasets category_encoders shap pygradus

In [None]:
# Identification fields that are sent along with the answers in the final
# `check_solution` submission. Replace the placeholder name with your own.
STUDENT_NAME = "Nombre Apellido"
COURSE_NAME = "eccd-oct23"
EXERCISE_NAME = "price-recommendation"

# Objective

Explore what a pricing automation / recommendation project looks like.

In a pricing recommendation problem, the most accurate prediction is often not necessarily the most important goal.
Indeed, sometimes offering a range of possible values or an explanation on how a certain variable affects the outcomes can be more useful for an end-user.

We will use a very basic data cleaning of a popular dataset before proceeding.

In [None]:
import pandas as pd
import numpy as np

from eccd_datasets import load_mercari
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import shap

from pygradus import create_exercise, check_solution

In [None]:
df = load_mercari()

In [None]:
df.head()

# Data Cleaning

In this exercise we are going to ignore both the `name` and `item_description` columns.

For the `category_name` feature, we are going to split it in three.

Then, we are going to use a categorical encoder to encode all string attributes into numbers.

In [None]:
def split_cat(text):
    """
    Split a "/"-separated category path into exactly three levels.

    Parameters
    ----------
    text : the category string to split. Any value without a ``split``
        method (for example ``None`` or a float NaN) is treated as missing.

    Returns
    -------
    A 3-tuple of strings. Missing input yields three "No Label" entries,
    paths with fewer than three levels are padded with "No Label", and
    levels beyond the third are dropped.
    """
    missing = "No Label"
    try:
        parts = text.split("/")
    except AttributeError:
        # Only "this value is not a string" is an expected failure here;
        # anything else should surface instead of being silently swallowed.
        return (missing, missing, missing)

    # The result is unpacked into three columns, so its length must not
    # depend on how many "/" the input happens to contain.
    parts = parts[:3]
    return tuple(parts + [missing] * (3 - len(parts)))

In [None]:
# Each row yields a sequence of category levels; zip(*...) transposes those
# per-row sequences into one sequence per level, which become three new columns.
df["cat_1"], df["cat_2"], df["cat_3"] = zip(*df["category_name"].apply(split_cat))

In [None]:
df.head()

In [None]:
# Keep only the modelling features plus the `price` target; every column not
# listed here (e.g. the raw `category_name`) is discarded.
df = df[[
    "item_condition_id", "brand_name", "shipping", "cat_1", "cat_2", "cat_3", "price"
]]

In [None]:
df.head()

# Data preparation

As always, we divide our dataset into training and test datasets.

We fix the `random_state` to make sure that our experiment is reproducible!

In [None]:
# `pop` removes the `price` column from `df` in place and returns it, so after
# this line `df` holds features only.
y = df.pop("price")
# Work on a copy so later changes to X do not touch `df`.
X = df.copy()

In [None]:
# Shuffled split with a fixed `random_state` for reproducibility. No
# `test_size` is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, shuffle=True)

In [None]:
X_train.head()

## Target Encoder

We will proceed to build a target encoder for the columns that still have strings in them

In [None]:
def build_target_encoder(X: pd.DataFrame, y: pd.Series) -> TargetEncoder:
    """
    Fit a target encoder on the string-valued columns and return it.

    Train a target encoder on columns "brand_name", "cat_1", "cat_2", "cat_3"
    using the train dataset and return the "fitted" encoder.

    Parameters
    ----------
    X : training features; must contain the four columns listed above.
    y : training target (the prices), aligned row by row with ``X``.

    Returns
    -------
    The fitted ``TargetEncoder``. Callers use its ``transform`` method on
    both the training and the test features.
    """
    # Write your code here


In [None]:
# Fit the encoder on the training split only, then encode the first training
# row so that the result can be sanity-checked.
te = build_target_encoder(X_train, y_train)

row1 = X_train.iloc[:1]
row1_t = te.transform(row1)

In [None]:
# Sanity check: the encoded `cat_1` of the first training row should match the
# mean training price of all rows sharing that `cat_1` value.
# NOTE(review): this assumes the encoder's output is (close to) the plain
# per-category mean - confirm it holds with the encoder's smoothing settings.
assert np.allclose(row1_t["cat_1"], y_train.loc[X_train["cat_1"] == row1["cat_1"].iloc[0]].mean())

In [None]:
# Encode the first test row with the fitted encoder; its `cat_2` value is the
# answer submitted for the "target-encoder" task.
row2 = X_test.iloc[:1]
row2_t = te.transform(row2)
answer_target_encoder = row2_t["cat_2"].values[0]
print("cat_2 target encoder", answer_target_encoder)

# Training

For training we are going to use a very popular machine learning model named `LightGBM` from Microsoft.

One of the advantages of this model is that it includes a `quantile loss` that we can use to obtain intervals.

We will train the model three times, once for each quantile: `10%, 50% (the median) and 90%`.


In [None]:
# Encode both splits once up front: the fitted encoder does not change between
# quantiles, so there is no reason to re-transform the test set per model.
X_train_t = te.transform(X_train)
X_test_t = te.transform(X_test)

# Shared LightGBM settings; only the target quantile (`alpha`) varies per model.
params = {
    'objective': 'quantile',
    'metric': 'quantile',
    'max_depth': 4,
    'num_leaves': 15,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'boosting_type': 'gbdt',
    'seed': 42,
    'num_threads': 1
}

quantiles = [.1, .5, .9]

# One array of test-set predictions per quantile, in the order of `quantiles`.
preds = []

for quantile in quantiles:

    # With the quantile objective, `alpha` selects which quantile is fitted.
    reg = lgb.LGBMRegressor(alpha=quantile, **params)

    model = reg.fit(X_train_t, y_train)

    y_pred = model.predict(X_test_t)

    preds.append(y_pred)

# NOTE: after the loop, `model` is the regressor fitted for the last entry of
# `quantiles`; later cells that use `model` therefore see that one.

Here we process the three predictions, one for each model, and use them to build the corresponding intervals.

In [None]:
# Place the per-quantile prediction arrays side by side (one column per
# quantile, in training order) and attach the true test prices next to them.
df_preds = pd.DataFrame(np.column_stack(preds), columns=["q10", "q50", "q90"])
df_preds["y_test"] = y_test.values

In [None]:
df_preds.head()

In [None]:
def get_result_within_interval(df_preds: pd.DataFrame) -> int:
    """
    Count the rows whose true value lies inside the predicted interval.

    Implement a function that counts for how many rows it holds that the
    true value y satisfies q10 <= y <= q90 (both bounds included).

    For example, if in a row the real value of the target variable is 10,
    q10 is 5 and q90 is 15, that row counts.
    If in a different row, the target variable is 20, q10 is 5 and q90 is 15,
    that row does not count.

    Parameters
    ----------
    df_preds : frame with the predicted bounds in columns "q10" and "q90"
        and the true target value in column "y_test".

    Returns
    -------
    The number of rows that fall inside their interval.
    """
    # Write your code here


In [None]:
# This count is the answer submitted for the "results-within-interval" task.
answer_interval =  get_result_within_interval(df_preds)
print("Results within interval", answer_interval)

# Shapley value

Finally, we can use the shapley value to obtain an explanation of how each feature contributes to the final prediction.

In a price recommendation problem this information is very helpful to end-users, possibly even more than the actual price.

In [None]:
shap.initjs()

In [None]:
# `model` is the regressor left over from the last iteration of the training
# loop, i.e. the one fitted for the final entry of `quantiles`.
explainer = shap.TreeExplainer(model)
# Per-feature contributions for every encoded test row.
shap_values = explainer.shap_values(X_test_t)

In [None]:
# Overview across all encoded test rows: one summary of SHAP values per feature.
shap.summary_plot(shap_values, X_test_t)

We can also use the Shapley values to explain a single prediction.

In [None]:
# Explain one prediction only: the SHAP values and features of the first test row.
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test_t.iloc[0,:])

In [None]:

# Bundle the identification fields with the answer of each task and hand the
# payload to pygradus' `check_solution`.
proposed_solution = {
    "attempt": {
        "course_name": COURSE_NAME,
        "exercise_name": EXERCISE_NAME,
        "username": STUDENT_NAME,
    },
    "task_attempts": [
        {"name": "target-encoder", "answer": answer_target_encoder},
        {"name": "results-within-interval", "answer": answer_interval},
    ],
}
check_solution(proposed_solution)
