In [None]:
%matplotlib inline

In [None]:
# Notebook parameters.
# `namespace` is NOT assigned in this cell: it is expected to be supplied from
# outside as a notebook parameter before the cells below run. Set it for
# namespaced execution, otherwise leave it empty.
# namespace = ""
# Glob pattern pointing at the train-model pipeline parameters file; it is
# handed to the config loader further down.
conf_params = "**/data_science/parameters_train_model.yml"

In [None]:
# Catalog entries.
# Every dataset name except the tag dictionary is prefixed with the run's
# `namespace`, which must already be defined when this cell runs (it is not
# assigned anywhere in this notebook's code).
# NOTE(review): with an empty namespace these names start with a "." --
# confirm the catalog really contains such entries.
conf_test_set_metrics = f"{namespace}.test_set_metrics"
conf_train_model = f"{namespace}.train_model"
conf_train_set = f"{namespace}.train_set"
# The tag dictionary entry carries no namespace prefix, so a plain string is
# used instead of an f-string without placeholders.
conf_td = "td"
conf_test_set_predictions = f"{namespace}.test_set_predictions"
conf_feature_importances = f"{namespace}.train_set_feature_importance"

In [None]:
import logging
from datetime import datetime

import kedro

import matplotlib.pyplot as plt
import pandas as pd
import shap

logger = logging.getLogger(__name__)

# Shared matplotlib defaults for the whole report: one font size for every text
# element and a wide default figure.
SMALL_SIZE = 12
plt.rcParams.update(
    {
        "font.size": SMALL_SIZE,
        "axes.titlesize": SMALL_SIZE,
        "xtick.labelsize": SMALL_SIZE,
        "ytick.labelsize": SMALL_SIZE,
        "legend.fontsize": SMALL_SIZE,
        "figure.figsize": [18, 8],
    }
)

In [None]:
# silence warnings
import warnings

# Blanket filter: with no message, category or module restriction this
# suppresses every warning for the rest of the session, including deprecation
# notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# load kedro context
from project_clisham.optimus_core.reporting_html.utils import load_context, mprint
from project_clisham.optimus_core.model_helpers import shap as opt_shap

# Raise the "kedro" logger threshold before the context is loaded.
logging.getLogger("kedro").setLevel(logging.WARNING)

context = load_context()
# Project parameters; indexed below by namespace and by "timestamp_col_name".
parameters = context.params
# NOTE(review): `io` is not referenced again in the visible cells and shadows
# the name of the standard-library `io` module -- confirm it is still needed.
io = context.io

In [None]:
# silence logging
# Only WARNING and above get through for these chatty loggers.
for noisy_logger_name in ("kedro.io", "kedro.config", "kedro.pipeline", "numexpr.utils"):
    logging.getLogger(noisy_logger_name).setLevel(logging.WARNING)

In [None]:
# load config
# Configuration matched by the `conf_params` glob pattern.
conf_train_model_params = context.config_loader.get(conf_params)

# load data
# Everything the report needs is read from the kedro catalog by entry name.
test_prediction_metrics_df = context.catalog.load(conf_test_set_metrics)
model = context.catalog.load(conf_train_model)
train_dataset = context.catalog.load(conf_train_set)
td = context.catalog.load(conf_td)
test_predictions_df = context.catalog.load(conf_test_set_predictions)
feature_importances = context.catalog.load(conf_feature_importances)
# Name the loaded importances and relabel every index entry as
# "<tag> - <td.name(tag)>"; the plotting cells split on " - " to recover the tag.
# NOTE(review): assumes feature_importances is a pandas Series indexed by tag
# name (strings) -- confirm against the catalog entry.
feature_importances.name = 'feature_importance'
feature_importances.index = [tag + " - " + str(td.name(tag)) for tag in feature_importances.index]

# Model feature/target settings are looked up under this run's namespace.
column_features = parameters[namespace]['model_feature']
name_target = parameters[namespace]['model_target']

# Name of the timestamp column; used as the index of the timeline plots.
time_col = parameters['timestamp_col_name']

In [None]:
# get columns from TagDict
# NOTE(review): presumably the tags whose "tag_type" is "control"; their tick
# labels are drawn in bold in the feature-importance plots -- confirm against
# TagDict.select.
control_cols = td.select("tag_type", "control")
# NOTE(review): presumably the tags flagged in the TagDict column named by
# `column_features`; used as the model's feature columns for SHAP -- confirm.
feat_cols = td.select(column_features)
# First tag returned for the configured target name.
target_col = td.select('target', name_target)[0]

# Model Performance Report

In [None]:
# Report header: generation time and the kedro environment in use.
# `astimezone()` attaches the local timezone to the timestamp. A naive
# `datetime.now()` renders `%z` as an empty string, which silently dropped the
# UTC offset from the header.
report_timestamp = (
    f"#### generated {datetime.now().astimezone().strftime('%b-%d-%Y %H:%M:%S %z')} "
    f"with environment `{context.env}`"
)
mprint(report_timestamp)

In [None]:
# Regressor class as configured under train_model -> regressor -> class; an
# empty string is shown when any of those levels is missing.
regressor_conf = conf_train_model_params.get('train_model', {}).get('regressor', {})
regressor_type = f"### Regressor used for Model Run:\n{regressor_conf.get('class', '')}"
mprint(regressor_type)
mprint("### Details:")
# Plain-text representation of the loaded model object.
print(model)

## Test Dataset Performance Metrics for Best Parameters

In [None]:
# Blank out the header of the metrics column for display. The renamed frame is
# bound back to the same name instead of using `inplace=True`, so the object
# handed out by the catalog is not mutated.
test_prediction_metrics_df = test_prediction_metrics_df.rename(
    columns={"test_perf_metrics": " "}
)
test_prediction_metrics_df

## Feature Importance Plot

In [None]:
# Horizontal bar chart of every feature importance, sorted ascending.
fig, ax = plt.subplots(figsize=(10, 15))
importances_ascending = feature_importances.sort_values(ascending=True)
importances_ascending.plot.barh(ax=ax, title="Feature Importance")

# Write each bar's value (two decimals) just past the end of the bar.
for bar in ax.patches:
    bar_length = bar.get_width()
    ax.annotate(f"{bar_length:.2f}", (bar_length * 1.005, bar.get_y() * 1.005))

# Bold the tick labels that belong to control tags; the tag is the part of the
# label text before the first " - ".
for tick_label in ax.get_yticklabels():
    tag = tick_label.get_text().partition(" - ")[0]
    if tag in control_cols:
        tick_label.set_fontweight("bold")

In [None]:
# Same chart restricted to the 20 largest importances.
fig, ax = plt.subplots(figsize=(10, 8))
top_importances = feature_importances.sort_values(ascending=True).tail(20)
top_importances.plot.barh(ax=ax, title="TOP Feature Importance")

# Write each bar's value (two decimals) just past the end of the bar.
for bar in ax.patches:
    bar_length = bar.get_width()
    ax.annotate(f"{bar_length:.2f}", (bar_length * 1.005, bar.get_y() * 1.005))

# Bold the tick labels that belong to control tags; the tag is the part of the
# label text before the first " - ".
for tick_label in ax.get_yticklabels():
    tag = tick_label.get_text().partition(" - ")[0]
    if tag in control_cols:
        tick_label.set_fontweight("bold")

## Test Dataset Actual vs. Predicted Plot

In [None]:
# Actual vs. predicted scatter on identical axis ranges, with a y = x
# reference line.
y_preds = test_predictions_df["prediction"].rename("Predicted")
y_true = test_predictions_df[target_col].rename("Actual")
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
pred_target: pd.DataFrame = pd.concat([y_true, y_preds], axis=1)
scatter_ax = pred_target.plot.scatter(
    x="Actual", y="Predicted", figsize=(6, 6), title="Actual vs. Predicted"
)
xmin_lim, xmax_lim = scatter_ax.get_xlim()
# Pin BOTH axes to the same range. Fixing only the y axis leaves x autoscaling
# enabled, so adding the reference line could widen the x range again.
scatter_ax.set_xlim(xmin_lim, xmax_lim)
scatter_ax.set_ylim(xmin_lim, xmax_lim)
# Identity line: points on it are perfectly predicted.
scatter_ax.plot(
    (xmin_lim, xmax_lim),
    (xmin_lim, xmax_lim),
    color="0.8",
    linestyle="--",
    linewidth=0.75,
)
plt.show()

## Test Dataset Actual vs. Residual Plot

In [None]:
# Residuals (actual minus predicted) against the actual value.
# Relies on `y_true` and `y_preds` from the actual-vs-predicted cell.
residuals: pd.Series = (y_true - y_preds).rename("Residual")
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
res_df: pd.DataFrame = pd.concat([y_true, residuals], axis=1)
# Symmetric y range around zero with 5% headroom over the largest residual.
axis_val = max(abs(residuals.min()), abs(residuals.max())) * 1.05
residuals_ax = res_df.plot.scatter(
    x="Actual", y="Residual", figsize=(10, 3), title="Actual vs. Residuals"
)
residuals_ax.set_ylim(-1 * axis_val, axis_val)
# Zero-error reference line.
residuals_ax.axhline(y=0, color="r", linestyle="-", linewidth=2)
plt.show()

## Train and Test Timelines

In [None]:
# Training-set timeline: the target next to the model's prediction for the same
# rows, indexed by the parsed timestamp column.
train_pred = (
    train_dataset[[time_col, target_col]]
    .assign(
        **{
            time_col: pd.to_datetime(train_dataset[time_col]),
            "prediction": model.predict(train_dataset),
        }
    )
    .set_index(time_col)
)

fig, ax = plt.subplots(figsize=(18, 8))
train_pred.plot(ax=ax)
ax.set_title("Train");

In [None]:
# Test-set timeline: the target next to the stored prediction, indexed by the
# parsed timestamp column.
test_pred = (
    test_predictions_df[[time_col, target_col, "prediction"]]
    .assign(**{time_col: pd.to_datetime(test_predictions_df[time_col])})
    .set_index(time_col)
)

fig, ax = plt.subplots(figsize=(18, 8))
test_pred.plot(ax=ax)
ax.set_title("Test");

## Shap Summary Plot

In [None]:
# number of shap values to calculate
# Capped at 200 rows of the test predictions; the sample uses a fixed seed so
# reruns pick the same rows.
n_shap = min(200, len(test_predictions_df))
shap_test_df = test_predictions_df.sample(n=n_shap, random_state=0)

# number of trees to take into account for SHAP calculation
tree_limit = 250

# SHAP values are computed for the pipeline's "regressor" step only.
# NOTE(review): the feature columns are passed as loaded -- presumably any
# earlier pipeline steps leave these columns unchanged; confirm.
selected_model = model.named_steps["regressor"]
# NOTE(review): positional arguments look like (model, background data, rows to
# explain, explainer class) -- confirm against calculate_shap_values.
shap_result = opt_shap.calculate_shap_values(
    selected_model,
    train_dataset[feat_cols],
    shap_test_df[feat_cols],
    shap.TreeExplainer,
    tree_limit=tree_limit,
)

In [None]:
# Summary ("dot") plot of the SHAP values computed above, with every feature
# labelled "<tag> - <td.name(tag)>".
shap_values = shap_result.shap_values
X = shap_result.raw_features
feature_names_new = [tag + " - " + str(td.name(tag)) for tag in X.columns]
# NOTE(review): assumes shap_values is a DataFrame whose columns line up with
# X.columns (it is converted with .to_numpy()) -- confirm.
shap.summary_plot(shap_values.to_numpy(), X, feature_names=feature_names_new, plot_type="dot")

## Shap results for Feature Columns in Dataset

In [None]:
def plot_single_dependence(col, shap_result):
    """Draw the SHAP dependence plot for one feature and return its figure.

    Args:
        col: Feature to plot, either a column label of
            ``shap_result.shap_values`` or an integer column position.
        shap_result: Object exposing ``shap_values`` (a DataFrame of SHAP
            values) and ``raw_features`` (the feature values passed to the
            plot alongside them).

    Returns:
        The matplotlib figure the dependence plot was drawn on. It is left
        open so the caller can render it with ``plt.show()``.

    Raises:
        ValueError: If ``col`` is a label that matches more than one column.
    """
    shap_values = shap_result.shap_values

    if isinstance(col, str):
        col_int = shap_values.columns.get_loc(col)
        # For a repeated label get_loc returns a slice or mask, not an int.
        if not isinstance(col_int, int):
            msg = "Duplicate column found in shap values? Col was: {}".format(col)
            raise ValueError(msg)
    else:
        col_int = col

    # shap opens a figure of its own when no axis is supplied, so a figure
    # created beforehand would stay empty. The figure is therefore picked up
    # AFTER the call, which guarantees the returned object holds the plot.
    shap.dependence_plot(
        col_int, shap_values.values, shap_result.raw_features, color="C0", show=False
    )
    fig = plt.gcf()
    plt.xlabel(str(col))
    plt.ylabel("SHAP value for {}".format(col))
    return fig

In [None]:
# One SHAP dependence plot per model feature, rendered as soon as it is built.
for feature in feat_cols:
    # The returned figure is kept in `plot` but not used further here;
    # plt.show() renders whatever figure is currently open.
    plot = plot_single_dependence(feature, shap_result)
    plt.show()