## Reading the libraries, data and trained models

In [9]:
# Libraries

import pandas as pd
import numpy as np

import joblib

import dalex as dx

In [2]:
# Data

# Feature matrices
X_train = pd.read_csv("../data/X_train_data.csv")
X_test = pd.read_csv("../data/X_test_data.csv")

# Targets (read as DataFrames, one file per split)
y_train = pd.read_csv("../data/y_train_data.csv")
y_test = pd.read_csv("../data/y_test_data.csv")

# Report (features, target) shapes per split so a row-count mismatch is visible at a glance
for split_label, split_X, split_y in (
    ("Train set shape:", X_train, y_train),
    ("Test set shape :", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)

Train set shape: (6154, 27) (6154, 1)
Test set shape : (2638, 27) (2638, 1)


In [13]:
# Show the test feature matrix as the cell output to eyeball the columns the models receive
X_test

Unnamed: 0,age_of_series,r_rated,no_of_seasons,sentiment,multiple_platforms,netflix,hulu,prime_video,true_crime,world_war,...,genre_crime,genre_drama,genre_anime,genre_comedy,genre_documentary,genre_reality,genre_fiction,genre_actionadventure,genre_homegarden,genre_standuptalk
0,45,0,40,0.6344,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2,1,2,-0.1531,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,6,1,4,-0.7512,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
3,5,0,1,-0.9403,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,8,0,5,-0.8451,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633,10,0,7,-0.3088,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2634,4,0,1,-0.5267,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2635,10,0,4,0.8779,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2636,13,0,1,0.8360,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [3]:
# Models

# Display label -> path of the serialized model on disk
model_files = {
    'XGB all vars': '../models/shows_xgb_all_vars.joblib',
    'XGB selected vars': '../models/shows_xgb_limited_vars.joblib',
}

# NOTE: joblib.load unpickles the file contents, so these paths must only
# ever point at trusted, locally produced artifacts.
models = dict(zip(model_files, map(joblib.load, model_files.values())))

In [5]:
# Push the test features through each model's own 'preprocess' step and label
# the resulting columns with the feature names that step reports.
preprocess_all = models['XGB all vars'].named_steps['preprocess']
X_test_transformed_all = pd.DataFrame(
    preprocess_all.transform(X_test),
    columns=preprocess_all.get_feature_names_out(),
)

preprocess_limited = models['XGB selected vars'].named_steps['preprocess']
X_test_transformed_limited = pd.DataFrame(
    preprocess_limited.transform(X_test),
    columns=preprocess_limited.get_feature_names_out(),
)

## Residual analysis

In [10]:
# Creating the explainers

# One dalex explainer per loaded model, keyed by the model's display label
dalex_explainers = {}

for name, model in models.items():

    preprocess = model.named_steps['preprocess']

    # Use transform, NOT fit_transform: the preprocessing step arrives already
    # fitted inside the loaded pipeline. Re-fitting it here would overwrite that
    # fitted state with one learned from the test set, so the final estimator
    # could be explained on inputs prepared differently from the ones it was
    # trained on, and the shared `models` objects would be mutated as a side effect.
    X_test_transformed = pd.DataFrame(
        preprocess.transform(X_test),
        columns=preprocess.get_feature_names_out()
    )

    # Explainer on original scale (1-10)
    # The explainer wraps only the final 'model' step, which is why it is given
    # the already-preprocessed frame rather than the raw X_test.
    dalex_explainers[name] = dx.Explainer(
        model.named_steps['model'],
        X_test_transformed,
        y=y_test,
        label=name,
        # flatten() guarantees a 1-D prediction vector for the residual computation
        predict_function=lambda m, x: m.predict(x).flatten()
    )


Preparation of a new explainer is initiated

  -> data              : 2638 rows 27 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 2638 values
  -> model_class       : xgboost.sklearn.XGBRegressor (default)
  -> label             : XGB all vars
  -> predict function  : <function <lambda> at 0x147f1ac10> will be used
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 4.4, mean = 6.93, max = 8.59
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -5.88, mean = 0.0141, max = 3.14
  -> model_info        : package xgboost

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 2638 rows 20 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 2638 val

In [11]:
# Compare the performance of two models

# Pull each explainer's metrics table, then stack them so the models sit on adjacent rows
performance_all = dalex_explainers['XGB all vars'].model_performance().result
performance_selected = dalex_explainers['XGB selected vars'].model_performance().result

pd.concat((performance_all, performance_selected))

Unnamed: 0,mse,rmse,r2,mae,mad
XGB all vars,1.143736,1.069456,0.175867,0.793841,0.618788
XGB selected vars,1.146663,1.070823,0.173759,0.79482,0.628344


The errors and the R² are not great, but the data is quite complex: we have fairly scarce information and are trying to extract some insight from it.

In [12]:
# Residual plot
# Diagnostics for both models are drawn on one figure so they can be compared directly
diagnostics_all = dalex_explainers['XGB all vars'].model_diagnostics()
diagnostics_selected = dalex_explainers['XGB selected vars'].model_diagnostics()

diagnostics_all.plot(diagnostics_selected)


We see some problems - a slight trend sloping downwards and quite high variability of the residuals. That shows that the model's predictive accuracy is not that good.

## Permutation importance

In [15]:
# Permutation importance shuffles each variable at random, so the call is seeded:
# without a fixed random_state the bars (and possibly the ranking) change on
# every run and the narrative below may stop matching the figures.
for name, explainer in dalex_explainers.items():
    print(name)
    importance = explainer.model_parts(random_state=42)
    importance.plot(title=f"Permutation Importance – {name}")

XGB all vars


XGB selected vars


Permutation importance helps us identify which variables are most influential in the model. By shuffling a given variable and observing the drop in model performance, we can determine how much the model depends on that variable. In our case, the variables that have the greatest impact appear to be whether the series is a documentary, reality show, or drama, as well as the number of seasons.

In [16]:
# Compute permutation-based variable importance
# Seeded so the table is reproducible across runs instead of depending on a
# fresh random set of permutations each time the cell executes.
vi = dalex_explainers['XGB all vars'].model_parts(random_state=42)

# Sort by importance (drop in performance when permuted)
top_features = vi.result.sort_values(by="dropout_loss", ascending=False)

# Display the 20 rows with the highest dropout loss.
# NOTE(review): the result also carries the `_baseline_` reference row, which
# sorts to the top and is not a model feature; consider filtering it out
# before reading this as a feature ranking.
top_features[["variable", "dropout_loss"]].head(20)

Unnamed: 0,variable,dropout_loss
28,_baseline_,1.312713
27,all_vars__genre_documentary,1.143629
26,all_vars__genre_reality,1.137474
25,all_vars__genre_drama,1.132156
24,all_vars__no_of_seasons,1.115826
23,all_vars__age_of_series,1.109963
22,all_vars__sentiment,1.10569
21,all_vars__genre_anime,1.100851
20,all_vars__r_rated,1.099757
19,all_vars__genre_crime,1.095068


## Partial dependence plots

## Accumulated local effects

## Shapley additive explanations