# Homework 4

## Imports

In [None]:
!pip install pyCeterisParibus

In [1]:
from ceteris_paribus.explainer import explain
from ceteris_paribus.plots.plots import plot, plot_notebook
from ceteris_paribus.profiles import individual_variable_profile
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

## Data and model preparation

In [2]:
# Load the wine dataset and place the features and the class label side by
# side in one frame. A variant that standardised the features with
# StandardScaler was tried and is disabled, so the raw feature values are used.
wine = load_wine()
data = pd.DataFrame(
    np.c_[wine['data'], wine['target']],
    columns=wine['feature_names'] + ['target'],
)

# Every column except the label is a feature.
features = [column for column in data.columns if column != 'target']

# Rebuild x and y from the underlying arrays so both are plain DataFrames.
x = pd.DataFrame(data[features].values, columns=features)
y = pd.DataFrame(data[['target']].values, columns=['target'])

# Fixed seed keeps the train/test partition reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [3]:
# Multi-layer perceptron: fixed seed, iteration cap raised to 1000.
mlpc_model = MLPClassifier(random_state=1, max_iter=1000)
# ravel() turns the single-column label frame into the 1-D array fit() is given.
mlpc_model.fit(x_train, y_train.values.ravel())

# Mean accuracy on the held-out split.
score = mlpc_model.score(x_test, y_test)
print(f'Score: {score}')

Score: 0.9830508474576272




In [4]:
# Gradient boosting with 100 depth-1 trees (stumps), learning rate 1.0, fixed seed.
gbc_settings = dict(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gbc_model = GradientBoostingClassifier(**gbc_settings)
# ravel() turns the single-column label frame into the 1-D array fit() is given.
gbc_model.fit(x_train, y_train.values.ravel())

# Mean accuracy on the held-out split.
score = gbc_model.score(x_test, y_test)
print(f'Score: {score}')

Score: 0.9830508474576272


Both models have **exactly** the same accuracy. Let's check if they fail on the same inputs.

In [5]:
# Collect, for every model, the positional test-set indices it misclassifies.
# The whole test set is predicted in one call per model instead of row by row,
# and the result list is sized from `models` rather than a hard-coded
# two-slot literal, so adding a model needs no further change here.
models = [mlpc_model, gbc_model]
true_labels = y_test.iloc[:, 0].to_numpy()

fail_idxs = [
    # flatnonzero yields the positions where prediction and label disagree;
    # tolist() converts them to plain Python ints for clean printing.
    np.flatnonzero(model.predict(x_test) != true_labels).tolist()
    for model in models
]

print(f'First model failed on: {fail_idxs[0]}')
print(f'Second model failed on: {fail_idxs[1]}')

First model failed on: [10]
Second model failed on: [43]


## CP profiles analysis

### Utilities

In [6]:
def show_prediction(model, test_idx):
    """Print one test observation together with the model's verdict on it.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    test_idx : int
        Positional row index into the notebook-level ``x_test`` / ``y_test``.

    Prints, in order: the observation as a markdown table, the predicted
    class, each class probability as a percentage rounded to 5 decimals,
    and the ground-truth class. Returns nothing.
    """
    # Double brackets keep the row as a one-row DataFrame rather than a Series.
    row = x_test.iloc[[test_idx]]
    print(row.to_markdown(), end='\n\n')

    predicted = model.predict(row)
    print(f'Prediction: {int(predicted[0])}', end='\n\n')

    class_probabilities = model.predict_proba(row)[0]
    print('Probabilities:')
    for class_idx, probability in enumerate(class_probabilities):
        percent = round(probability * 100, 5)
        print(f'    Class {class_idx} probability: {percent}%')
    print()

    actual = int(y_test.iloc[test_idx, 0])
    print(f'Ground truth: {actual}', end='\n\n')

In [7]:
def gbc_predict(x, class_idx=1):
    """Return the gradient boosting model's probability of one class per row.

    Parameters
    ----------
    x
        Observations accepted by ``gbc_model.predict_proba``.
    class_idx : int, default 1
        Column of the ``predict_proba`` output to extract (0, 1 or 2 for the
        three classes here). The default keeps the previous hard-coded
        behaviour, so one-argument callers are unaffected.

    Returns
    -------
    list
        One probability per row of ``x``.
    """
    probabilities = gbc_model.predict_proba(x)
    return [row[class_idx] for row in probabilities]

In [8]:
def mlpc_predict(x, class_idx=1):
    """Return the multi-layer perceptron's probability of one class per row.

    Parameters
    ----------
    x
        Observations accepted by ``mlpc_model.predict_proba``.
    class_idx : int, default 1
        Column of the ``predict_proba`` output to extract (0, 1 or 2 for the
        three classes here). The default keeps the previous hard-coded
        behaviour, so one-argument callers are unaffected.

    Returns
    -------
    list
        One probability per row of ``x``.
    """
    probabilities = mlpc_model.predict_proba(x)
    return [row[class_idx] for row in probabilities]

In [9]:
def plot_cp(obs_idx):
    """Plot the gradient boosting model's 'flavanoids' profile for one test row.

    ``obs_idx`` is a positional index into the notebook-level ``x_test`` and
    ``y_test``. The function reads the global ``explainer_gbc``, so that
    explainer must exist by the time this is called.
    """
    observation, target = x_test.iloc[obs_idx], y_test.iloc[obs_idx]
    profile = individual_variable_profile(explainer_gbc, observation, target)
    plot(profile, selected_variables=['flavanoids'])

In [14]:
def plot_both(obs_idx):
    """Plot both models' 'flavanoids' profiles for one test row in one figure.

    ``obs_idx`` is a positional index into the notebook-level ``x_test`` and
    ``y_test``. The function reads the globals ``explainer_gbc`` and
    ``explainer_mlpc``, so both must exist by the time this is called.
    """
    observation = x_test.iloc[obs_idx]
    target = y_test.iloc[obs_idx]

    # Gradient boosting first, then the perceptron, matching the plot order.
    profiles = [
        individual_variable_profile(explainer, observation, target)
        for explainer in (explainer_gbc, explainer_mlpc)
    ]

    plot(*profiles, selected_variables=['flavanoids'])

### Gradient Boosting Classifier

In [11]:
# Explainer for the gradient boosting model, built on the test split.
# The lambda looks up gbc_predict at call time, so re-running the cell that
# defines gbc_predict takes effect without rebuilding the explainer.
explainer_gbc = explain(
    gbc_model,
    data=x_test,
    y=y_test.squeeze(),
    label='Gradient Boosting Classifier',
    predict_function=lambda x: gbc_predict(x),
)

In [169]:
# Test index 43 is the observation the gradient boosting model misclassified
# in the failure check earlier in this notebook.
# Original note: flavanoids in 8 and 43 for class 1.
plot_cp(43)

### Multi-layer Perceptron

In [12]:
# Explainer for the multi-layer perceptron, built on the test split.
# The lambda looks up mlpc_predict at call time, so re-running the cell that
# defines mlpc_predict takes effect without rebuilding the explainer.
explainer_mlpc = explain(
    mlpc_model,
    data=x_test,
    y=y_test.squeeze(),
    label='Multi-layer Perceptron Classifier',
    predict_function=lambda x: mlpc_predict(x),
)

In [16]:
# Overlay the 'flavanoids' profiles of both models for test observation 27.
plot_both(27)