In [38]:
import tqdm
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
    

from sklearn.preprocessing import minmax_scale
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

%matplotlib inline
plt.style.use('dark_background')

random_seed = 42

def get_labels(predicted_ints, map_):
    """
    Mapping from int class labels to string. E.g. [1,1,2] => [setosa, setosa, veridosa]
    Input:
        - predicted_ints: list of ints,
          list of integers denoting the class of each sample
        - map: iterable or dict,
          list or array or dict denoting the corresponding label for each class
    """
    return [map_[item] for item in predicted_ints]

    

data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
class_names = data.target_names

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)

clf = DecisionTreeClassifier(max_depth=10, random_state=42)
#clf = SVC(probability=True)
clf = RandomForestClassifier(max_depth=10, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_probas = clf.predict_proba(X_test).max(axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93        53
           1       0.97      0.96      0.96        90

    accuracy                           0.95       143
   macro avg       0.95      0.95      0.95       143
weighted avg       0.95      0.95      0.95       143




The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



In [43]:
def get_pertubation_results(clf, test_sample, X_train, feature_names=None, class_names=None, sampling='linspace', num_tries=10):
    """
    Function to generate pertubations per feature (independently) and perform predictions on the perturbed samples. In the end the result of the tests will be returned as rows in a dataframe.
    Input:
    - clf: sklearn-like classifier,
    The classifier needs to have the predict_proba function as implemented in sklearn.
    - test_sample: np.array,
    The feature-vector of the test sample to be tested (e.g. X_test[0] for the first)
    - X_train: np.array,
    The feature-matrix that was used (or will be used) to fit the classifier. The form is expected to be N x F, where N is the number of samples and F the number of features.
    - feature_names: list,
    iterable with the names of the features. If not given, Feat_1, Feat_2, ..., Feat_N will be used.
    - class_names: list,
    iterable with the names of the classes. If not given, the labels of the clf will be used.
    - sampling: str,
    The way the sampling procedure of the feature values will be performed. Currently:
        - linspace: linspace between min and max values
        - uniform: random sample between min and max values
    - num_tries: int,
    Output:
    - df: pd.DataFrame,
    pandas dataframe with columns ['Feat', 'Feat_Value', 'Normalized_Feat_Value', 'Class_Int', 'Class_Label', 'Proba']. Each row corresponds to a different sample where the 'Feat' feature was perturbed from the original and was given the value 'Feat_Value'. We also calculate the 'Normalized_Feat_Value' so as to have a common x-axis when plotting. 
    """
    if class_names is None:
        class_names = clf.classes_.astype(str)
    if feature_names is None:
        feature_names = [f'Feat_{i + 1}' for i in range(X_train.shape[1])]
    # Creature new feature values to train
    max_feat_values = X_train.max(axis=0)
    min_feat_values = X_train.min(axis=0)
    if sampling == 'linspace':
        values_to_try = np.linspace(min_feat_values, max_feat_values, num_tries, axis=1)
    elif sampling == 'uniform':
        values_to_try = np.random.uniform(low=min_feat_values, high=max_feat_values, size=(num_tries, len(feature_names))).T
    else:
        print(f'Sampling: {sampling} was not understood!')
        raise NotImplementedError
    # Create pertubed Data points
    X_perturbed = np.tile(test_sample, (values_to_try.size)).reshape(values_to_try.size,len(feature_names))
    
    for feat_index in range(len(feature_names)):
        X_perturbed[feat_index*num_tries:feat_index*num_tries + num_tries, feat_index] = values_to_try[feat_index,:]
    # Predict and get probas
    y_perturbed = clf.predict(X_perturbed).reshape(values_to_try.shape)
    y_perturbed_probas = clf.predict_proba(X_perturbed).max(axis=1).reshape(values_to_try.shape)


    # Create the needed df
    # Also we normalize the feature value per feature to have all features range between 0 and 1
    df = pd.DataFrame.from_dict({"Feat":list(itertools.chain.from_iterable([[feat  for i in range(num_tries)] for feat in feature_names])), "Feat_Value":values_to_try.flatten(), 'Normalized_Feat_Value': minmax_scale(values_to_try, axis=1).flatten(), 'Class_Int': y_perturbed.flatten(), 'Class_Label': get_labels(y_perturbed.flatten(), class_names), 'Proba': y_perturbed_probas.flatten(), 'Proba_Size':(minmax_scale(y_perturbed_probas.flatten()) + 0.01)})
    return df

df = get_pertubation_results(clf, X_test[11], X_train, num_tries=100, feature_names=feature_names, sampling='uniform', class_names=class_names)

# The color is related to the class label.
# The size of the dot is related to the class probability of the predicted class
## --> The bigger the size of the dot the higher the probability
# Opacity is the same for all


In [35]:
import plotly.express as px
#size_max = 20
df['Proba_Size'] = (minmax_scale(df['Proba'].values)+0.01)
fig = px.scatter(df, x="Normalized_Feat_Value", y="Feat", color="Class_Label",
                 size='Proba_Size', hover_data=['Feat', 'Feat_Value', 'Proba', 'Class_Label'], width=900, height=1500, opacity=0.5)
fig.show()

In [41]:
print(f'Index\t\tTrue\t\tPredicted\t\tProba')
print(f'~'*50)
for i, y_ in enumerate(y_pred):
    if y_test[i] != y_:
        print(f'{i}\t\t{y_test[i]}\t\t{y_}\t\t{y_probas[i]}')

Index		True		Predicted		Proba
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2		1		0		0.5
4		0		1		0.7
44		1		0		0.6
88		1		0		0.5
92		1		0		0.9
118		0		1		0.8
135		0		1		0.7


In [49]:
df = get_pertubation_results(clf, X_test[88], X_train, num_tries=100, feature_names=feature_names, sampling='uniform', class_names=class_names)

fig = px.scatter(df, x="Normalized_Feat_Value", y="Feat", color="Class_Label",
                 size='Proba_Size', hover_data=['Feat', 'Feat_Value', 'Proba', 'Class_Label'], width=900, height=1500, opacity=0.5)
fig.show()

In [39]:
np.random.uniform(low=min_feat_values, high=max_feat_values, size=(10, len(feature_names))).T.shape

(4, 10)

In [178]:
sampling = 'uniform'
num_tries = 100
x = X_test[0]
max_feat_values = X_train.max(axis=0)
min_feat_values = X_train.min(axis=0)
if sampling == 'linspace':
    values_to_try = np.linspace(min_feat_values, max_feat_values, num_tries, axis=1)
elif sampling == 'uniform':
    values_to_try = np.random.uniform(low=min_feat_values, high=max_feat_values, size=(num_tries, len(feature_names))).T
else:
    print(f'Sampling: {sampling} was not understood!')
    raise NotImplementedError
X_perturbed = np.tile(x, (values_to_try.size)).reshape(values_to_try.size,len(feature_names))
for feat_index in range(len(feature_names)):
    X_perturbed[feat_index*num_tries:feat_index*num_tries + num_tries, feat_index] = values_to_try[feat_index,:]
y_perturbed = clf.predict(X_perturbed).reshape(values_to_try.shape)
y_perturbed_probas = clf.predict_proba(X_perturbed).max(axis=1).reshape(values_to_try.shape)

In [146]:
import pandas as pd
import itertools
from sklearn.preprocessing import minmax_scale


df = pd.DataFrame.from_dict({"Feat":list(itertools.chain.from_iterable([[feat  for i in range(num_tries)] for feat in feature_names])), "Feat_Value":values_to_try.flatten(), 'Normalized_Feat_Value': minmax_scale(values_to_try, axis=1).flatten(), 'Class_Int': y_perturbed.flatten(), 'Class_Label': get_labels(y_perturbed.flatten(), class_names), 'Proba': y_perturbed_probas.flatten()})

In [149]:
import plotly.express as px
fig = px.scatter(df, x="Normalized_Feat_Value", y="Feat", color="Class_Label",
                 size='Proba', hover_data=['Feat', 'Feat_Value', 'Proba', 'Class_Label'], width=1000, height=1500)
fig.show()

In [83]:
pd.DataFrame({"Feat":[[feat in for i in range(num_tries)] for feat in feature_names], "Feat_Value":values_to_try.flatten(), 'Class_Int': y_perturbed.flatten(), 'Class_Label': get_labels(y_perturbed.flatten(), map_), 'Proba': y_perturbed_probas.flatten()})

SyntaxError: invalid syntax (<ipython-input-83-77baeea19b25>, line 1)

In [69]:
values_to_try = np.random.uniform(low=min_feat_values, high=max_feat_values, size=(num_tries, len(feature_names)))
values_to_try.shape

(10, 4)