## Breast Cancer Dataset: Tutorial and Examples

### **Imports**

In [None]:
import pandas as pd
import numpy as np

from mlbugdetection.load_data import load_dataset
from mlbugdetection.monotonic import monotonicity_mse, check_monotonicity_single_sample, check_monotonicity_multiple_samples
from mlbugdetection.critical_values import highest_and_lowest_indexes, find_critical_values, find_several_critical_values
from mlbugdetection.calibration import calibration_check
from mlbugdetection.sanity import sanity_check, sanity_check_with_indexes

### **Load Data**
#### First, divide the data into the categories of malignant and benign tumors

In [None]:
# Load the breast cancer data and preview its first five rows.
df = load_dataset()
df.head(n=5)

In [None]:
# Split the rows by their diagnosis label: "M" rows go to `mal`, "B" rows to `ben`.
mal = df.loc[df["diagnosis"] == "M"]
ben = df.loc[df["diagnosis"] == "B"]

In [None]:
# Build the feature frames: remove the label column ("diagnosis") and the
# redundant "id" column from each class subset in a single, non-mutating call.
X_ben = ben.drop(columns=["diagnosis", "id"])
X_mal = mal.drop(columns=["diagnosis", "id"])

In [None]:
# Draw one reproducible random row per class for the single-sample analyses.
single_sample_ben = X_ben.sample(n=1, random_state=42)
single_sample_mal = X_mal.sample(n=1, random_state=42)

In [None]:
# Draw 100 reproducible random rows per class for the multi-sample analyses.
# NOTE: the two classes intentionally keep their original, different seeds (42 and 23).
sample_ben = X_ben.sample(n=100, random_state=42)
sample_mal = X_mal.sample(n=100, random_state=23)


### Load the trained models that will be analyzed

In [None]:
# Locations of the pickled models analysed in the rest of this tutorial
# (paths are relative to the notebook's working directory).
model_path_nn = "models/NN/NNBreastCancer.pkl"    # neural-network model
model_path_knn = "models/KNN/KNNBreastCancer.pkl"  # k-nearest-neighbours model

### **Monotonicity analysis**
#### The monotonicity analysis module has two main functions: *check_monotonicity_single_sample* and *check_monotonicity_multiple_samples*.
#### The function *check_monotonicity_single_sample* receives a model, a single sample (one dataframe row), the feature that will be analyzed, the value interval of this feature, and the number of points analysed between this interval
#### The graph below shows that, for a random sample of a benign tumor, the feature *perimeter_mean* has a monotonic relationship with the prediction probability over the range 0 to 200 using an MLPClassifier

In [None]:
# Single benign sample, NN model: sweep "perimeter_mean" over the interval
# 0..200 using 200 points and collect the monotonicity report.
report_ben_mono = check_monotonicity_single_sample(
    model_path_nn,
    single_sample_ben,
    "perimeter_mean",
    0,
    200,
    200,
)

#### For a similar example, but from a sample of a malignant tumor, the same feature presents a monotonic behavior as well

In [None]:
# Single malignant sample, NN model: sweep "perimeter_mean" over the interval
# 0..200 using 200 points and collect the monotonicity report.
report_mal_mono = check_monotonicity_single_sample(
    model_path_nn,
    single_sample_mal,
    "perimeter_mean",
    0,
    200,
    200,
)

#### The function *check_monotonicity_multiple_samples* receives a model, a sample containing multiple dataframe rows, the feature that will be analyzed, the value interval of this feature, and the number of points analysed between this interval
#### The graph below shows the mean prediction probability of the whole sample for each point analysed. In this case, the feature *area_mean* does not have a monotonic relationship with the mean prediction probability.

In [None]:
# All benign rows, NN model: sweep "area_mean" over the interval 0..2000
# using 2000 points and collect the monotonicity report.
report_ben_mono_mult_nn = check_monotonicity_multiple_samples(
    model_path_nn,
    X_ben,
    "area_mean",
    0,
    2000,
    2000,
)

#### It is hard to judge how close the data is to monotonic behavior just by looking at the graph. The monotonicity score stored in the *metrics* attribute of the analysis report helps to quantify it.

In [None]:
# Show the metrics stored on the NN multi-sample monotonicity report
# (as the last expression of the cell, the notebook renders its value).
report_ben_mono_mult_nn.metrics

#### The MSE between the approximated monotonic curve and the real curve is so small that it is possible to consider the real curve as monotonic.
#### Running the same sample in a KNN model shows a completely different result.

In [None]:
# All benign rows, KNN model: sweep "area_mean" over the interval 0..2000.
# NOTE: this run uses 1000 points, whereas the NN run above used 2000.
report_ben_mono_mult_knn = check_monotonicity_multiple_samples(
    model_path_knn,
    X_ben,
    "area_mean",
    0,
    2000,
    1000,
)

In [None]:
# Show the metrics stored on the KNN multi-sample monotonicity report
# (as the last expression of the cell, the notebook renders its value).
report_ben_mono_mult_knn.metrics

### **Critical values analysis**
#### The critical values analysis module has two main functions: *find_critical_values* and *find_several_critical_values*.
#### The analysed feature will again be *perimeter_mean*. This module identifies data examples and feature ranges that generate the biggest changes in the model's prediction probability, which can sometimes result in classification changes.


In [None]:
# First, analyse the model's behavior over the range that "perimeter_mean"
# actually takes in the loaded data (described as the training data).
min_v, max_v = df["perimeter_mean"].min(), df["perimeter_mean"].max()

In [None]:
# Malignant sample, NN model: search for critical values of "perimeter_mean"
# between min_v and max_v (200 is presumably the number of points, as in the
# monotonicity calls -- confirm against the library), with keep_n=100.
teste2 = find_several_critical_values(
    model_path_nn,
    sample_mal,
    "perimeter_mean",
    min_v,
    max_v,
    200,
    keep_n=100,
)

In [None]:
# Benign sample, NN model: search for critical values of "perimeter_mean"
# between min_v and max_v (200 is presumably the number of points, as in the
# monotonicity calls -- confirm against the library), with keep_n=100.
teste3 = find_several_critical_values(
    model_path_nn,
    sample_ben,
    "perimeter_mean",
    min_v,
    max_v,
    200,
    keep_n=100,
)

In [None]:
# Print how many critical indexes were recorded for the malignant (teste2)
# and benign (teste3) runs, one count per line.
for _report in (teste2, teste3):
    print(len(_report.metrics["critical_indexes"]))

In [None]:
# Malignant sample, NN model, widened interval: the lower bound is
# min_v - 1.5 * min_v and the upper bound is max_v + 1.5 * max_v, i.e. the
# sweep extends beyond the values observed in the data.
teste4 = find_several_critical_values(
    model_path_nn,
    sample_mal,
    "perimeter_mean",
    min_v - min_v * 1.5,
    max_v + max_v * 1.5,
    200,
    keep_n=100,
)

In [None]:
# Benign sample, NN model, widened interval: the lower bound is
# min_v - 1.5 * min_v and the upper bound is max_v + 1.5 * max_v, i.e. the
# sweep extends beyond the values observed in the data.
teste5 = find_several_critical_values(
    model_path_nn,
    sample_ben,
    "perimeter_mean",
    min_v - min_v * 1.5,
    max_v + max_v * 1.5,
    200,
    keep_n=100,
)

In [None]:
# Print how many critical indexes were recorded for the widened-interval
# malignant (teste4) and benign (teste5) runs, one count per line.
for _report in (teste4, teste5):
    print(len(_report.metrics["critical_indexes"]))