 # **Accuracy Degradation Profile (ADP) and Accuracy Degradation Factor (ADF) for large datasets**


This notebook shows how to run accuracy degradation experiments on large datasets using two metrics: the accuracy degradation profile (ADP) and the accuracy degradation factor (ADF).

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from holisticai.robustness.metrics import (
    accuracy_degradation_profile,
    accuracy_degradation_factor,
    pre_process_data,
)

from holisticai.robustness.plots import (
                                    plot_2d,
                                    plot_adp_and_adf,
                                    plot_label_and_prediction,
                                    plot_neighborhood,
                                    )

from holisticai.utils.models.neighbors import (
    NearestNeighbors
)

Applying ADP and ADF to **real datasets:**

(You can import your own dataset)

In [None]:
from holisticai.datasets import load_dataset

# Dataset names that can be passed to load_dataset:
#   'adult', 'law_school', 'student_multiclass',
#   'us_crime_multiclass', 'clinical_records'
#
# New datasets:
#   'german_credit', 'census_kdd', 'bank_marketing', 'compass',
#   'diabetes', 'acsincome', 'acspublic'

# Load the dataset and report its size before any truncation
dataset = load_dataset('adult')
print(f'Original X shape: {dataset["X"].shape}')
print(f'Original y shape: {dataset["y"].shape}')

# Shrink the dataset: keep only the first n_rows rows
n_rows = 40000
# n_rows = dataset.data.shape[0] # Select all rows

# head(n) keeps the leading n rows, the same slice as iloc[:n]
X = dataset['X'].head(n_rows)
y = dataset['y'].head(n_rows)


In [3]:
# Hold out 30% of the rows as the test set; random_state is reused by the
# cells below so that the whole run shares one seed
random_state = 42
X_train, X_test, y_train, y_test, test_indices = pre_process_data(
    X,
    y,
    test_size=0.3,
    random_state=random_state,
)

In [None]:
# Imports
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fit a decision tree on the training split (fit returns the estimator itself)
clf = tree.DecisionTreeClassifier(random_state=random_state).fit(X_train, y_train)

# Predict the labels of the held-out rows
y_pred = clf.predict(X_test.values)

# Accuracy over the entire test set
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy test set: {accuracy_test:.4f}")

In [None]:
# Score the predictions against the true test labels and report the result
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy test set: {accuracy_test:.4f}")

# Compute the accuracy degradation profile (ADP)

In [7]:
# Perform accuracy degradation profile (ADP)
#
# ADP is computed on several random partitions of the test set and the
# numeric results of the individual runs are averaged.

import random

# Auxiliary data structures
n_samples = X_test.shape[0]
indexes = list(range(n_samples))
# Never request more rows than the test set holds: random.sample raises
# ValueError when the sample is larger than the population.
partition_size = min(1000, n_samples)
results_list = []
n_bootstrap = 10
batch_size = 1000

# Dedicated generator seeded with the notebook-wide random_state, so the
# partitions (and therefore the averaged profile) are the same on every run
# and the global `random` state is left untouched.
rng = random.Random(random_state)

# Bootstrapping
for _ in range(n_bootstrap):

    # Draw partition_size distinct row positions from the test set
    index_batch = rng.sample(indexes, partition_size)

    X_test_partition = X_test.iloc[index_batch]
    y_test_partition = y_test.iloc[index_batch]
    y_pred_test_partition = y_pred[index_batch]

    results = accuracy_degradation_profile(
        X_test_partition,
        y_test_partition,
        y_pred_test_partition,
        neighbor_estimator=NearestNeighbors(batch_size=batch_size, n_neighbors=50),
        step_size=0.02,
    )
    results_list.append(results.data)

# Result data structures
# The last column of every run is dropped before averaging (presumably the
# textual decision column, which cannot be averaged -- TODO confirm).
df_results = [x.iloc[:, :-1] for x in results_list]
df_table = sum(df_results) / len(df_results)
# Rebuild the decision from the averaged "degradate" value: any positive
# average marks the step as degraded.
df_table["decision"] = df_table["degradate"].apply(lambda x: "acc degrad!" if x > 0 else "OK")
del df_table["degradate"]
# df_table

plot_adp_and_adf(df_table)

In [None]:
# Using faiss to optimize

import faiss

class Nearest:
    """Nearest-neighbour estimator backed by a faiss ``IndexFlatL2`` index.

    Provides the ``fit`` / ``kneighbors`` pair so an instance can be passed
    as ``neighbor_estimator`` to ``accuracy_degradation_profile``.
    """

    def fit(self, X):
        """Build the index from the rows of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Reference points; converted to a C-contiguous float32 array
            before being added to the index.

        Returns
        -------
        Nearest
            The fitted estimator itself, so calls can be chained.
        """
        vectors = np.ascontiguousarray(X, dtype="float32")
        self.index = faiss.IndexFlatL2(vectors.shape[1])
        self.index.add(vectors)
        return self

    def kneighbors(self, X, n_neighbors, return_distance=False):
        """Look up the ``n_neighbors`` nearest indexed points for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Query points; converted to float32 exactly as in ``fit``.
        n_neighbors : int
            Number of neighbours to retrieve per query.
        return_distance : bool, default False
            When True, also return the distances reported by the index
            (NOTE(review): faiss documents these as squared L2 distances
            for IndexFlatL2 -- confirm if exact distances are needed).

        Returns
        -------
        indices, or (distances, indices) when ``return_distance`` is True.
        """
        # Cast the queries the same way as the indexed vectors so float64
        # arrays and DataFrames are handled consistently.
        queries = np.ascontiguousarray(X, dtype="float32")
        distances, indices = self.index.search(queries, n_neighbors)
        if return_distance:
            return distances, indices
        return indices

In [None]:
# Run ADP on the complete test set, using the faiss-backed Nearest estimator
# for the neighbour search
results = accuracy_degradation_profile(
    X_test,
    y_test,
    y_pred,
    neighbor_estimator=Nearest(),
    n_neighbors=50,
    step_size=0.02,
)
results

In [None]:
# # Perform accuracy degradation profile (ADP)
# results = accuracy_degradation_profile(X_test, 
#                                     y_test, 
#                                     y_pred,
#                                     neighbor_estimator = NearestNeighbors(batch_size=5000, n_neighbors = 50),
#                                     n_neighbors = 50,
#                                     step_size = 0.02,
#                                     )
# results