 ### **A Metric to Explicitly Tell You When to Retrain a Machine Learning Model**

Blog: link.

- Application on Synthetic datasets

In [None]:
# Import libs
from holisticai.robustness.metrics.dataset_complexity import *

In [None]:
# Synthetic dataset generators to benchmark, as (display name, generator) pairs.
# Entries that are commented out are disabled for this run.
# NOTE(review): the generator names are presumably provided by the star import
# above -- confirm before re-enabling an entry.

dataset_generators = [
    ("Make Moons", generate_moons),
    # ("Make Circles", generate_circles),
    # ("Make Classification", generate_classification),
    # ("Make Blobs", generate_blobs),
    # ("XOR", generate_xor),
    # ("Swiss Roll", generate_swiss_roll),
    # ("Gaussian Quantiles", generate_gaussian_quantiles),
    # ("Make Friedman 1", generate_friedman1),
    # ("Spirals", generate_spirals),
    ("Two Intertwined Spirals", generate_two_intertwined_spirals)
]

In [None]:
# Evaluate MAGOC on the synthetic datasets listed in `dataset_generators`
evaluate_datasets(dataset_generators)

- Application on Real datasets

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from holisticai.robustness.metrics.dataset_complexity import *
from holisticai.datasets import load_dataset

# Names of the datasets to evaluate; each one is passed to load_dataset() in
# the loop below. Names that are commented out are disabled for this run.
datasets = [
    "adult",
    # "law_school",
    # "student_multiclass",
    # "us_crime_multiclass",
    # "clinical_records",
    # -- New datasets --
    # "german_credit",
    # "census_kdd",
    # "bank_marketing",
    # "compass",
    # "diabetes",
    # "acsincome",
    "acspublic"
]

# Models to evaluate, keyed by the display name used in the result tables.
# Estimators with a stochastic training procedure are seeded with the same
# value as the train/test split so that the reported accuracies (and the
# MAGOC table derived from them) are reproducible from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
}

# Complexity measures per dataset, keyed by dataset name. A dataset that fails
# to load or process gets no entry here.
complexities_data = {}
# Test accuracies per model: one value per entry of `datasets`, in order
# (NaN where the model, or the whole dataset, failed).
accuracies_data = {model_name: [] for model_name in models}

for dataset_name in datasets:
    try:
        # Load dataset
        dataset = load_dataset(dataset_name)

        # Shrink the dataset to a maximum of 1000 rows (the first ones)
        n_rows = min(1000, dataset["X"].shape[0])
        X = dataset["X"].iloc[:n_rows, :]
        y = dataset["y"].iloc[:n_rows]

        # Split the raw rows first so the held-out rows stay unseen by the
        # scaler used for the models.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        # Standardize with training-set statistics only. Fitting the scaler on
        # all rows before splitting would leak test-set information into the
        # training features and bias the measured accuracies.
        model_scaler = StandardScaler()
        X_train = model_scaler.fit_transform(X_train)
        X_test = model_scaler.transform(X_test)

        # The complexity measures describe the whole sample, so they are
        # computed on all (standardized) rows.
        X_scaled = StandardScaler().fit_transform(X)

        # NOTE(review): `px` is not imported explicitly in this cell; it is
        # presumably exposed by the star import of `dataset_complexity` --
        # confirm, or import it explicitly.
        cc = px.ComplexityCalculator()
        cc.fit(X_scaled, y)

        # Extract complexities from the report
        complexities = cc.report()["complexities"]
        complexities_data[dataset_name] = complexities

        # Evaluate models on the dataset
        for model_name, model in models.items():
            try:
                # Train on the training split, score on the held-out split
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)

                # Append accuracy for the model and dataset
                accuracies_data[model_name].append(accuracy)
            except Exception as e:
                # Best effort: record NaN so the per-model lists stay aligned
                # with `datasets`, then continue with the next model.
                accuracies_data[model_name].append(float("nan"))
                print(f"Error with model {model_name} on dataset {dataset_name}: {e}")

    except Exception as e:
        # Loading, scaling or the complexity computation failed before any
        # model was evaluated: record NaN for every model so the lists stay
        # aligned with `datasets`.
        print(f"Error processing dataset {dataset_name}: {e}")
        for model_name in models:
            accuracies_data[model_name].append(float("nan"))

# Convert the complexities data into a DataFrame with one column per dataset
# that was processed successfully (the "t1" row is looked up by label below).
complexities_df = pd.DataFrame(complexities_data)

# Convert the accuracies data into a DataFrame: rows are models, columns are
# datasets (after the transpose).
accuracies_df = pd.DataFrame(accuracies_data, index=datasets).T

# Display the accuracies. The label is printed rather than passed to
# display(), which would render the string's quoted repr.
print("Accuracies DataFrame:")
display(accuracies_df)

# Extract the `t1` row from `complexities_df` (one value per dataset). If no
# dataset produced complexities the row does not exist; fall back to NaN so
# the report below comes out empty instead of raising KeyError.
if "t1" in complexities_df.index:
    t1_series = complexities_df.loc["t1"]
else:
    t1_series = pd.Series(float("nan"), index=accuracies_df.columns)

# Divide each dataset column of accuracies_df by that dataset's T1. Datasets
# without a T1 value end up as all-NaN columns.
src_df = accuracies_df.div(t1_series, axis=1)

# MAGOC dataframe
print("The MAGOC metric by model and dataset:\n(values below 1.0 indicate that retraining the model on the dataset, if needed, is NOT recommended)")
# Drop only datasets with no result at all. A dataset on which a single model
# failed keeps its column, with NaN for that model, instead of vanishing.
display(src_df.dropna(axis=1, how="all"))
