- https://pypi.org/project/visual_auditor/
- https://github.com/poloclub/visual-auditor/tree/258c59d7052d4fe2a7b3fafc129a2184b4c25f8c/visual-auditor-package/notebook-widget
- https://github.com/poloclub/visual-auditor/blob/258c59d7052d4fe2a7b3fafc129a2184b4c25f8c/visual-auditor-package/notebook-widget/visual_auditor/visual_auditor.py#L480
- https://pypi.org/project/visual_auditor/#files

In [None]:
from visual_auditor.visual_auditor import find_slices_and_visualize

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Location of the raw Adult data file, pinned to one commit of the
# poloclub/visual-auditor repository. The pieces below are adjacent string
# literals, so they concatenate into a single URL.
DATA_URL: str = (
    "https://raw.githubusercontent.com/poloclub/visual-auditor/"
    "258c59d7052d4fe2a7b3fafc129a2184b4c25f8c/"
    "visual-auditor-package/notebook-widget/visual_auditor/data/adult.data"
)

In [None]:
# Helper function for binning numerical features
def bin_feature(feature, data=None, n_bins=10):
    """Replace a numerical column with equal-width bin indices, in place.

    Args:
        feature: Name of the numerical column to bin.
        data: DataFrame that holds the column. When omitted, the
            notebook-level ``adult_data`` frame is used, so existing
            ``bin_feature(name)`` calls behave as before.
        n_bins: Bin specification forwarded to ``np.histogram_bin_edges``
            (10 equal-width bins by default).

    Returns:
        A list of ``" low - high"`` strings, one per bin, with both edges
        truncated to integers. A label's position in the list is the bin
        index that was written into the column.
    """
    if data is None:
        # Backward-compatible default: operate on the notebook-level frame.
        data = adult_data

    edges = np.histogram_bin_edges(data[feature], bins=n_bins)

    # Overwrite the column with the integer index of the bin each value
    # falls into; bins are right-closed and the first one also includes the
    # minimum value.
    data[feature] = pd.cut(
        data[feature],
        edges,
        labels=list(range(len(edges) - 1)),
        right=True,
        include_lowest=True,
        duplicates="drop",
    )

    # Pair each edge with its successor to describe every bin.
    return [f" {int(low)} - {int(high)}" for low, high in zip(edges, edges[1:])]

In [None]:
# Load Adult dataset. Column names are supplied explicitly and "?" is read
# as a missing value.
adult_data = pd.read_csv(
    DATA_URL,
    names=[
        "Age",
        "Workclass",
        "Final Weight",
        "Education",
        "Education-Num",
        "Marital Status",
        "Occupation",
        "Relationship",
        "Race",
        "Sex",
        "Capital Gain",
        "Capital Loss",
        "Hours Per Week",
        "Country",
        "Target",
    ],
    # Regex separator: a comma plus any surrounding whitespace, so values are
    # stripped while parsing. Regex separators need the python parser engine.
    sep=r"\s*,\s*",
    engine="python",
    na_values="?",
)

# Drop NA values
adult_data = adult_data.dropna()

# Drop irrelevant fields
adult_data = adult_data.drop(columns=["Final Weight", "Education-Num"])

# Bin numerical features. bin_feature overwrites each column with bin indices
# and returns the human-readable label of every bin.
encoders = {}
encodings = {}
numerical_features = ["Age", "Capital Gain", "Capital Loss", "Hours Per Week"]
for feature in numerical_features:
    encodings[feature] = bin_feature(feature)

# Encode categorical features
for column in adult_data.columns.difference(numerical_features):
    values = adult_data[column]
    # Text columns are `object` dtype on older pandas, but newer versions may
    # infer a dedicated string dtype; comparing against `object` alone would
    # then skip every categorical column, so accept both.
    if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
        le = LabelEncoder()
        adult_data[column] = le.fit_transform(values)
        encoders[column] = le
        encodings[column] = le.classes_
        print(column, le.classes_, le.transform(le.classes_))

# Separate Target values. Index.difference returns the remaining column names
# sorted, which fixes the feature order of X.
X = adult_data[adult_data.columns.difference(["Target"])]
y = adult_data["Target"]

# Train a classifier. The seed is fixed so that re-running the notebook
# produces the same model.
classifier = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=0)
classifier.fit(X, y)

In [None]:
# Pass the fitted classifier and the (features, labels) pair to the
# visual_auditor entry point imported at the top of the notebook.
# NOTE(review): judging by its name, this searches for data slices and renders
# them in the notebook widget — confirm against the visual_auditor.py source
# linked at the top of this file.
find_slices_and_visualize(classifier, (X, y))

---