In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict


def generate_folds(df, k=5, seed=42):
    """
    Assign every row a fold index such that, within each (dataset, model) group,
    all rows sharing a label end up in the same fold (label-disjunct folds).

    Args:
    df (pd.DataFrame): Input dataframe with columns 'dataset', 'model', 'label'
    k (int): Number of folds the unique labels of each group are split into
    seed (int): Seed for the local random number generator (reproducibility)

    Returns:
    pd.DataFrame: Copy of the input dataframe with an additional integer 'fold' column
    """
    rng = np.random.default_rng(seed)

    # Work on a copy; -1 marks rows that have not been assigned yet.
    result = df.copy()
    result["fold"] = -1

    for (dataset, model), group in df.groupby(["dataset", "model"]):
        # Shuffle the group's distinct labels in place with the local RNG.
        labels = group["label"].unique()
        rng.shuffle(labels)

        # Chunk the shuffled labels into k parts; the chunk position is the fold id.
        fold_of = {
            label: fold_idx
            for fold_idx, chunk in enumerate(np.array_split(labels, k))
            for label in chunk
        }

        # Write the fold of each row's label back into the result.
        rows = (result["dataset"] == dataset) & (result["model"] == model)
        result.loc[rows, "fold"] = result.loc[rows, "label"].map(fold_of)

    # Sanity check: no row may keep the placeholder value.
    assert (result["fold"] != -1).all(), "Some rows were not assigned a fold"

    return result

In [None]:
import pandas as pd
import numpy as np
from typing import List, Tuple


def stripe_samples(
    train_group: pd.DataFrame, test_group: pd.DataFrame, min_samples: int, max_samples: int, rng: np.random.Generator
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Transfer a randomly sized batch of rows of one label from the train group to the test group.

    The batch size is drawn uniformly from [min_samples, max_samples] and capped at the
    number of rows available in the train group.

    Args:
    train_group (pd.DataFrame): Train rows of a single label
    test_group (pd.DataFrame): Test rows of the same label (may be empty)
    min_samples (int): Lower bound (inclusive) for the number of rows to move
    max_samples (int): Upper bound (inclusive) for the number of rows to move
    rng (np.random.Generator): Random number generator used for the draw and the sampling

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: (train group without the moved rows,
    test group with the moved rows appended and a fresh 0..n-1 index)
    """
    requested = rng.integers(min_samples, max_samples + 1)
    # Never ask for more rows than the train group holds.
    count = min(requested, len(train_group))

    moved = train_group.sample(n=count, random_state=rng)
    remaining_train = train_group.drop(moved.index)
    grown_test = pd.concat([test_group, moved], ignore_index=True)

    return remaining_train, grown_test


def split(
    df: pd.DataFrame, k: int = 5, n: int = None, min_samples: int = None, max_samples: int = None, seed: int = 42
) -> List[Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Build k (train, test) pairs from the 'fold' column: fold i is the test set, all other folds the train set.

    Every pair gets an 'is_new' column (True for rows that come from the held-out fold,
    False for rows that come from the other folds). If n, min_samples and max_samples are
    all given, n randomly chosen train labels each hand over between min_samples and
    max_samples rows to the test set; those rows keep is_new == False.

    Args:
    df (pd.DataFrame): Input dataframe with 'fold', 'dataset', 'model', and 'label' columns
    k (int): Number of folds (default 5)
    n (int): Number of labels to move samples from train to test
    min_samples (int): Minimum number of samples to move per label
    max_samples (int): Maximum number of samples to move per label
    seed (int): Random seed for reproducibility

    Returns:
    List[Tuple[pd.DataFrame, pd.DataFrame]]: List of (train, test) dataframe pairs
    """
    assert "fold" in df.columns, "Dataframe must have a 'fold' column"
    assert "dataset" in df.columns and "model" in df.columns, "Dataframe must have 'dataset' and 'model' columns"
    assert "label" in df.columns, "Dataframe must have a 'label' column"
    assert (
        len(df["dataset"].unique()) == 1 and len(df["model"].unique()) == 1
    ), "Dataframe should contain only one dataset and model combination"

    # Moving samples is optional and only happens when all three knobs are set.
    striping_enabled = not (n is None or min_samples is None or max_samples is None)
    rng = np.random.default_rng(seed)

    def build_pair(fold_idx):
        # Held-out fold -> test ('new' classes); everything else -> train.
        held_out = df[df["fold"] == fold_idx].copy()
        held_out["is_new"] = True
        kept = df[df["fold"] != fold_idx].copy()
        kept["is_new"] = False

        if not striping_enabled:
            return kept, held_out

        # Pick the train labels that donate samples to the test set.
        candidates = kept["label"].unique()
        chosen = rng.choice(candidates, size=min(n, len(candidates)), replace=False)

        for label in chosen:
            remaining, extended = stripe_samples(
                kept[kept["label"] == label], held_out[held_out["label"] == label], min_samples, max_samples, rng
            )
            # Swap the label's old rows for the updated ones on both sides.
            kept = pd.concat([kept[kept["label"] != label], remaining], ignore_index=True)
            held_out = pd.concat([held_out[held_out["label"] != label], extended], ignore_index=True)

        return kept, held_out

    return [build_pair(fold_idx) for fold_idx in range(k)]

In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score
from typing import List, Tuple


def knn_openset_recognition(
    splits: List[Tuple[pd.DataFrame, pd.DataFrame]],
    n_neighbors_range: List[int] = [1, 3, 5, 7],
    threshold_range: List[float] = np.arange(0.1, 1.0, 0.1),
):
    """
    Perform KNN + threshold grid search for open-set recognition.

    For every split the embeddings are standardized (scaler fitted on train only), a
    KNeighborsClassifier is fitted per n_neighbors value, and every threshold is applied
    to the maximum class probability: samples below the threshold are predicted as -1
    (unknown). The best combination is tracked over all (split, n_neighbors, threshold)
    triples individually, i.e. scores are not averaged across splits.

    Args:
    splits (List[Tuple[pd.DataFrame, pd.DataFrame]]): List of (train, test) dataframe pairs,
        each with 'embedding' and 'label' columns
    n_neighbors_range (List[int]): Range of n_neighbors to search
    threshold_range (List[float]): Range of probability thresholds to search

    Returns:
    Tuple[dict, dict]: (best_params, best_score) where best_params has the keys
    'n_neighbors' and 'threshold' and best_score has the keys 'f1', 'auroc' and 'combined'
    """
    best_params = {"n_neighbors": 0, "threshold": 0}
    best_score = {"f1": 0, "auroc": 0, "combined": 0}

    for train_df, test_df in splits:
        # Prepare the data
        X_train = np.stack(train_df["embedding"].values)
        y_train = train_df["label"].values
        X_test = np.stack(test_df["embedding"].values)
        y_test = test_df["label"].values

        # Standardize the features (statistics come from the train set only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Test labels that never occur in train are the 'unknown' (positive) class
        known_labels = set(y_train)
        unknown_labels = set(y_test) - known_labels
        y_test_binary = np.where(np.isin(y_test, list(unknown_labels)), 1, 0)

        for n_neighbors in n_neighbors_range:
            # Train KNN
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(X_train_scaled, y_train)

            # Everything below depends only on n_neighbors, not on the threshold,
            # so it is computed once per classifier.
            probabilities = knn.predict_proba(X_test_scaled)
            max_probability = np.max(probabilities, axis=1)
            top_class = knn.classes_[np.argmax(probabilities, axis=1)]

            # For AUROC, a low max probability is the score for 'unknown'
            auroc = roc_auc_score(y_test_binary, 1 - max_probability)

            for threshold in threshold_range:
                # Reject (-1) every sample whose best class is not confident enough
                y_pred = np.where(max_probability >= threshold, top_class, -1)

                # F1 is computed only over the samples that were not rejected
                known_mask = y_pred != -1
                f1 = f1_score(y_test[known_mask], y_pred[known_mask], average="weighted")

                # Combine metrics (you can adjust the weights as needed)
                combined_score = 0.5 * f1 + 0.5 * auroc

                # Update best parameters if better score is found
                if combined_score > best_score["combined"]:
                    best_params["n_neighbors"] = n_neighbors
                    best_params["threshold"] = threshold
                    best_score["f1"] = f1
                    best_score["auroc"] = auroc
                    best_score["combined"] = combined_score

    return best_params, best_score

In [None]:
import math
import numpy as np
import pandas as pd
from sklearn.calibration import label_binarize
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score
from typing import List, Tuple, Dict
from gorillatracker.classification.clustering import EXT_MERGED_DF
from gorillatracker.classification.metrics import analyse_embedding_space

# Assuming generate_folds and split functions are available from the previous code
# If not, please provide their implementations


def knn_openset_recognition(
    dataset: pd.DataFrame,
    queryset: pd.DataFrame,
    thresholds: List[float],
    method: str = "knn1",
    snapshot: List[float] = None,
) -> Dict[float, Dict[str, float]]:
    """
    Perform KNN + threshold grid search for open-set recognition.

    Embeddings are standardized (scaler fitted on the dataset only), the 5 nearest dataset
    neighbours of every query sample are looked up once, and each threshold is then applied
    to the neighbour distances by the selected prediction method.

    Args:
    dataset (pd.DataFrame): Training dataset with 'label' and 'embedding' columns
    queryset (pd.DataFrame): Query dataset with 'label' and 'embedding' columns
    thresholds (List[float]): List of distance thresholds to search
    method (str): Method to use for classification ('knn1', 'knn5', 'knn5distance')
    snapshot (List[float]): Thresholds for which the raw labels are stored as well
        (matched with an absolute tolerance of 1e-6)

    Returns:
    Dict[float, Dict[str, float]]: Metrics from compute_metrics for each threshold; for
    snapshot thresholds the entry additionally holds 'y_true' and 'y_pred'

    Raises:
    NotImplementedError: for 'knn1centroid' and 'knn1centroid_iqr'
    ValueError: for any other unknown method
    """
    # Resolve the prediction strategy once, before any expensive work is done.
    if method == "knn1":
        predict = knn1_predict
    elif method == "knn5":
        predict = knn5_predict
    elif method == "knn5distance":
        predict = knn_weighted_distance_predict
    elif method == "knn1centroid":
        raise NotImplementedError("Method not implemented")
    elif method == "knn1centroid_iqr":
        raise NotImplementedError("Method not implemented")
    else:
        raise ValueError(f"Unknown method: {method}")

    # Prepare the data
    X_train = np.stack(dataset["embedding"].values)
    X_query = np.stack(queryset["embedding"].values)
    y_query = queryset["label"].values

    # Standardize the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_query_scaled = scaler.transform(X_query)

    # Fit the NearestNeighbors model and find the nearest neighbors for queryset
    nbrs = NearestNeighbors(n_neighbors=5).fit(X_train_scaled)
    distances, indices = nbrs.kneighbors(X_query_scaled)

    # Loop invariants: the known labels and the snapshot thresholds.
    known_labels = dataset["label"].unique()
    snapshot_values = [] if snapshot is None else list(snapshot)

    results = {}
    for t in thresholds:
        predictions = predict(dataset, indices, distances, t)
        results[t] = compute_metrics(y_query, predictions, known_labels)

        # Test the closeness itself; testing the truthiness of the matched value
        # would silently skip a snapshot at threshold 0.0.
        if any(math.isclose(t, s, abs_tol=1e-6) for s in snapshot_values):
            results[t]["y_true"] = y_query
            results[t]["y_pred"] = predictions
    return results


def knn1_predict(dataset, indices, distances, threshold):
    """Predict the label of the single nearest neighbour, or -1 if it is farther away than threshold."""
    nearest_idx = indices[:, 0]
    within_threshold = distances[:, 0] <= threshold
    nearest_labels = dataset.iloc[nearest_idx]["label"]
    return nearest_labels.where(within_threshold, -1).values


def knn5_predict(dataset, indices, distances, threshold):
    """
    Majority vote among the neighbours that lie within threshold.

    Returns -1 for a query sample when none of its neighbours is close enough; ties are
    resolved by taking the first entry of pandas' mode().
    """

    def vote(neighbor_idx, neighbor_dist):
        close = dataset.iloc[neighbor_idx][neighbor_dist <= threshold]
        if close.empty:
            return -1
        return close["label"].mode()[0]

    return np.array([vote(neighbor_idx, neighbor_dist) for neighbor_idx, neighbor_dist in zip(indices, distances)])


def knn_weighted_distance_predict(dataset, indices, distances, threshold):
    """
    Distance-weighted vote over all returned neighbours.

    A query sample is predicted as -1 when its nearest neighbour is farther away than
    threshold. Otherwise every neighbour votes for its label with weight 1 / (1 + distance)
    and the label with the largest summed weight wins.
    """

    def weighted_vote(neighbor_idx, neighbor_dist):
        # Only the nearest neighbour decides whether the sample is rejected.
        if neighbor_dist[0] > threshold:
            return -1
        ballot = pd.DataFrame(
            {"label": dataset.iloc[neighbor_idx]["label"], "weight": 1 / (1 + neighbor_dist)}
        )
        return ballot.groupby("label")["weight"].sum().idxmax()

    return np.array(
        [weighted_vote(neighbor_idx, neighbor_dist) for neighbor_idx, neighbor_dist in zip(indices, distances)]
    )


import numpy as np
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(y_true, y_pred, unique_labels):
    """
    Score open-set predictions in three views: new vs. known, known classes only, and
    plain multiclass over all classes including the new class.

    The new class is encoded as label -1; every other label is a known class.

    Args:
    y_true (array-like): True labels
    y_pred (array-like): Predicted labels
    unique_labels (array-like): List of known class labels

    Returns:
    dict: Dictionary containing computed metrics

    How a (true | pred) pair counts:
    1|1   - correct
    1|2   - wrong known class
    1|-1  - wrong in multiclass and in new vs. known
    -1|1  - wrong in multiclass and in new vs. known
    -1|-1 - correct new

    In the multiclass view a sample is correct exactly when true equals pred, no matter
    whether the class is new or known (classification over n+1 classes).
    """
    truth = np.array(y_true)
    guess = np.array(y_pred)

    truth_is_known = truth != -1
    guess_is_known = guess != -1

    # New vs. known: collapse all known labels to 1, keep -1 for new.
    truth_binary = np.where(truth_is_known, 1, -1)
    guess_binary = np.where(guess_is_known, 1, -1)
    binary_accuracy = accuracy_score(truth_binary, guess_binary)
    binary_f1 = f1_score(truth_binary, guess_binary, labels=[-1, 1])

    # Known only: accuracy over samples where neither side is the new class.
    both_known = truth_is_known & guess_is_known
    only_known_accuracy = accuracy_score(truth[both_known], guess[both_known])
    # The F1 is not masked; instead labels= restricts it to the known classes so that
    # '-1' is left out. zero_division=1 allows known labels that do not occur at all.
    only_known_f1 = f1_score(truth, guess, labels=unique_labels, average="macro", zero_division=1)

    # Multiclass over n+1 classes.
    multiclass_accuracy = accuracy_score(truth, guess)
    multiclass_f1 = f1_score(truth, guess, average="macro")
    multiclass_f1_weighted = f1_score(truth, guess, average="weighted")

    return {
        "new_vs_all_accuracy": binary_accuracy,
        "new_vs_all_f1": binary_f1,
        "only_known_accuracy": only_known_accuracy,
        "only_known_f1": only_known_f1,
        "multiclass_accuracy": multiclass_accuracy,
        "multiclass_f1": multiclass_f1,
        "multiclass_f1_weighted": multiclass_f1_weighted,
    }


def with_centroid_queryset(dataset, queryset, function, *args, **kwargs):
    """
    Call function with the queryset reduced to one centroid embedding per label.

    The centroid of a label is the element-wise mean of all of its embeddings. The reduced
    queryset has exactly the columns 'label' and 'embedding'; dataset and any extra
    positional/keyword arguments are passed through to function unchanged.
    """
    embeddings_per_label = queryset.groupby("label")["embedding"]
    centroids = embeddings_per_label.apply(lambda embeddings: np.vstack(embeddings).mean(axis=0))

    centroid_qs = centroids.reset_index()
    centroid_qs.columns = ["label", "embedding"]
    return function(dataset, centroid_qs, *args, **kwargs)


def run_knn_openset_recognition_cv(
    thresholds: List[float],
    df: pd.DataFrame,
    k_fold: int = 5,
    n: int = 3,
    min_samples: int = 1,
    max_samples: int = 5,
    seed: int = 42,
    method: str = "knn1",
    snapshots: List[float] = None,
) -> Dict[float, Dict[str, List[float]]]:
    """
    Run KNN open-set recognition with cross-validation.

    For every (train, test) pair produced by split(), the labels of all test rows flagged
    'is_new' are replaced by -1 (and 'label_string' by "new") before the pair is evaluated
    with knn_openset_recognition.

    Args:
    thresholds (List[float]): List of thresholds to search
    df (pd.DataFrame): Input dataframe
    k_fold (int): Number of folds for cross-validation
    n, min_samples, max_samples, seed: Parameters for the split function
    method (str): KNN method to use
    snapshots (List[float]): Thresholds for which y_true / y_pred are stored as well

    Returns:
    Dict[float, Dict[str, List[float]]]: Cross-validation results for each threshold. Every
    metric maps to a list with one entry per fold; the 'count_queryset_*' entries hold the
    per-fold number of test images / classes (total and new).
    """
    new_label = -1
    assert new_label not in df["label"].unique(), "New label already exists in the dataframe"
    splits = split(df, k=k_fold, n=n, min_samples=min_samples, max_samples=max_samples, seed=seed)

    cv_results = defaultdict(lambda: defaultdict(list))
    for train_df, test_df in splits:
        is_new = test_df["is_new"]

        # Collect the counts before the labels of the new rows are overwritten.
        classes_total = test_df["label"].nunique()
        classes_new = test_df.loc[is_new, "label"].nunique()
        # Row counts as plain ints (DataFrame.count() would give a per-column Series).
        images_total = len(test_df)
        images_new = int(is_new.sum())

        # set all test_df labels to new_label if is_new column set
        test_df.loc[is_new, "label"] = new_label
        test_df.loc[is_new, "label_string"] = "new"

        fold_results = knn_openset_recognition(train_df, test_df, thresholds, method=method, snapshot=snapshots)
        for t, metrics in fold_results.items():
            threshold_results = cv_results[t]
            for metric, value in metrics.items():
                threshold_results[metric].append(value)

            # One count entry per fold and threshold, not one per metric.
            threshold_results["count_queryset_images_new"].append(images_new)
            threshold_results["count_queryset_classes_new"].append(classes_new)
            threshold_results["count_queryset_images_total"].append(images_total)
            threshold_results["count_queryset_classes_total"].append(classes_total)

    return cv_results


# Experiment driver: evaluates the "knn1" method over a grid of distance thresholds.
# EXT_MERGED_DF is imported from gorillatracker.classification.clustering;
# presumably one row per embedded image — TODO confirm.
edf = EXT_MERGED_DF

# Filter and prepare the data: keep a single (dataset, model) combination
# (split() asserts that there is exactly one) and assign label-disjunct folds.
df = generate_folds(
    edf[(edf["dataset"] == "SPAC+min3+max10") & (edf["model"] == "ViT-Finetuned")].reset_index(drop=True)
)
analysis = analyse_embedding_space(df)
max_distance = analysis["global_max_dist"]
# NOTE(review): min_distance is not used by any code visible in this file.
min_distance = analysis["global_min_dist"]
# Set up parameters: 30 evenly spaced thresholds from 0 to max_distance + 10
thresholds = np.linspace(0, max_distance + 10, 30)
method = "knn1"

# Run cross-validation (k_fold, min_samples and seed keep their defaults)
cv_results = run_knn_openset_recognition_cv(thresholds, df, method=method, n=30, max_samples=2)
# TODO(liamvdv): find best threshold

In [None]:
def test(true, pred):
    """Print every metric of compute_metrics for the given labels, using the module-level unique_labels."""
    for name, score in compute_metrics(true, pred, unique_labels).items():
        print(f"{name}: {score}")


# Manual smoke checks for compute_metrics: each scenario prints a title and then all metrics.
# Define our known class labels (read by test() as a global; -1 is the new class)
unique_labels = [0, 1, 2]

# Every prediction equals the true label, including the new-class sample.
print("Perfect classification")
y_true = np.array([0, 1, 2, -1])
y_pred = np.array([0, 1, 2, -1])
test(y_true, y_pred)

# Known label 2 occurs in neither y_true nor y_pred.
print("\nLabel in Unique not in True nor Pred")
# UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
y_true = np.array([0, 1, 1, -1])
y_pred = np.array([0, 1, 1, -1])
test(y_true, y_pred)

# NOTE(review): in these arrays a known sample (2) is predicted as another known class (1)
# and a new sample (-1) is predicted as known (1); no known sample is predicted as new,
# so the printed title does not match the data.
print("\nMisclassify a known class as new class")
y_true = np.array([0, 1, 2, -1, -1])
y_pred = np.array([0, 1, 1, -1, 1])
test(y_true, y_pred)

# One known sample (2) is predicted as a different known class (1).
print("\n+Misclassify a known class as a different known")
y_true = np.array([0, 1, 2, -1])
y_pred = np.array([0, 1, 1, -1])
test(y_true, y_pred)


# No new-class sample in y_true: one sample of class 2 is predicted as class 1,
# another one as new (-1). The title is a placeholder.
print("\n+asdfasdf")
y_true = np.array([0, 1, 2, 2])
y_pred = np.array([0, 1, 1, -1])
test(y_true, y_pred)

# Every sample is predicted as new (-1).
print("\n+Only New")
y_true = np.array([0, 1, 2, -1])
y_pred = np.array([-1, -1, -1, -1])
test(y_true, y_pred)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from typing import Dict, List
import math


def visualize_metrics(cv_results: Dict[float, Dict[str, List[float]]], thresholds: List[float]):
    """
    Visualize metrics from cross-validation results, handling None values,
    and combining count_* metrics into a single subplot.

    Each non-count metric gets its own subplot showing the mean over the stored values per
    threshold with a +/- one standard deviation band; all metrics whose name starts with
    'count_' share one additional subplot. The metric names are taken from the entry of
    the first threshold.

    Args:
    cv_results (Dict[float, Dict[str, List[float]]]): Cross-validation results
    thresholds (List[float]): List of thresholds used

    Returns:
    None (displays the plot)
    """
    metrics = list(cv_results[thresholds[0]].keys())
    count_metrics = [m for m in metrics if m.startswith("count_")]
    other_metrics = [m for m in metrics if not m.startswith("count_")]

    num_metrics = len(other_metrics) + 1  # +1 for the combined count metrics
    num_cols = 3
    num_rows = math.ceil(num_metrics / num_cols)

    # squeeze=False always yields a 2D array of axes, so flattening also works for a
    # single row (where plt.subplots would otherwise return a 1D array).
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows), squeeze=False)
    fig.suptitle("Metrics across Different Thresholds", fontsize=16)

    # Flatten axes array for easier indexing
    axes = axes.flatten()

    # Plot other metrics
    for i, metric in enumerate(other_metrics):
        ax = axes[i]

        valid_thresholds = []
        mean_values = []
        std_values = []

        # Thresholds without any non-None value are skipped for this metric.
        for t in thresholds:
            values = [v for v in cv_results[t][metric] if v is not None]
            if values:
                valid_thresholds.append(t)
                mean_values.append(np.mean(values))
                std_values.append(np.std(values))

        if valid_thresholds:
            ax.plot(valid_thresholds, mean_values, marker="o")
            ax.fill_between(
                valid_thresholds,
                [m - s for m, s in zip(mean_values, std_values)],
                [m + s for m, s in zip(mean_values, std_values)],
                alpha=0.2,
            )

        ax.set_xlabel("Threshold")
        ax.set_ylabel(metric)
        ax.set_title(metric)
        ax.grid(True, linestyle="--", alpha=0.7)

    # Plot combined count metrics
    ax = axes[len(other_metrics)]
    ax.set_title("Count Metrics")
    ax.set_xlabel("Threshold")
    ax.set_ylabel("Count")

    for metric in count_metrics:
        mean_values = []
        std_values = []

        # Missing values become NaN so that every line spans the full threshold axis.
        for t in thresholds:
            values = [v for v in cv_results[t][metric] if v is not None]
            if values:
                mean_values.append(np.mean(values))
                std_values.append(np.std(values))
            else:
                mean_values.append(np.nan)
                std_values.append(np.nan)

        ax.plot(thresholds, mean_values, marker="o", label=metric)
        ax.fill_between(
            thresholds,
            [m - s for m, s in zip(mean_values, std_values)],
            [m + s for m, s in zip(mean_values, std_values)],
            alpha=0.2,
        )

    ax.legend()
    ax.grid(True, linestyle="--", alpha=0.7)

    # Remove any unused subplots
    for i in range(num_metrics, len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.show()


# Plot the cross-validation results computed in the cell above over the same threshold grid.
visualize_metrics(cv_results, thresholds)

## Metrics
-1 for new class.
At low thresholds the new class is assigned to too many samples; at high thresholds it is assigned to too few (once the threshold grows larger than the max cross-point distance).

multiclass_* look at all classes (including 'new')
multiclass_accuracy - how often are our predictions correct (compare for same value in true|pred columns)
multiclass_f1 - is macro-averaged: all classes have the same importance. 
multiclass_f1_weighted - is weighted by sample count. At the start we should see only -1 in the result set, i.e. strong class imbalance. 

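only_known_* looks only at the known classes: only_known_accuracy is computed on the samples where neither the true nor the predicted label is -1; only_known_f1 is the macro F1 restricted to the known class labels.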


new_* looks at '-1' vs rest. It binarizes both columns to 0/1 [actually 1, -1] and then checks for equality.
new_precision will start at % of non-new images (they are classified wrong, as all have 'new' label). 


=> Precision is skewed towards the number of samples. E.g. with 50% new / 50% known, the starting precision will be 0.5; it will then grow to an optimum. 
