In [None]:
from gorillatracker.utils.embedding_generator import generate_embeddings_from_run, read_embeddings_from_disk
import numpy as np

# Set to True to rebuild the embeddings from the W&B run; otherwise the pickle
# referenced by the selected example is read from disk.
regen = False

# Each entry is (W&B run URL, embeddings pickle path, dataset keyword arguments).
examples = [
    (
        "https://wandb.ai/gorillas/Embedding-SwinV2Large-CXL-Open/runs/bp5e1rnx/workspace?nw=nwuserliamvdv",
        "../swin-large-example-embeddings.pkl",
        dict(
            dataset_cls="gorillatracker.datasets.cxl.CXLDataset",
            data_dir="/workspaces/gorillatracker/data/splits/ground_truth-cxl-face_images-openset-reid-val-0-test-0-mintraincount-3-seed-42-train-50-val-25-test-25",
        ),
    ),
    (
        "https://wandb.ai/gorillas/Embedding-SwinV2Large-CXL-Open/runs/bp5e1rnx/workspace?nw=nwuserliamvdv",
        "../bristol_embeddings-cxl_trained.pkl",
        dict(
            dataset_cls="gorillatracker.datasets.bristol.BristolDataset",
            data_dir="/workspaces/gorillatracker/data/splits/ground_truth-bristol-cropped_images_face-closedset-mintraincount-3-seed-42-train-0-val-100-test-0",
        ),
    ),
]
# Index into `examples` choosing which embedding set the rest of the notebook analyses.
current = 1

run_url, embeddings_path, dataset_kwargs = examples[current]
df = (
    generate_embeddings_from_run(run_url, embeddings_path, **dataset_kwargs)
    if regen
    else read_embeddings_from_disk(embeddings_path)
)

# Store every embedding as a numpy array so later cells can stack them and compute distances.
df["embedding"] = df["embedding"].apply(np.array)
df.head()

In [None]:
# Number of rows per label_string, shown as a box plot together with its mean and standard deviation.
vc = df["label_string"].value_counts()
vc.plot.box()
(vc.mean(), vc.std())

In [None]:
# Distinct label_string values and how many of them there are.
unique_label_strings = df["label_string"].unique()
unique_label_strings, len(unique_label_strings)

In [None]:
# Sanity check: `label` and `label_string` should have the same number of distinct values.
n_label_strings = df["label_string"].nunique()
n_labels = df["label"].nunique()
if n_label_strings != n_labels:
    # label_string -> label unique, label -> label_string NOT unique:
    # count the distinct label_string values carried by each label.
    grouped = df.groupby("label")["label_string"].nunique()
    non_unique_mappings = grouped.loc[grouped.gt(1)]
    # Rows whose label is shared by more than one label_string.
    has_ambiguous_label = df["label"].isin(non_unique_mappings.index)
    mismatches = df.loc[has_ambiguous_label]
    print(mismatches.head())
    # TODO(liamvdv)
    # raise ValueError("WARNING: Label does not have a 1:1 mapping with label_string")

In [None]:
from gorillatracker.scripts.visualize_embeddings import EmbeddingProjector
from io import BytesIO
from bokeh.io import show, output_notebook, reset_output
import base64

# Route bokeh output into the notebook for the duration of this cell.
output_notebook()


def _encode_jpeg_base64(image):
    """Serialise `image` as JPEG and return the bytes base64-encoded as a UTF-8 string."""
    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


# One base64 JPEG string per row of df["input"], passed to the plot alongside the labels.
images = [_encode_jpeg_base64(image) for image in df["input"]]

ep = EmbeddingProjector()
# Project the stacked embeddings down with t-SNE before plotting.
low_dim_embeddings = ep.reduce_dimensions(np.stack(df["embedding"]), method="tsne")
fig = ep.plot_clusters(
    low_dim_embeddings,
    df["label"],
    df["label_string"],
    images,
    title="Embedding Projector",
    figsize=(12, 10),
)
show(fig)
# Undo output_notebook() so later cells are not affected by bokeh's output settings.
reset_output()

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

# --- Per-class centroid statistics ---------------------------------------------------
# One row per (label, label_string) pair: the mean embedding of the class and the
# min / max / mean Euclidean distance of its members to that centroid.
#
# The statistics are computed inside the same group that defines the centroid. Filtering
# on `label` alone would mix several classes together (and compare them against the
# first centroid only) whenever one label maps to more than one label_string.
_class_rows = []
for (_label, _label_string), _class_df in df.groupby(["label", "label_string"]):
    _class_embeddings = np.vstack(_class_df["embedding"].tolist())
    _centroid = _class_embeddings.mean(axis=0)
    # Distance of every member of this class to the class centroid (shape: n_members x 1).
    _to_centroid = cdist(_class_embeddings, _centroid[np.newaxis, :])
    _class_rows.append(
        {
            "centroid": _centroid,
            "label": _label,
            "label_string": _label_string,
            "min_distance": np.min(_to_centroid),
            "max_distance": np.max(_to_centroid),
            "avg_distance": np.mean(_to_centroid),
        }
    )
centroid_df = pd.DataFrame(
    _class_rows,
    columns=["centroid", "label", "label_string", "min_distance", "max_distance", "avg_distance"],
)


def _distinct_pair_stats(square_distances):
    """Return (min, max, mean, std) over the distances between distinct items.

    `square_distances` is a symmetric all-vs-all distance matrix. Only the strict upper
    triangle is used, so the zero self-distances on the diagonal neither force the
    minimum to 0 nor drag down the mean. NaNs are returned when there are fewer than two
    items, i.e. when there is no pair to compare.
    """
    pair_distances = square_distances[np.triu_indices_from(square_distances, k=1)]
    if pair_distances.size == 0:
        return (float("nan"),) * 4
    return np.min(pair_distances), np.max(pair_distances), np.mean(pair_distances), np.std(pair_distances)


# Full all-vs-all matrices are kept under their original names.
all_dist = cdist(df["embedding"].tolist(), df["embedding"].tolist())
all_centroid_dist = cdist(centroid_df["centroid"].tolist(), centroid_df["centroid"].tolist())

_all_min, _all_max, _all_mean, _all_std = _distinct_pair_stats(all_dist)
_inter_min, _inter_max, _inter_mean, _inter_std = _distinct_pair_stats(all_centroid_dist)

print("all: Global Maximum Embedding Distance", _all_max)
print("all: Global Minimum Embedding Distance", _all_min)
print("all: Global Average Embedding Distance", _all_mean)
print("all: Standard deviation Embedding Distance", _all_std)
print("=" * 40)
# Note: the intra-class "standard deviation" is the spread of the per-class averages.
print("intra: In Class (Self, Centroid) Minimum Distance", centroid_df["min_distance"].min())
print("intra: In Class (Self, Centroid) Maximum Distance", centroid_df["max_distance"].max())
print("intra: In Class (Self, Centroid) Average Distance", centroid_df["avg_distance"].mean())
print("intra: In Class (Self, Centroid) Standard deviation Distance", centroid_df["avg_distance"].std())
print("=" * 40)
print("inter: Between Class (Centroid1, Centroid2) Minimum Distance", _inter_min)
print("inter: Between Class (Centroid1, Centroid2) Maximum Distance", _inter_max)
print("inter: Between Class (Centroid1, Centroid2) Average Distance", _inter_mean)
print("inter: Between Class (Centroid1, Centroid2) Standard deviation Distance", _inter_std)

In [None]:
# Add a frequency plot
import matplotlib.pyplot as plt

# Bar chart of the number of rows per label_string (value_counts orders them by frequency).
rows_per_label_string = df["label_string"].value_counts()
rows_per_label_string.plot.bar(title="Image Distribution")

In [None]:
# Rows per label_string, then a bar chart of how many label_strings share each row count.
inverse_df = df.groupby("label_string").size().reset_index(name="counts")
labels_per_count = inverse_df["counts"].value_counts().sort_index()
labels_per_count.plot.bar(title="Image/Label Frequency")

In [None]:
from gorillatracker.metrics import tsne
import torch

# Offset added to the label of every centroid so centroids get their own label values
# in the projection (presumably chosen larger than any real label; verify against the data).
centroid_marker = 1000000

# Project the embeddings and the class centroids together.
# Centroid-only alternative:
# p = tsne(torch.tensor(centroid_df.centroid.tolist()), torch.tensor(centroid_df.label.tolist()), perplexity=min(30, len(centroid_df)-1))
tsne_vectors = df["embedding"].tolist() + centroid_df["centroid"].tolist()
shifted_centroid_labels = [centroid_marker + label for label in centroid_df["label"].tolist()]
tsne_labels = df["label"].tolist() + shifted_centroid_labels
p = tsne(torch.tensor(tsne_vectors), torch.tensor(tsne_labels))

In [None]:
# Load IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed.
%autoreload 2

# from gorillatracker.clustering.thresholds import norm_label_distribution, find_threshold

# result = find_threshold(
#     df,
#     label_column="label_string",
#     grid_start=10.0,
#     grid_end=22.0,
#     grid_num=50,
#     unique_percentage=0.2,
#     normalize_label_distribution=False, # because we use f1 score average="weighted", thus distribution is cancelled out
#     seed=47,
# )

# result

from gorillatracker.clustering.thresholds2 import (
    k_fold_threshold_search,
    knn1,
    knn1_centroid,
    knn1_centroid_iqr,
    knn5,
    knnk_weighted_by_distance,
    knnk_weighted_by_geometric_sequence,
    downsample_class,
)

# NOTE(liamvdv): we do not normalize the distribution because we use f1 score average="weighted", thus distribution is cancelled out
# The search runs on the frame returned by downsample_class for the label_string column.
search_frame = downsample_class(df, "label_string")
search_options = dict(
    label_column="label_string",
    grid_start=2.0,
    grid_end=20.0,
    grid_num=100,
    unique_percentage=0.2,
    seed=47,
    function=knn1,
)
results, new_perc_folds = k_fold_threshold_search(search_frame, **search_options)
results

In [None]:
import matplotlib.pyplot as plt


# Extracting the data for plotting
def plot_metric(eval, new_perc_folds, metric: str = "f1"):
    """Plot a metric against the threshold for all, new and non-new labels.

    Args:
        eval: Mapping from threshold to a dict of metric values. Every value dict must
            contain the keys ``all/<metric>``, ``new/<metric>`` and ``non_new/<metric>``.
        new_perc_folds: Sequence with one fraction of new labels per fold. Its mean is
            shown in the plot title as a percentage.
        metric: Name of the metric to plot; selects the keys read from ``eval``.

    Warns:
        UserWarning: If the entries of ``new_perc_folds`` are not all (numerically)
            equal, because the title then only reflects their mean.
    """
    import warnings

    thresholds = list(eval.keys())
    per_threshold = list(eval.values())
    metric_overall = [values[f"all/{metric}"] for values in per_threshold]
    metric_new = [values[f"new/{metric}"] for values in per_threshold]
    metric_non_new = [values[f"non_new/{metric}"] for values in per_threshold]

    fold_fractions = np.asarray(new_perc_folds)
    perc_of_new_label = np.mean(fold_fractions)
    # The previous check zipped new_perc_folds with itself and therefore could never
    # fail. Compare every fold against the first one instead; warn rather than raise
    # because the title reports the mean and the plot itself remains valid.
    if fold_fractions.size > 1 and not np.allclose(fold_fractions, fold_fractions.flat[0]):
        warnings.warn(
            f"new_perc_folds differs between folds ({list(fold_fractions.ravel())}); "
            "the plot title only shows the mean.",
            UserWarning,
            stacklevel=2,
        )

    # Creating the plot
    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, metric_overall, label=f"Overall {metric}", marker="o")
    plt.plot(thresholds, metric_new, label=f"{metric} for new labels", marker="o")
    plt.plot(thresholds, metric_non_new, label=f"{metric} for non-new labels", marker="o")

    # Adding title and labels
    plt.title(f"Threshold vs {metric}" + f" @ mean {perc_of_new_label * 100:.2f}% new individuals in query set")
    plt.xlabel("Threshold")
    plt.ylabel(metric)
    plt.legend()

    # Showing the plot
    plt.grid(True)
    plt.show()


# NOTE(liamvdv): only supports one fold
plot_metric(results, new_perc_folds, metric="f1")