In [None]:
import pandas as pd

# Load the pickled DataFrame, then keep only the rows whose "model" is
# "ViT-Finetuned" and whose "dataset" is "SPAC".
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you trust.
df = pd.read_pickle("merged.pkl")
df = df.loc[df["model"].eq("ViT-Finetuned") & df["dataset"].eq("SPAC")]

In [None]:
import matplotlib.pyplot as plt

def visualize_label_distribution_pie(df, top_n=25, figsize=(12, 8)):
    """
    Visualize the distribution of labels in a pie chart, ordered by largest label count.

    When there are more than ``top_n`` distinct labels, the ``top_n`` most
    frequent ones get their own slice and all remaining labels are merged
    into a single 'Other' slice. If the 'label' column holds no values,
    a message is printed and nothing is drawn.

    Parameters:
    df (pandas.DataFrame): DataFrame containing a 'label' column.
    top_n (int): Number of top labels to display individually. Others will be grouped as 'Other'. Default is 25.
    figsize (tuple): Figure size in inches. Default is (12, 8).

    Returns:
    None: Displays the plot.
    """
    # value_counts() returns the counts sorted from most to least frequent
    label_counts = df['label'].value_counts()

    # Nothing to draw: avoid an empty figure and a "no handles" legend warning
    if label_counts.empty:
        print("No labels to plot: the 'label' column is empty.")
        return

    # Prepare data for the pie chart
    if len(label_counts) > top_n:
        top_labels = label_counts.nlargest(top_n)
        # Everything that is not in the top slice ends up in 'Other'
        other_count = label_counts.sum() - top_labels.sum()
        pie_data = pd.concat([top_labels, pd.Series({'Other': other_count})])
        num_shown = len(top_labels)
    else:
        pie_data = label_counts
        num_shown = len(label_counts)

    # Create the pie chart on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=figsize)
    ax.pie(pie_data.values, labels=pie_data.index, autopct='%1.1f%%', startangle=90)
    # Report how many labels are actually shown individually, which can be
    # fewer than top_n when the data has fewer distinct labels
    ax.set_title(f"Distribution of Top {num_shown} Labels")
    ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle

    # Add a legend
    ax.legend(loc='best', bbox_to_anchor=(1, 0.5))

    fig.tight_layout()
    plt.show()


# 1. How many individuals (labels) have exactly n images?
# label_counts: images per label; count_distribution: number of labels per image count.
label_counts = df["label"].value_counts()
count_distribution = label_counts.value_counts().sort_index()

plt.figure(figsize=(12, 6))
# rot=0 keeps the x tick labels horizontal
count_distribution.plot.bar(rot=0)
plt.gca().set(
    title="Distribution of Number of Images per Individual",
    xlabel="Number of Images",
    ylabel="Number of Individuals",
)
plt.tight_layout()
plt.show()

# NOTE(liamvdv): the top 25 labels (1/5th of the dataset) account for ~60% of the dataset
visualize_label_distribution_pie(df)

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import output_notebook
import colorcet as cc
from PIL import Image
import io
import base64

# Configure Bokeh so that show() renders plots inline in the notebook output.
output_notebook()

def visualize_embeddings(df, thumbnail_size=128, random_state=42):
    """
    Build an interactive 2D scatter plot of the embeddings, colored by label.

    The embedding vectors are projected to two dimensions with PCA. Each
    point is colored by its label and shows its input image in a hover
    tooltip.

    Parameters:
    df (pandas.DataFrame): DataFrame with the columns 'embedding' (one vector
        per row; all vectors must be stackable with ``np.vstack``), 'label',
        and 'input' (presumably PIL images — each value must provide
        ``mode``, ``copy``, ``thumbnail``, ``convert`` and ``save``).
    thumbnail_size (int or None): Maximum width/height in pixels of the image
        embedded for each point. The tooltip renders images at 128x128, so
        embedding downsized copies keeps the plot small. Pass None to embed
        the images at their original resolution. Default is 128.
    random_state (int or None): Seed passed to PCA so that the projection is
        reproducible when a randomized solver is selected. Default is 42.

    Returns:
    The Bokeh figure. It is not displayed here; pass it to ``show``.
    """
    # Perform PCA
    pca = PCA(n_components=2, random_state=random_state)
    embeddings_2d = pca.fit_transform(np.vstack(df['embedding'].values))

    # Create a color map. The palette is cycled so that having more distinct
    # labels than palette entries reuses colors instead of leaving labels
    # without a color (which would raise a KeyError in the lookup below).
    palette = cc.glasbey
    color_map = {
        label: palette[position % len(palette)]
        for position, label in enumerate(df['label'].unique())
    }

    def image_to_base64(img):
        """Encode an image as a base64 JPEG string for the hover tooltip."""
        if thumbnail_size is not None:
            # thumbnail() resizes in place, so work on a copy to leave the
            # caller's image untouched
            img = img.copy()
            img.thumbnail((thumbnail_size, thumbnail_size))
        # The JPEG writer rejects modes with alpha or a palette (e.g. RGBA, P)
        if img.mode not in ("L", "RGB", "CMYK"):
            img = img.convert("RGB")
        buffered = io.BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode()

    # Prepare data for Bokeh
    data = {
        "x": embeddings_2d[:, 0],
        "y": embeddings_2d[:, 1],
        "color": [color_map[label] for label in df['label']],
        "class": df['label'],
        "image": [image_to_base64(img) for img in df['input']]
    }

    source = ColumnDataSource(data=data)

    # Create the figure
    p = figure(width=1920, height=1080, title="2D Projection of Classes", tools="pan,wheel_zoom,box_zoom,reset")

    # Add the scatter plot
    p.scatter(
        x="x",
        y="y",
        size=12,
        fill_color="color",
        line_color="black",
        source=source,
        legend_field="class"
    )

    # Add hover tool that renders the embedded image of the hovered point
    hover = HoverTool(tooltips='<img src="data:image/jpeg;base64,@image" width="128" height="128">')
    p.add_tools(hover)

    # Return the figure; the caller is responsible for displaying it
    return p

# Build the embedding scatter plot for df and display it.
fig = visualize_embeddings(df)
show(fig)