In [None]:
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import timm
from torchvision.transforms import Resize, Normalize, Compose

from gorillatracker.data.nlet import build_onelet, SupervisedDataset
from gorillatracker.data.nlet_dm import NletDataModule
from gorillatracker.model.wrappers_ssl import MoCoWrapper
from gorillatracker.model.wrappers_supervised import TimmEvalWrapper, BaseModuleSupervised
from gorillatracker.utils.embedding_generator import generate_embeddings, df_from_predictions


def get_finetuned_vit() -> MoCoWrapper:
    """Load the finetuned ViT-Large + DinoV2 model from its checkpoint.

    The checkpoint was finetuned with SSL and the MoCo loss; training run:
    https://wandb.ai/gorillas/Embedding-VitLarge-MoCo-Face-Sweep/runs/rlemhfix

    Returns:
        The ``MoCoWrapper`` restored by ``load_from_checkpoint``. No data
        module and no W&B run are attached (both are passed as ``None``).
    """
    return MoCoWrapper.load_from_checkpoint(
        checkpoint_path="/workspaces/gorillatracker/models/ssl/moco-accuracy-0.58.ckpt",
        data_module=None,
        wandb_run=None,
    )


def get_mock_loss_kwargs() -> dict:
    """Return a fresh dict of placeholder loss/training keyword arguments.

    The pretrained-model builders in this notebook splat the result into
    ``BaseModuleSupervised(...)``. The trailing comments keep the original
    author's provenance notes: "from the file" marks values copied from a
    config file (not identified here), "default" marks values that file did
    not specify.

    Returns:
        A new dict on every call, so callers may mutate it freely.
    """
    return dict(
        margin=1.0,  # from the file
        s=64.0,  # from the file
        temperature=0.07,  # default, not specified in the file
        memory_bank_size=4096,  # default, not specified in the file
        embedding_size=128,  # from the file
        batch_size=64,  # from the file
        num_classes=None,  # default, not specified in the file
        class_distribution=None,  # default, not specified in the file
        use_focal_loss=False,  # default, not specified in the file
        k_subcenters=1,  # default, not specified in the file
        accelerator="cuda",  # from the file
        label_smoothing=0.1,  # default, not specified in the file
        l2_alpha=0.1,  # from the file
        l2_beta=0.01,  # from the file
        path_to_pretrained_weights="",  # from the file
        use_class_weights=False,  # default, not specified in the file
        use_dist_term=False,  # default, not specified in the file
    )


def get_pretrained_vit() -> BaseModuleSupervised:
    """Build the pretrained (not finetuned) ViT-Large + DinoV2 baseline.

    The model is constructed through ``BaseModuleSupervised`` with the
    ``timm_eval/vit_large_patch14_dinov2.lvd142m`` backbone name,
    ``fix_img_size=224`` and ``freeze_backbone=True``. No data module or W&B
    run is attached. The loss-related arguments come from
    ``get_mock_loss_kwargs()``; presumably they only exist to satisfy the
    constructor, since this notebook never trains the model.

    Returns:
        The constructed ``BaseModuleSupervised`` instance. (The annotation
        previously claimed ``TimmEvalWrapper``, which is not what is built.)
    """
    model = BaseModuleSupervised(
        model_name_or_path="timm_eval/vit_large_patch14_dinov2.lvd142m",
        fix_img_size=224,
        freeze_backbone=True,
        wandb_run=None,
        data_module=None,
        loss_mode="offline",
        **get_mock_loss_kwargs(),
    )
    # Alternative kept for reference: build the timm wrapper directly.
    # model = TimmEvalWrapper(
    #     backbone_name="vit_large_patch14_dinov2.lvd142m",
    #     img_size=224,
    # )
    # model.freeze = lambda: None
    return model


def get_pretrained_efnet() -> BaseModuleSupervised:
    """Build the pretrained EfficientNetV2 RW_M (ImageNet V2 1k) baseline.

    The model is constructed through ``BaseModuleSupervised`` with the
    ``timm_eval/efficientnetv2_rw_m`` backbone name and
    ``freeze_backbone=True``. Unlike the ViT builder, no image size is passed
    because EfficientNet does not take ``img_size`` as an argument. The
    loss-related arguments come from ``get_mock_loss_kwargs()``.

    Returns:
        The constructed ``BaseModuleSupervised`` instance. (The annotation
        previously claimed ``TimmEvalWrapper``, which is not what is built.)

    Raises:
        AssertionError: if timm resolves an input size other than
            ``(3, 224, 224)`` for the model.
    """
    model = BaseModuleSupervised(
        model_name_or_path="timm_eval/efficientnetv2_rw_m",
        # Eff Net does not take img_size as an argument
        freeze_backbone=True,
        wandb_run=None,
        data_module=None,
        loss_mode="offline",
        **get_mock_loss_kwargs(),
    )
    # Sanity check that the 224x224 resize used by this notebook matches what
    # timm reports for the model.
    # NOTE(review): `model` here is the BaseModuleSupervised wrapper, not the
    # raw timm backbone. Confirm that timm reads the backbone's pretrained
    # config through it rather than silently falling back to its defaults, in
    # which case this check would pass vacuously.
    c = timm.data.resolve_model_data_config(model)
    assert c["input_size"] == (3, 224, 224), f"unexpected input size: {c['input_size']}"
    # Alternative kept for reference: build the timm wrapper directly.
    # model = TimmEvalWrapper(backbone_name="efficientnetv2_rw_m")
    # model.freeze = lambda: None
    return model


def get_finetuned_efnet() -> Optional[TimmEvalWrapper]:
    """Placeholder for the finetuned EfficientNetV2 RW_M model.

    No finetuned EfficientNet checkpoint is wired in yet, so this always
    returns ``None``; callers must treat ``None`` as "model not available".

    Returns:
        ``None`` for now. The annotation is ``Optional`` so that it matches
        what the function actually returns.
    """
    # EfficientNetV2 RW_M + ImageNet V2 1k; finetuned with ????
    # TODO(liamvdv): add SSL trained effnet model.
    return None


def get_model_transforms(model):
    """Build the input transform for ``model``.

    Args:
        model: Object optionally exposing ``data_resize_transform`` (size
            handed to ``Resize``, default ``(224, 224)``) and
            ``use_normalization`` (default ``True``).

    Returns:
        A bare ``Resize`` when normalization is disabled, otherwise a
        ``Compose`` of that ``Resize`` followed by a ``Normalize`` with fixed
        mean/std constants.
    """
    target_size = getattr(model, "data_resize_transform", (224, 224))
    resize_step = Resize(target_size)

    if not getattr(model, "use_normalization", True):
        return resize_step

    # NOTE(liamvdv): normalization_mean, normalization_std are always default.
    normalize_step = Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    return Compose([resize_step, normalize_step])


def _get_dataloader(model, path: Path):
    """Create the single validation dataloader for the dataset at ``path``.

    Args:
        model: Model whose transforms (see ``get_model_transforms``) are
            applied to the data.
        path: Dataset directory handed to ``NletDataModule`` as ``data_dir``.

    Returns:
        The only dataloader returned by ``val_dataloader()``.

    Raises:
        AssertionError: if the data module yields anything but exactly one
            validation dataloader.
    """
    datamodule_kwargs = dict(
        data_dir=path,
        dataset_class=SupervisedDataset,
        nlet_builder=build_onelet,
        batch_size=64,
        workers=10,
        model_transforms=get_model_transforms(model),
        training_transforms=lambda x: x,  # identity: no training-time augmentation
        dataset_names=["Showcase"],
    )
    data_module = NletDataModule(**datamodule_kwargs)
    data_module.setup("validate")

    # The validation loader is used for its transforms.
    val_loaders = data_module.val_dataloader()
    assert len(val_loaders) == 1
    return val_loaders[0]


def get_df(model, path: Path):
    """Embed the dataset at ``path`` with ``model`` and return the result frame.

    The frame from ``df_from_predictions`` is post-processed so that each
    ``embedding`` cell is a NumPy array built from the per-element ``.item()``
    values, and each ``label`` cell is the plain value from ``.item()``.

    Args:
        model: Model used to generate the embeddings.
        path: Dataset directory.

    Returns:
        The post-processed DataFrame.
    """
    loader = _get_dataloader(model, path)
    predictions = generate_embeddings(model, loader)
    frame = df_from_predictions(predictions)
    # TODO(liamvdv): Should be DF of
    #                id, embedding, label, label_string, input, model, dataset

    frame["embedding"] = frame["embedding"].apply(
        lambda components: np.array([component.item() for component in components])
    )
    frame["label"] = frame["label"].apply(lambda raw_label: raw_label.item())
    return frame

In [None]:
from torchvision.datasets import MNIST
from torchvision.transforms import Resize, Compose, ToTensor, Grayscale, Normalize
from torch.utils.data import DataLoader, Subset
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from PIL import Image


def custom_collate(batch):
    """Collate ``(model_input, original_image, target)`` samples into a batch.

    Args:
        batch: Non-empty sequence of 3-tuples.

    Returns:
        A tuple of: the model inputs stacked with ``torch.stack``, the
        original images as a plain list (left un-stacked), and the targets as
        a tensor.
    """
    inputs, images, labels = zip(*batch)
    stacked_inputs = torch.stack(inputs)
    label_tensor = torch.tensor(labels)
    return stacked_inputs, list(images), label_tensor


def get_mnist_dataloader(batch_size=128, num_samples=2000):
    """Return a dataloader over a stratified sample of the MNIST test split.

    Every item is a triple ``(model_input, original_image, target)``: the
    model input is the digit resized to 224x224, expanded to 3 channels,
    converted to a tensor and normalized; the original image is the resized
    PIL image kept for display.

    Args:
        batch_size: Batch size of the returned loader.
        num_samples: Size of the stratified sample drawn from the test split.

    Returns:
        A non-shuffling ``DataLoader`` using ``custom_collate``. Because it is
        created with ``drop_last=True``, a trailing partial batch is dropped,
        so fewer than ``num_samples`` items are yielded unless ``num_samples``
        is a multiple of ``batch_size``.
    """
    target_size = (224, 224)

    # Pipeline producing the tensor that is fed to the model.
    to_model_input = Compose(
        [
            Resize(target_size),
            Grayscale(3),  # replicate the single channel to 3 channels
            ToTensor(),
            Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # ImageNet normalization
        ]
    )

    # Pipeline producing the image that is stored alongside the embedding.
    to_stored_image = Compose([Resize(target_size)])

    class TransformedMNIST(MNIST):
        """MNIST variant whose items carry both the model input and a display image."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.model_transform = to_model_input
            self.storage_transform = to_stored_image

        def __getitem__(self, index):
            raw_pixels, digit = self.data[index], int(self.targets[index])
            # The transforms operate on PIL images, so convert first.
            pil_image = Image.fromarray(raw_pixels.numpy(), mode="L")
            return self.model_transform(pil_image), self.storage_transform(pil_image), digit

    test_split = TransformedMNIST(root="./data", train=False, download=True)

    # Stratify on the targets so the sample keeps the label distribution; only
    # the "test" half of the split (of size num_samples) is used.
    every_index = list(range(len(test_split)))
    _, chosen_indices = train_test_split(
        every_index, test_size=num_samples, stratify=test_split.targets, random_state=42
    )

    return DataLoader(
        Subset(test_split, chosen_indices),
        batch_size=batch_size,
        shuffle=False,
        drop_last=True,
        collate_fn=custom_collate,
    )


def get_mnist_df(model, batch_size=128, num_samples=2000):
    """Embed a stratified MNIST sample with ``model`` and return it as a DataFrame.

    Args:
        model: Module called on each batch of image tensors; its output must
            support ``.cpu().numpy()``. It is switched to eval mode and, when
            CUDA is available, moved to the GPU via ``model.cuda()``.
        batch_size: Forwarded to ``get_mnist_dataloader``.
        num_samples: Forwarded to ``get_mnist_dataloader``.

    Returns:
        DataFrame with columns ``id``, ``embedding`` (one array per row),
        ``label``, ``label_string`` and ``input`` (the PIL images). The row
        count is the number of samples the loader actually yielded, which can
        be lower than ``num_samples``.

    Raises:
        ValueError: if the dataloader yields no batches at all.
    """
    model.eval()
    dataloader = get_mnist_dataloader(batch_size=batch_size, num_samples=num_samples)

    # Device placement does not change between batches, so decide it once
    # instead of re-checking and re-moving the model on every iteration.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    all_embeddings = []
    all_labels = []
    all_images = []

    with torch.no_grad():
        for model_input, original_image, target in tqdm(dataloader, desc="Generating embeddings"):
            if use_cuda:
                model_input = model_input.cuda()

            embeddings = model(model_input)

            all_embeddings.append(embeddings.cpu().numpy())
            all_labels.append(target.numpy())
            all_images.extend(original_image)  # original_image is already a list of PIL Images

    if not all_embeddings:
        # np.vstack([]) would fail with an opaque message; say what went wrong.
        raise ValueError(
            f"MNIST dataloader yielded no batches (num_samples={num_samples}, "
            f"batch_size={batch_size}); nothing to embed."
        )

    all_embeddings = np.vstack(all_embeddings)
    all_labels = np.concatenate(all_labels)

    # Count what was actually produced rather than trusting num_samples.
    num_rows = len(all_labels)

    df = pd.DataFrame(
        {
            "id": range(num_rows),
            "embedding": list(all_embeddings),
            "label": all_labels,
            "label_string": [str(label) for label in all_labels],
            "input": all_images,  # Store the actual PIL Image objects
        }
    )
    return df

In [None]:
import pandas as pd
import numpy as np
from PIL import Image


def generate_synthetic_dataset(num_clusters, points_per_cluster, embedding_size=256, image_size=(224, 224), seed=None):
    """Generate a synthetic embedding dataset made of Gaussian clusters.

    For every cluster a center is drawn from a standard normal, then
    ``points_per_cluster`` points are drawn as standard-normal offsets around
    that center. Each point is paired with a random-noise grayscale image.

    Args:
        num_clusters: Number of clusters; the cluster index is the label.
        points_per_cluster: Number of points generated per cluster.
        embedding_size: Dimensionality of every embedding.
        image_size: Shape of the random ``uint8`` array behind each image.
        seed: Seed for ``np.random.default_rng``; ``None`` gives fresh entropy.

    Returns:
        DataFrame with columns ``id``, ``embedding``, ``label``,
        ``label_string`` and ``input``.
    """
    rng = np.random.default_rng(seed)

    embeddings, labels, images = [], [], []

    # The order of the RNG draws (center, offsets, then one image per point)
    # determines the output for a given seed, so it must stay as is.
    for cluster_id in range(num_clusters):
        centroid = rng.standard_normal(embedding_size)

        # Unit standard deviation per dimension around the centroid.
        offsets = rng.standard_normal((points_per_cluster, embedding_size))
        cluster_points = offsets + centroid

        embeddings += list(cluster_points)
        labels += [cluster_id] * points_per_cluster

        # Placeholder images: pure noise, unrelated to the embeddings.
        images += [
            Image.fromarray(rng.integers(0, 256, image_size, dtype=np.uint8), "L")
            for _ in range(points_per_cluster)
        ]

    return pd.DataFrame(
        {
            "id": range(len(labels)),
            "embedding": list(embeddings),
            "label": labels,
            "label_string": [str(cluster_label) for cluster_label in labels],
            "input": images,
        }
    )

In [None]:
# Only the "Actual Datasets" loop below consults this flag; get_mnist_df picks
# its own device (CUDA whenever it is available).
on_cpu = True

# Display name -> zero-argument factory. A factory may return None when the
# model is not available yet.
models = {
    "ViT-Pretrained": get_pretrained_vit,
    "ViT-Finetuned": get_finetuned_vit,
    "EfN-Pretrained": get_pretrained_efnet,
    "EfN-Finetuned": get_finetuned_efnet,
}

# TODO(liamvdv): @robert: why filtered? What are the dataset stats based on?
BRISTOL = Path(
    "/workspaces/gorillatracker/data/supervised/bristol/cross_encounter_validation/cropped_frames_square_filtered"
)
SPAC = Path("/workspaces/gorillatracker/data/supervised/cxl_all/face_images_square")
datasets = {
    "Bristol": BRISTOL,
    "SPAC": SPAC,
}
dfs = []

# Fixed seed so the synthetic sanity-check data (and therefore merged.pkl) is
# reproducible across runs; the two synthetic sets use distinct seeds so they
# do not share a random stream.
SYNTHETIC_SEED = 42

# Testing Datasets
# MNIST through both pretrained backbones. `m` and `df` are kept as
# module-level names so later cells that read them keep working.
for m in ("ViT-Pretrained", "EfN-Pretrained"):
    df = get_mnist_df(models[m]())
    df["dataset"] = "MNIST"
    df["model"] = m
    dfs.append(df)

# c - clusters, n - points per cluster
testS = generate_synthetic_dataset(20, 20, seed=SYNTHETIC_SEED)
testS["dataset"] = "Synthetic 20c 20n"
testS["model"] = "Synthetic"
dfs.append(testS)

testL = generate_synthetic_dataset(200, 10, seed=SYNTHETIC_SEED + 1)
testL["dataset"] = "Synthetic 200c 10n"
testL["model"] = "Synthetic"
dfs.append(testL)

# Actual Datasets
# A fresh model is built for every (model, dataset) pair, so no state can
# carry over from one dataset to the next.
for model_name, get_model in models.items():
    for dataset_name, dataset_path in datasets.items():
        print("Model:", model_name, "| Dataset:", dataset_name, end=" ")
        model = get_model()
        # Explicit None check: truthiness of a module object is not a reliable
        # "is this model implemented" signal.
        if model is None:
            print("Skipping model: Model not yet implemented.")
            continue
        if on_cpu:
            model = model.cpu()

        df = get_df(model, dataset_path)
        df["dataset"] = dataset_name
        df["model"] = model_name
        print("| Done. Appending", len(df), "rows. Embedding Size:", df["embedding"].iloc[0].shape)
        dfs.append(df)

        # Cleanup
        del model  # and?: torch.cuda.empty_cache()

merged_df = pd.concat(dfs, ignore_index=True)
merged_df.to_pickle("merged.pkl")
print("done")
# vitf_spac = merged_df[(merged_df['model'] == 'ViT-Finetuned') & (merged_df['dataset'] == 'SPAC')]