In [1]:
!pip install -U accelerate
!pip install -U transformers
!pip install neptune

Collecting accelerate
  Downloading accelerate-1.3.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.3.0-py3-none-any.whl (336 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.6/336.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.2.1
    Uninstalling accelerate-1.2.1:
      Successfully uninstalled accelerate-1.2.1
Successfully installed accelerate-1.3.0
Collecting transformers
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Foun

In [2]:
import os
import random
import json
import shutil
import copy
import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import Adam
from torchvision import models, datasets, transforms
from torchvision.models import ResNet34_Weights
import timm
from torch.utils.data import DataLoader
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import ViTModel, ViTConfig, ViTForImageClassification
from transformers import BertForSequenceClassification, BertTokenizer, BertModel
from transformers import AutoModel, AutoImageProcessor, AutoModelForImageClassification
from transformers import Dinov2ForImageClassification
import gc
import neptune as neptune
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
!unzip "/content/gdrive/My Drive/data_full.zip"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: data/images/tshirts_and_tops/5a2ea12ec1954bbe9cecac8de520a35b.jpg  
  inflating: __MACOSX/data/images/tshirts_and_tops/._5a2ea12ec1954bbe9cecac8de520a35b.jpg  
  inflating: data/images/tshirts_and_tops/858b324677ef4495aa2dca0a32255397.jpg  
  inflating: __MACOSX/data/images/tshirts_and_tops/._858b324677ef4495aa2dca0a32255397.jpg  
  inflating: data/images/tshirts_and_tops/69b802f6bbb64ab484561cca670ca21f.jpg  
  inflating: __MACOSX/data/images/tshirts_and_tops/._69b802f6bbb64ab484561cca670ca21f.jpg  
  inflating: data/images/tshirts_and_tops/26f32a520a1a497abb9d307c4d92d026.jpg  
  inflating: __MACOSX/data/images/tshirts_and_tops/._26f32a520a1a497abb9d307c4d92d026.jpg  
  inflating: data/images/tshirts_and_tops/9ef4f9646e5d4ea5a3ebea090c26e450.jpg  
  inflating: __MACOSX/data/images/tshirts_and_tops/._9ef4f9646e5d4ea5a3ebea090c26e450.jpg  
  inflating: data/images/tshirts_and_tops/527a458065b2434ebaaeac6faceb

In [5]:
DEFAULT_RANDOM_SEED = 42

def seedBasic(seed=DEFAULT_RANDOM_SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

# torch random seed
import torch
def seedTorch(seed=DEFAULT_RANDOM_SEED):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# basic + torch
def seedEverything(seed=DEFAULT_RANDOM_SEED):
    seedBasic(seed)
    seedTorch(seed)

In [6]:
seedEverything()

# Load class names from the classes file
with open('data/meta/classes.txt') as f:
    classes = [line.strip() for line in f.readlines()]

# Load train and test splits
with open('data/meta/train.json') as f:
    train_data = json.load(f)
with open('data/meta/test.json') as f:
    test_data = json.load(f)

In [7]:
seedEverything()

train_samples = []
test_samples = []
for cls in classes:
    train_samples.extend([(f'data/images/{x}', x[0:x.find('/')]) for x in train_data[cls]])
    test_samples.extend([(f'data/images/{x}', x[0:x.find('/')]) for x in test_data[cls]])
print(f'Train size: {len(train_samples)}')
print(f'Test size: {len(test_samples)}')
print(train_samples[0:5])

Train size: 99000
Test size: 33000
[('data/images/coats/1d260114f0df489b9de8c5cd81d6f26c.jpg', 'coats'), ('data/images/coats/94a8e251345643e2ba10386f26a42eae.jpg', 'coats'), ('data/images/coats/87d66645ed094890af355c8d099580a2.jpg', 'coats'), ('data/images/coats/ee294abae3d646a8b2afdea3e701eb3a.jpg', 'coats'), ('data/images/coats/0f4515c7013f43558d31dfd3a87541d7.jpg', 'coats')]


In [8]:
model_name = "dinov2"
project_name = "HSE-MDS-Kofman-Anna-Diploma"
api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI4MmUyOTZjYy1mNzFjLTQ4YzUtYjk4Yi1hZmIxMTk5OWYwMDgifQ=="

In [9]:
seedEverything()

transform_dinov2 = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define the dataset class with image IDs
class CustomZalandoDataset(torch.utils.data.Dataset):
    def __init__(self, samples, transform=None):
        self.samples = [s for s in samples if os.path.splitext(s[0])[1].lower() in [".jpg", ".jpeg", ".png"]]
        self.transform = transform

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        image_path, label = self.samples[idx]
        label_index = classes.index(label)
        image = datasets.folder.default_loader(image_path)
        if self.transform:
            image = self.transform(image)
        image_id = os.path.basename(image_path)
        return image, label_index, image_id

# Initialize the datasets
train_dataset = CustomZalandoDataset(train_samples, transform=transform_dinov2)
test_dataset = CustomZalandoDataset(test_samples, transform=transform_dinov2)

# Split the train dataset into training and validation subsets
total_train_count = len(train_dataset)
val_count = int(total_train_count * 0.1)  # 10% for validation
train_count = total_train_count - val_count
current_train, current_val = torch.utils.data.random_split(train_dataset, [train_count, val_count], generator=torch.Generator().manual_seed(42))

# Data loaders for train, validation, and test sets
train_loader = DataLoader(current_train, batch_size=16, shuffle=True)
val_loader = DataLoader(current_val, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Print dataset sizes
print(f"Size of the train dataset before the split: {total_train_count}")
print(f"Size of the val dataset: {val_count}")
print(f"Size of the train dataset after the split: {train_count}")
print(f"Size of the test dataset: {len(test_dataset)}")

Size of the train dataset before the split: 98995
Size of the val dataset: 9899
Size of the train dataset after the split: 89096
Size of the test dataset: 33000


In [10]:
seedEverything()

def extract_and_save_embeddings_with_ids(model, model_name, loaders, embedding_save_path="embeddings_all.npz"):
    model.eval()
    all_embeddings = []
    all_labels = []
    all_sources = []
    all_ids = []

    total_batches = sum(len(loader) for loader in loaders.values())

    with torch.no_grad():
        progress = tqdm(total=total_batches, desc=f"Extracting embeddings for {model_name}", leave=True)

        for split_name, loader in loaders.items():
            embeddings = []
            labels_list = []
            ids_list = []

            for batch in loader:
                inputs, labels, batch_ids = batch
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)  # Forward pass

                # Extract embeddings (e.g., logits for certain models)
                if model_name in ["ViT"]:
                    embeddings_batch = outputs.logits.cpu().numpy()
                else:
                    embeddings_batch = outputs.cpu().numpy()

                embeddings.extend(embeddings_batch)
                labels_list.extend(labels.cpu().numpy())
                ids_list.extend(batch_ids)
                all_sources.extend([split_name] * len(labels))

                progress.update(1)

            embeddings = np.array(embeddings)
            labels_list = np.array(labels_list)

            # Append to global lists
            all_embeddings.append(embeddings)
            all_labels.append(labels_list)
            all_ids.extend(ids_list)

        progress.close()

    # Concatenate embeddings and labels from all splits
    all_embeddings = np.concatenate(all_embeddings, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    # Save embeddings, labels, IDs, and sources locally
    np.savez(embedding_save_path, embeddings=all_embeddings, labels=all_labels, sources=np.array(all_sources), ids=np.array(all_ids))
    print(f"Saved all embeddings with IDs at {embedding_save_path}")

    # Log to Neptune
    run = neptune.init_run(project=project_name, api_token=api_token)
    run[f"embeddings/{model_name}/path"].upload(embedding_save_path)
    run[f"embeddings/{model_name}/size"] = all_embeddings.shape
    run[f"embeddings/{model_name}/labels_size"] = all_labels.shape
    run[f"embeddings/{model_name}/splits"] = list(loaders.keys())
    run[f"embeddings/{model_name}/ids_size"] = len(all_ids)
    run.stop()

    return all_embeddings, all_labels, all_sources, all_ids

In [11]:
seedEverything()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Load the DINOv2Classifier model
num_classes = 6
hidden_size = 384  # Hidden size for `dinov2_vits14`

# Step 1: Load DINOv2 backbone
backbone = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
for param in backbone.parameters():
    param.requires_grad = False  # Freeze backbone parameters

class DINOv2Classifier(nn.Module):
    def __init__(self, backbone, hidden_size, num_classes):
        super(DINOv2Classifier, self).__init__()
        self.backbone = backbone
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Pass through the backbone
        embeddings = self.backbone(x)
        # Pass through the classification head
        logits = self.classifier(embeddings)
        return logits

# Step 2: Create the DINOv2Classifier model
dino_v2 = DINOv2Classifier(backbone=backbone, hidden_size=hidden_size, num_classes=num_classes)
dino_v2 = dino_v2.to(device)

# Load saved weights
model_path = f"{model_name}.pth"
dino_v2.load_state_dict(torch.load(model_path, map_location=device))
print(f"Loaded weights from {model_path}")
dino_v2.eval()  # Set the model to evaluation mode

# Define loaders for all splits (ensure these are correctly set up)
loaders = {
    "train": train_loader,
    "val": val_loader,
    "test": test_loader
}

# Extract embeddings
dino_v2_embeddings, dino_v2_labels, dino_v2_sources, dino_v2_ids = extract_and_save_embeddings_with_ids(
    model=dino_v2,
    model_name=model_name,
    loaders=loaders,
    embedding_save_path=f"{model_name}_embeddings.npz"
)

Device: cuda


Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth
100%|██████████| 84.2M/84.2M [00:00<00:00, 194MB/s]


Loaded weights from dinov2.pth


Extracting embeddings for dinov2: 100%|██████████| 8251/8251 [2:19:05<00:00,  1.01s/it]


Saved all embeddings with IDs at dinov2_embeddings.npz




[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/kofmanya/HSE-MDS-Kofman-Anna-Diploma/e/HSEM-12
[neptune] [info   ] Shutting down background jobs, please wait a moment...


        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.ai/help/value_of_unsupported_type
        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.ai/help/value_of_unsupported_type


[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 2 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 2 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/kofmanya/HSE-MDS-Kofman-Anna-Diploma/e/HSEM-12/metadata


In [None]:
seedEverything()

def save_and_log_transformations(transform, model_name, save_path="transformations.json"):
    transform_dict = []
    for t in transform.transforms:
        if isinstance(t, transforms.Resize):
            transform_dict.append({"name": "Resize", "params": {"size": t.size}})
        elif isinstance(t, transforms.RandomHorizontalFlip):
            transform_dict.append({"name": "RandomHorizontalFlip", "params": {"p": t.p}})
        elif isinstance(t, transforms.RandomRotation):
            transform_dict.append({"name": "RandomRotation", "params": {"degrees": t.degrees}})
        elif isinstance(t, transforms.ColorJitter):
            transform_dict.append({
                "name": "ColorJitter",
                "params": {
                    "brightness": t.brightness,
                    "contrast": t.contrast,
                    "saturation": t.saturation,
                    "hue": t.hue
                }
            })
        elif isinstance(t, transforms.Normalize):
            transform_dict.append({"name": "Normalize", "params": {"mean": t.mean, "std": t.std}})
        elif isinstance(t, transforms.ToTensor):
            transform_dict.append({"name": "ToTensor", "params": None})
        else:
            transform_dict.append({"name": type(t).__name__, "params": None})

    with open(save_path, "w") as f:
        json.dump(transform_dict, f, indent=4)
    print(f"Transformations saved for {model_name} at {save_path}")

    # Log transformations to Neptune
    run = neptune.init_run(project=project_name, api_token=api_token)
    run[f"transformations/{model_name}/path"].upload(save_path)
    run[f"transformations/{model_name}/details"] = transform_dict
    run.stop()

In [13]:
seedEverything()

# Save and log transformations for DINOv2
save_and_log_transformations(
    transform=transform_dinov2,
    model_name=model_name,
    save_path=f"{model_name}_transformations.json"
)

Transformations saved for dinov2 at dinov2_transformations.json
[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/kofmanya/HSE-MDS-Kofman-Anna-Diploma/e/HSEM-13
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 1 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/kofmanya/HSE-MDS-Kofman-Anna-Diploma/e/HSEM-13/metadata
