# Fine Tune DINOv2 for Domain Knowledge

In [1]:
import torchrl.trainers

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from model import DINOLinearClassifier
from load_data import *
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, confusion_matrix
from transformers import AutoImageProcessor, AutoModel, TrainingArguments
import torch
import torch
import torch.nn as nn
from torch.nn.functional import cross_entropy
import torch.optim as optim
import torchvision
from dataclasses import dataclass
from PIL import Image
import requests
from time import perf_counter


# Helper functions
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

def forward_pass_with_label(batch):
    # Place all input tensors on the same device as the model
    inputs = {k:v.to(device) for k,v in batch.items()
              if k in tokenizer.model_input_names}

    with torch.no_grad():
        output = DINOLinearClassifier(**inputs)
        pred_label = torch.argmax(output.logits, axis=-1)
        loss = cross_entropy(output.logits, batch["label"].to(device),
                             reduction="none")
    # Place outputs on CPU for compatibility with other dataset columns
    return {"loss": loss.cpu().numpy(),
            "predicted_label": pred_label.cpu().numpy()}


def time_pipeline(self, query):
    latencies = []
    # Warmup
    for _ in range(10):
        _ = self.pipeline(query)
    # Timed run
    for _ in range(100):
        start_time = perf_counter()
        _ = self.pipeline(query)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    print(f"Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f}")
    return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

def plot_confusion_matrix(y_preds, y_true, labels):
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()


In [27]:
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DINOLinearClassifier(num_classes=2)
processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
# model = AutoModel.from_pretrained('facebook/dinov2-base')
model.to(device)

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

DINOLinearClassifier(
  (model): DinoVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (blocks): ModuleList(
      (0-23): 24 x NestedTensorBlock(
        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (attn): MemEffAttention(
          (qkv): Linear(in_features=1024, out_features=3072, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): LayerScale()
        (drop_path1): Identity()
        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (drop): Dropout(p=0.0, inplace=Fal

In [28]:
%run finetune.py

Loading data...


AttributeError: 'DINOLinearClassifier' object has no attribute 'transformer'

In [None]:
# Freeze the model parameters
for param in model.parameters():
    param.requires_grad = False
in_channel = model.fc.in_features
model.fc = nn.Linear(in_channel, 2) # 2 classes

# Set the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define transformations to the data inputs
data_transforms = {
    "train": 
        torchvision.transforms.Compose([
            transforms.Resize(size=(196,196), interpolation=transforms.InterpolationMode.BICUBIC),
            torchvision.transforms.RandomHorizontalFlip(),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ]),
    "validation": 
        torchvision.transforms.Compose([
            torchvision.transforms.Resize((196, 196)),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
}

print("Loading data...")
train_dataset = GeoPACHAImageDataset(data_file="/workspace/geopacha/jn/Train.csv",
                            transform=data_transforms["train"])
validation_dataset = GeoPACHAImageDataset(data_file="/workspace/geopacha/jn/Train.csv",
                            transform=data_transforms["validation"])

dataloaders = {
    "train": torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                            num_workers=4),
    "validation": torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=True,
                                            num_workers=4)
}

trainiter = iter(dataloaders["train"])
validationiter = iter(dataloaders["validation"])

logging_steps = len(train_dataset) // batch_size 
model_name = f"dinov2_vitl14-finetuned" 

config = TrainingConfig(epochs=2, batch_size=batch_size, learning_rate=2e-5)

training_args = TrainingArguments(output_dir="./output",
                                #output_dir=model_name,
                                num_train_epochs=2,
                                learning_rate=2e-5,
                                per_device_train_batch_size=batch_size,
                                per_device_eval_batch_size=batch_size,
                                weight_decay=0.01,
                                evaluation_strategy="epoch",
                                disable_tqdm=False,
                                logging_steps=logging_steps,
                                push_to_hub=True,
                                log_level="error")


trainer = Trainer(model=model, args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=dataset["train"],
                eval_dataset=dataset["validation"])

model.to(config.device)
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

print("Training...")                
# Training and validation loop
for epoch in range(config.epochs):
    model.train()
    train_loss = 0
    for batch in dataloaders["train"]:
        inputs, labels = batch
        inputs, labels = inputs.to(config.device), labels.to(config.device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = cross_entropy(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation step
    model.eval()
    val_loss = 0
    val_accuracy = []
    with torch.no_grad():
        for batch in dataloaders["validation"]:
            inputs, labels = batch
            inputs, labels = inputs.to(config.device), labels.to(config.device)
            outputs = model(inputs)
            loss = cross_entropy(outputs, labels)
            val_loss += loss.item()

            # Calculate accuracy
            predictions = outputs.argmax(dim=1, keepdim=True)
            correct = predictions.eq(labels.view_as(predictions)).sum().item()
            val_accuracy.append(correct / len(labels))

    # Calculate average losses and accuracy
    avg_train_loss = train_loss / len(dataloaders["train"])
    avg_val_loss = val_loss / len(dataloaders["validation"])
    avg_val_accuracy = sum(val_accuracy) / len(val_accuracy)

    print(f'Epoch {epoch+1}/{config.epochs}, Train Loss: {avg_train_loss:.4f}, '
        f'Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_accuracy:.4f}')

print("Training complete!")

preds_output = trainer.predict(dataset["validation"])
preds_output.metrics
y_preds = np.argmax(preds_out_size
put.predictions, axis=1)
plot_confusion_matrix(y_preds, y_valid, labels)

# Convert dataset back to PyTorch tensors
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# Compute loss values
dataset["validation"] = dataset["validation"].map(
    forward_pass_with_label, batched=True, batch_size=16)

dataset.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
df_test = dataset["validation"][:][cols]

def label_int2str(row):
    return df_test["train"].features["label"].int2str(row)

df_test["label"] = df_test["label"].apply(label_int2str)
df_test["predicted_label"] = (df_test["predicted_label"]
                            .apply(label_int2str))
df_test.sort_values("loss", ascending=True).head(10)

save_model_path = "dinov2_vitl14-finetuned.pth"
torch.save(model.state_dict(), save_model_path)


In [38]:
from finetune2 import *

In [29]:
# reload model.py
%run model.py

In [40]:
train_config = TrainingConfig()

data_transforms = {
    "train": 
        torchvision.transforms.Compose([
            transforms.Resize(size=(196,196), interpolation=transforms.InterpolationMode.BICUBIC, antialias=True),
            torchvision.transforms.RandomHorizontalFlip(),
            torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ]),
    "validation": 
        torchvision.transforms.Compose([
            torchvision.transforms.Resize(size=(196, 196), antialias=True),
            torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
}

print("Loading data...")
train_dataset = GeoPACHAImageDataset(data_file="/workspace/geopacha/jn/Train.csv",
                            transform=data_transforms["train"])
validation_dataset = GeoPACHAImageDataset(data_file="/workspace/geopacha/jn/Train.csv",
                            transform=data_transforms["validation"])
dataloaders = {
    "train": torch.utils.data.DataLoader(train_dataset, batch_size=train_config.batch_size, shuffle=True, num_workers=4),
    "validation": torch.utils.data.DataLoader(validation_dataset, batch_size=train_config.batch_size, shuffle=True, num_workers=4)
}
print("Data loaded successfully!")
dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
model = DINOLinearClassifier(model=dinov2_vitl14, num_classes=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# freeze and unfreeze specific layers
for param in model.parameters():
    param.requires_grad = False
for param in model.transformer.patch_embed.parameters():
    param.requires_grad = True
for param in model.classifier.parameters():
    param.requires_grad = True
in_channel = model.fc.in_features
model.fc = nn.Linear(in_channel, 2)
# model = initialize_model(train_config)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=train_config.learning_rate)
train_config.logging_steps = len(train_dataset) // train_config.batch_size
print("Training model...")

Loading data...
Data loaded successfully!


Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main


Training model...


In [41]:
model.train()

DINOLinearClassifier(
  (transformer): DinoVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (blocks): ModuleList(
      (0-23): 24 x NestedTensorBlock(
        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (attn): MemEffAttention(
          (qkv): Linear(in_features=1024, out_features=3072, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): LayerScale()
        (drop_path1): Identity()
        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (drop): Dropout(p=0.0, inpla

In [43]:
for batch in dataloaders["train"]:
        inputs, labels = batch[0].to(device), batch[1].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)

  warn(
  warn(
  warn(
  warn(


In [44]:
loss

tensor(0.7201, device='cuda:0', grad_fn=<NllLossBackward0>)

In [33]:
x = model.transformer(inputs)
print(x.shape)  # 

torch.Size([64, 1024])


In [34]:
print(x)

tensor([[-2.3060,  0.1102,  0.7062,  ...,  1.2252, -0.3764,  0.3686],
        [-2.8179,  0.4306, -1.0216,  ...,  0.0432, -1.0885,  1.1637],
        [-1.9625,  0.9733, -0.0866,  ...,  0.4928, -0.2759,  1.3728],
        ...,
        [-0.7031,  1.4322,  0.8492,  ..., -1.1186, -0.9884, -1.0179],
        [-0.0903, -1.0424,  0.0894,  ...,  1.3567,  0.2269,  0.7433],
        [-1.9105,  0.7407,  1.5410,  ...,  0.6182,  0.1354, -0.3090]],
       device='cuda:0', grad_fn=<SelectBackward0>)


In [35]:
x = model.transformer.norm(x)
print(x.shape) 
print(x)

torch.Size([64, 1024])
tensor([[-1.9767e+00, -2.6672e-03,  1.3012e+00,  ...,  1.0498e+00,
         -5.0951e-01,  8.2622e-01],
        [-2.3909e+00,  3.0167e-01, -4.4963e-01,  ..., -3.3164e-01,
         -1.2156e+00,  2.3932e+00],
        [-1.5876e+00,  8.3849e-01,  5.0144e-01,  ...,  1.9505e-01,
         -3.8440e-01,  2.8392e+00],
        ...,
        [-4.2304e-01,  1.3032e+00,  1.4562e+00,  ..., -1.6297e+00,
         -1.0962e+00, -1.9340e+00],
        [ 1.2658e-01, -1.1701e+00,  6.7833e-01,  ...,  1.2651e+00,
          1.2955e-01,  1.6642e+00],
        [-1.6253e+00,  6.3244e-01,  2.1818e+00,  ...,  3.5243e-01,
          2.1357e-02, -5.7369e-01]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward0>)


In [36]:
x = model.classifier(x)
print(x.shape) 
print(x)

torch.Size([64, 2])
tensor([[ 0.2109, -0.8287],
        [ 0.4512, -0.9362],
        [-0.2077, -0.6444],
        [ 0.1354, -0.7096],
        [ 0.3738, -1.1583],
        [-0.0447, -0.6878],
        [ 0.4727, -0.9389],
        [ 0.1762, -1.4110],
        [ 0.2367,  0.0023],
        [-0.2615, -0.8846],
        [ 0.1196, -0.4694],
        [ 0.1132, -0.7432],
        [ 0.2245, -0.9475],
        [ 0.0714, -0.7980],
        [ 0.3627, -0.8589],
        [-0.4030, -0.5342],
        [-0.0837, -0.6425],
        [ 0.0406, -0.8466],
        [ 0.2901, -1.1164],
        [ 0.5875, -0.4292],
        [ 0.4975, -1.3442],
        [ 0.0809, -0.4946],
        [-0.0276, -0.3209],
        [ 0.4913, -0.0383],
        [ 0.0156, -0.7508],
        [ 0.1262, -0.2842],
        [ 0.2187, -0.4185],
        [ 0.6525, -0.9659],
        [ 0.2354, -0.3593],
        [ 0.4040, -0.2739],
        [ 0.3303, -0.8400],
        [ 0.4357, -0.8584],
        [ 0.0700, -0.6281],
        [ 0.2209, -0.9860],
        [ 0.2395, -0.6596],


In [37]:
total_loss, total_accuracy = 0, 0
preds_list = [] # store predictions
label_ids_list = [] # store labels
for batch in dataloaders["train"]:
    inputs, labels = batch[0].to(train_config.device), batch[1].to(train_config.device)
    optimizer.zero_grad()
    print(model)
    outputs = model(inputs.float())
    print(outputs)
    print(type(outputs))
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    preds = outputs.argmax(dim=1)
    preds_list.extend(preds.cpu().numpy())
    label_ids_list.extend(labels.cpu().numpy())
    total_accuracy += (preds == labels).float().mean().item()
avg_loss = total_loss / len(dataloaders["train"])
avg_accuracy = total_accuracy / len(dataloaders["train"])
print(f"Training Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}")

  warn(
  warn(
  warn(
  warn(


DINOLinearClassifier(
  (transformer): DinoVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (blocks): ModuleList(
      (0-23): 24 x NestedTensorBlock(
        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (attn): MemEffAttention(
          (qkv): Linear(in_features=1024, out_features=3072, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): LayerScale()
        (drop_path1): Identity()
        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (drop): Dropout(p=0.0, inpla

In [12]:
(preds == labels).float().mean().item()

0.4193548262119293