DINO for SECO
- Resnet 18

In [4]:
import copy
import torch
import torchvision
from torch import nn
import sys 
sys.path.insert(0, '/home/akansh-i2sc/Desktop/Study/HLCV/Why-Self-Supervision-in-Time/src/modules')
from lightly.loss import DINOLoss
from lightly.models.modules import DINOProjectionHead
from lightly.models.utils import deactivate_requires_grad, update_momentum
from lightly.transforms.dino_transform import DINOTransform
from lightly.utils.scheduler import cosine_schedule
import seco_dataset_temporal as seco
import numpy as np
import matplotlib.pyplot as plt
import glob
from tqdm.auto import tqdm
from lightly.data import LightlyDataset

In [5]:
class DINO(torch.nn.Module):
    def __init__(self, backbone, input_dim):
        super().__init__()
        self.student_backbone = backbone
        self.student_head = DINOProjectionHead(
            input_dim, 512, 64, 2048, freeze_last_layer=1
        )
        self.teacher_backbone = copy.deepcopy(backbone)
        self.teacher_head = DINOProjectionHead(input_dim, 512, 64, 2048)
        deactivate_requires_grad(self.teacher_backbone)
        deactivate_requires_grad(self.teacher_head)

    def forward(self, x):
        y = self.student_backbone(x).flatten(start_dim=1)
        z = self.student_head(y)
        return z

    def forward_teacher(self, x):
        y = self.teacher_backbone(x).flatten(start_dim=1)
        z = self.teacher_head(y)
        return z

resnet = torchvision.models.resnet18()
backbone = nn.Sequential(*list(resnet.children())[:-1])
input_dim = 512

In [6]:
model = DINO(backbone, input_dim)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device);

In [7]:
transform = DINOTransform()
# path2data =  "/home/akansh-i2sc/Desktop/Study/HLCV/SeCo_dataset/seco_100k/seasonal_contrast_100k/"
# seco_dataset = seco.SeasonalContrastBasic(root=path2data, transform=transform, target_transform=0)
path2data = "/home/akansh-i2sc/Desktop/Study/HLCV/SeCo_dataset/seco_100k/jpeg_40k/"

seco_dataset = LightlyDataset(input_dir=path2data, transform=transform)
dataloader = torch.utils.data.DataLoader(
    seco_dataset,
    batch_size=64,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

In [15]:
torch.save(model.teacher_backbone.state_dict(), "teacher_backbone.pth")

In [18]:
model_new = torchvision.models.resnet18(weights=None)

In [21]:
model_new.load_state_dict(torch.load("teacher_backbone.pth"))

RuntimeError: Error(s) in loading state_dict for ResNet:
	Missing key(s) in state_dict: "conv1.weight", "bn1.weight", "bn1.bias", "bn1.running_mean", "bn1.running_var", "layer1.0.conv1.weight", "layer1.0.bn1.weight", "layer1.0.bn1.bias", "layer1.0.bn1.running_mean", "layer1.0.bn1.running_var", "layer1.0.conv2.weight", "layer1.0.bn2.weight", "layer1.0.bn2.bias", "layer1.0.bn2.running_mean", "layer1.0.bn2.running_var", "layer1.1.conv1.weight", "layer1.1.bn1.weight", "layer1.1.bn1.bias", "layer1.1.bn1.running_mean", "layer1.1.bn1.running_var", "layer1.1.conv2.weight", "layer1.1.bn2.weight", "layer1.1.bn2.bias", "layer1.1.bn2.running_mean", "layer1.1.bn2.running_var", "layer2.0.conv1.weight", "layer2.0.bn1.weight", "layer2.0.bn1.bias", "layer2.0.bn1.running_mean", "layer2.0.bn1.running_var", "layer2.0.conv2.weight", "layer2.0.bn2.weight", "layer2.0.bn2.bias", "layer2.0.bn2.running_mean", "layer2.0.bn2.running_var", "layer2.0.downsample.0.weight", "layer2.0.downsample.1.weight", "layer2.0.downsample.1.bias", "layer2.0.downsample.1.running_mean", "layer2.0.downsample.1.running_var", "layer2.1.conv1.weight", "layer2.1.bn1.weight", "layer2.1.bn1.bias", "layer2.1.bn1.running_mean", "layer2.1.bn1.running_var", "layer2.1.conv2.weight", "layer2.1.bn2.weight", "layer2.1.bn2.bias", "layer2.1.bn2.running_mean", "layer2.1.bn2.running_var", "layer3.0.conv1.weight", "layer3.0.bn1.weight", "layer3.0.bn1.bias", "layer3.0.bn1.running_mean", "layer3.0.bn1.running_var", "layer3.0.conv2.weight", "layer3.0.bn2.weight", "layer3.0.bn2.bias", "layer3.0.bn2.running_mean", "layer3.0.bn2.running_var", "layer3.0.downsample.0.weight", "layer3.0.downsample.1.weight", "layer3.0.downsample.1.bias", "layer3.0.downsample.1.running_mean", "layer3.0.downsample.1.running_var", "layer3.1.conv1.weight", "layer3.1.bn1.weight", "layer3.1.bn1.bias", "layer3.1.bn1.running_mean", "layer3.1.bn1.running_var", "layer3.1.conv2.weight", "layer3.1.bn2.weight", "layer3.1.bn2.bias", "layer3.1.bn2.running_mean", "layer3.1.bn2.running_var", "layer4.0.conv1.weight", "layer4.0.bn1.weight", "layer4.0.bn1.bias", "layer4.0.bn1.running_mean", "layer4.0.bn1.running_var", "layer4.0.conv2.weight", "layer4.0.bn2.weight", "layer4.0.bn2.bias", "layer4.0.bn2.running_mean", "layer4.0.bn2.running_var", "layer4.0.downsample.0.weight", "layer4.0.downsample.1.weight", "layer4.0.downsample.1.bias", "layer4.0.downsample.1.running_mean", "layer4.0.downsample.1.running_var", "layer4.1.conv1.weight", "layer4.1.bn1.weight", "layer4.1.bn1.bias", "layer4.1.bn1.running_mean", "layer4.1.bn1.running_var", "layer4.1.conv2.weight", "layer4.1.bn2.weight", "layer4.1.bn2.bias", "layer4.1.bn2.running_mean", "layer4.1.bn2.running_var", "fc.weight", "fc.bias". 
	Unexpected key(s) in state_dict: "0.weight", "1.weight", "1.bias", "1.running_mean", "1.running_var", "1.num_batches_tracked", "4.0.conv1.weight", "4.0.bn1.weight", "4.0.bn1.bias", "4.0.bn1.running_mean", "4.0.bn1.running_var", "4.0.bn1.num_batches_tracked", "4.0.conv2.weight", "4.0.bn2.weight", "4.0.bn2.bias", "4.0.bn2.running_mean", "4.0.bn2.running_var", "4.0.bn2.num_batches_tracked", "4.1.conv1.weight", "4.1.bn1.weight", "4.1.bn1.bias", "4.1.bn1.running_mean", "4.1.bn1.running_var", "4.1.bn1.num_batches_tracked", "4.1.conv2.weight", "4.1.bn2.weight", "4.1.bn2.bias", "4.1.bn2.running_mean", "4.1.bn2.running_var", "4.1.bn2.num_batches_tracked", "5.0.conv1.weight", "5.0.bn1.weight", "5.0.bn1.bias", "5.0.bn1.running_mean", "5.0.bn1.running_var", "5.0.bn1.num_batches_tracked", "5.0.conv2.weight", "5.0.bn2.weight", "5.0.bn2.bias", "5.0.bn2.running_mean", "5.0.bn2.running_var", "5.0.bn2.num_batches_tracked", "5.0.downsample.0.weight", "5.0.downsample.1.weight", "5.0.downsample.1.bias", "5.0.downsample.1.running_mean", "5.0.downsample.1.running_var", "5.0.downsample.1.num_batches_tracked", "5.1.conv1.weight", "5.1.bn1.weight", "5.1.bn1.bias", "5.1.bn1.running_mean", "5.1.bn1.running_var", "5.1.bn1.num_batches_tracked", "5.1.conv2.weight", "5.1.bn2.weight", "5.1.bn2.bias", "5.1.bn2.running_mean", "5.1.bn2.running_var", "5.1.bn2.num_batches_tracked", "6.0.conv1.weight", "6.0.bn1.weight", "6.0.bn1.bias", "6.0.bn1.running_mean", "6.0.bn1.running_var", "6.0.bn1.num_batches_tracked", "6.0.conv2.weight", "6.0.bn2.weight", "6.0.bn2.bias", "6.0.bn2.running_mean", "6.0.bn2.running_var", "6.0.bn2.num_batches_tracked", "6.0.downsample.0.weight", "6.0.downsample.1.weight", "6.0.downsample.1.bias", "6.0.downsample.1.running_mean", "6.0.downsample.1.running_var", "6.0.downsample.1.num_batches_tracked", "6.1.conv1.weight", "6.1.bn1.weight", "6.1.bn1.bias", "6.1.bn1.running_mean", "6.1.bn1.running_var", "6.1.bn1.num_batches_tracked", "6.1.conv2.weight", "6.1.bn2.weight", "6.1.bn2.bias", "6.1.bn2.running_mean", "6.1.bn2.running_var", "6.1.bn2.num_batches_tracked", "7.0.conv1.weight", "7.0.bn1.weight", "7.0.bn1.bias", "7.0.bn1.running_mean", "7.0.bn1.running_var", "7.0.bn1.num_batches_tracked", "7.0.conv2.weight", "7.0.bn2.weight", "7.0.bn2.bias", "7.0.bn2.running_mean", "7.0.bn2.running_var", "7.0.bn2.num_batches_tracked", "7.0.downsample.0.weight", "7.0.downsample.1.weight", "7.0.downsample.1.bias", "7.0.downsample.1.running_mean", "7.0.downsample.1.running_var", "7.0.downsample.1.num_batches_tracked", "7.1.conv1.weight", "7.1.bn1.weight", "7.1.bn1.bias", "7.1.bn1.running_mean", "7.1.bn1.running_var", "7.1.bn1.num_batches_tracked", "7.1.conv2.weight", "7.1.bn2.weight", "7.1.bn2.bias", "7.1.bn2.running_mean", "7.1.bn2.running_var", "7.1.bn2.num_batches_tracked". 

In [20]:
model.teacher_backbone

Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Con

In [8]:
criterion = DINOLoss(
    output_dim=2048,
    warmup_teacher_temp_epochs=5,
)

In [9]:
criterion = criterion.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 10

In [13]:
model

DINO(
  (student_backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats

In [20]:
print("Starting Training")
for epoch in range(epochs):
    total_loss = 0
    momentum_val = cosine_schedule(epoch, epochs, 0.996, 1)
    for batch in tqdm(dataloader):
        views = batch[0]
        update_momentum(model.student_backbone, model.teacher_backbone, m=momentum_val)
        update_momentum(model.student_head, model.teacher_head, m=momentum_val)
        views = [view.to(device) for view in views]
        global_views = views[:2]
        teacher_out = [model.forward_teacher(view) for view in global_views]
        student_out = [model.forward(view) for view in views]
        loss = criterion(teacher_out, student_out, epoch=epoch)
        total_loss += loss.detach()
        loss.backward()
        # We only cancel gradients of student head.
        model.student_head.cancel_last_layer_gradients(current_epoch=epoch)
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(dataloader)
    print(f"epoch: {epoch:>02}, loss: {avg_loss:.5f}")


Starting Training


100%|██████████| 312/312 [01:37<00:00,  3.19it/s]


epoch: 00, loss: 6.62916


100%|██████████| 312/312 [01:39<00:00,  3.13it/s]


epoch: 01, loss: 3.66713


100%|██████████| 312/312 [01:39<00:00,  3.12it/s]


epoch: 02, loss: 5.08117


100%|██████████| 312/312 [01:40<00:00,  3.12it/s]


epoch: 03, loss: 5.19397


100%|██████████| 312/312 [01:40<00:00,  3.11it/s]


epoch: 04, loss: 4.97491


100%|██████████| 312/312 [01:40<00:00,  3.12it/s]


epoch: 05, loss: 4.81806


100%|██████████| 312/312 [01:39<00:00,  3.13it/s]


epoch: 06, loss: 4.71214


100%|██████████| 312/312 [01:40<00:00,  3.12it/s]


epoch: 07, loss: 4.66292


100%|██████████| 312/312 [01:40<00:00,  3.12it/s]


epoch: 08, loss: 4.63802


100%|██████████| 312/312 [01:40<00:00,  3.11it/s]

epoch: 09, loss: 4.62389





NameError: name 'model' is not defined

In [39]:
resnet = torchvision.models.resnet18()
backbone = nn.Sequential(*list(resnet.children())[:-1])
input_dim = 512

In [None]:
model = DINO(backbone, input_dim)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device);

In [41]:
transform = DINOTransform()

In [44]:
path2data =  "/home/akansh-i2sc/Desktop/Study/HLCV/SeCo_dataset/seco_100k/seasonal_contrast_100k/"
seco_dataset = seco.SeasonalContrastBasic(root=path2data)

In [56]:
dataset = torchvision.datasets.VOCDetection(
    "datasets/pascal_voc",
    download=True,
    transform=transform,
    target_transform=lambda t: 0,
)

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar to datasets/pascal_voc/VOCtrainval_11-May-2012.tar


100%|██████████| 1999639040/1999639040 [05:07<00:00, 6506917.08it/s] 


Extracting datasets/pascal_voc/VOCtrainval_11-May-2012.tar to datasets/pascal_voc


In [57]:
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

In [58]:
criterion = DINOLoss(
    output_dim=2048,
    warmup_teacher_temp_epochs=5,
)

In [59]:
criterion = criterion.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10

print("Starting Training")
for epoch in range(epochs):
    total_loss = 0
    momentum_val = cosine_schedule(epoch, epochs, 0.996, 1)
    for batch in dataloader:
        views = batch[0]
        update_momentum(model.student_backbone, model.teacher_backbone, m=momentum_val)
        update_momentum(model.student_head, model.teacher_head, m=momentum_val)
        views = [view.to(device) for view in views]
        global_views = views[:2]
        teacher_out = [model.forward_teacher(view) for view in global_views]
        student_out = [model.forward(view) for view in views]
        loss = criterion(teacher_out, student_out, epoch=epoch)
        total_loss += loss.detach()
        loss.backward()
        # We only cancel gradients of student head.
        model.student_head.cancel_last_layer_gradients(current_epoch=epoch)
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(dataloader)
    print(f"epoch: {epoch:>02}, loss: {avg_loss:.5f}")


Starting Training


AttributeError: 'MocoModel' object has no attribute 'student_backbone'

In [1]:

# instead of a resnet you can also use a vision transformer backbone as in the
# original paper (you might have to reduce the batch size in this case):
# backbone = torch.hub.load('facebookresearch/dino:main', 'dino_vits16', pretrained=False)
# input_dim = backbone.embed_dim



transform = DINOTransform()
# we ignore object detection annotations by setting target_transform to return 0
dataset = torchvision.datasets.VOCDetection(
    "datasets/pascal_voc",
    download=True,
    transform=transform,
    target_transform=lambda t: 0,
)
# or create a dataset from a folder containing images or videos:
# dataset = LightlyDataset("path/to/folder")

dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    drop_last=True,
    num_workers=8,
)

criterion = DINOLoss(
    output_dim=2048,
    warmup_teacher_temp_epochs=5,
)
# move loss to correct device because it also contains parameters
criterion = criterion.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10

print("Starting Training")
for epoch in range(epochs):
    total_loss = 0
    momentum_val = cosine_schedule(epoch, epochs, 0.996, 1)
    for batch in dataloader:
        views = batch[0]
        update_momentum(model.student_backbone, model.teacher_backbone, m=momentum_val)
        update_momentum(model.student_head, model.teacher_head, m=momentum_val)
        views = [view.to(device) for view in views]
        global_views = views[:2]
        teacher_out = [model.forward_teacher(view) for view in global_views]
        student_out = [model.forward(view) for view in views]
        loss = criterion(teacher_out, student_out, epoch=epoch)
        total_loss += loss.detach()
        loss.backward()
        # We only cancel gradients of student head.
        model.student_head.cancel_last_layer_gradients(current_epoch=epoch)
        optimizer.step()
        optimizer.zero_grad()
        print("No error"")

    avg_loss = total_loss / len(dataloader)
    print(f"epoch: {epoch:>02}, loss: {avg_loss:.5f}")

Using downloaded and verified file: datasets/pascal_voc/VOCtrainval_11-May-2012.tar
Extracting datasets/pascal_voc/VOCtrainval_11-May-2012.tar to datasets/pascal_voc
Starting Training
epoch: 00, loss: 8.05496


KeyboardInterrupt: 