In [1]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import models
import torch
from collections import OrderedDict
import json
import os
import torchvision.transforms as transforms
from tokenizer import SimpleTokenizer
import datasets
import utils
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.colors as pc
import plotly.io as pio
import random
import kaleido
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Subset
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = getattr(models, 'ICLIP_VITB16')()
model.cuda()

# Creating model
ckpt_path = 'checkpoint_best.pt'

ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
state_dict = OrderedDict()
for k, v in ckpt['state_dict'].items():
    state_dict[k.replace('module.', '')] = v

old_args = ckpt['args']
print("=> creating model: {}".format(old_args.model))
model = getattr(models, old_args.model)()
model.cuda()
model.load_state_dict(state_dict, strict=True)
print("=> loaded resume checkpoint '{}' (epoch {})".format(ckpt_path, ckpt['epoch']))

cwd = '/home/onyxia/work/DetailCLIP'
with open(os.path.join(cwd, 'dataset_catalog.json')) as f:
    catalog = json.load(f)

with open(os.path.join(cwd, 'templates.json')) as f:
    all_templates = json.load(f)

with open(os.path.join(cwd, 'labels.json')) as f:
    all_labels = json.load(f)

print("=> creating dataset")
tokenizer = SimpleTokenizer()
val_transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        lambda x: x.convert('RGB'),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                std=[0.229, 0.224, 0.225])
    ])

classes_to_keep = ['001.Black_footed_Albatross',
'002.Laysan_Albatross',
'005.Crested_Auklet',
'012.Yellow_headed_Blackbird',
'016.Painted_Bunting',
'017.Cardinal',
'019.Gray_Catbird',
'036.Northern_Flicker',
'044.Frigatebird',
'045.Northern_Fulmar',
'056.Pine_Grosbeak',
'069.Rufous_Hummingbird']

d='handmade'
print('Evaluating {}'.format(d))
val_dataset = datasets.get_downstream_dataset(catalog, name=d, is_train=False, transform=val_transform)

class_to_idx = {cls: i for i, cls in enumerate(val_dataset.classes)}
indices = [i for i, (_, label) in enumerate(val_dataset.samples) 
           if val_dataset.classes[label] in classes_to_keep]

subset = Subset(val_dataset, indices) # dataset with selected classes

val_loader = torch.utils.data.DataLoader(
    subset, batch_size=58, shuffle=False,
    num_workers=10, pin_memory=True, drop_last=False)

labels = all_labels[d]
true_labels = all_labels['cub200']

model.eval()

	Creating MAE projection head
	MAE projection head created
	Creating IBOT projection head


  WeightNorm.apply(module, name, dim)


	keys have been loaded for ibot head with status: <All keys matched successfully>
	IBOT projection head created
	DetailCLIP model created
=> creating model: ICLIP_VITB16
	Creating MAE projection head
	MAE projection head created
	Creating IBOT projection head
	keys have been loaded for ibot head with status: <All keys matched successfully>
	IBOT projection head created
	DetailCLIP model created
=> loaded resume checkpoint 'checkpoint_best.pt' (epoch 48)
=> creating dataset
Evaluating handmade


ICLIP(
  (visual): MaskVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (drop1): Dropout(p=0.0, inplace=False)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop2): Dropout

In [None]:
root = Path(val_dataset.root)
counts = {cls: len(list((root / cls).glob('*.jpg'))) for cls in classes_to_keep}
img_counts = list(counts.values()) # number of images per class

348


In [19]:
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

In [31]:
# classification

total_top1 = 0
total_images = 0

all_outputs = []
all_targets = []
all_acc=[]

print('=> encoding captions')
with torch.no_grad():
    text_features = []
    for label in labels:
        texts = label
        texts = tokenizer(texts).cuda(non_blocking=True)
        texts = texts.view(-1, 77).contiguous()
        class_embeddings = utils.get_model(model).encode_text(texts, ema=True)
        class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)
        class_embeddings = class_embeddings.mean(dim=0)
        class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)
        text_features.append(class_embeddings)
    text_features = torch.stack(text_features, dim=0)

    for images, target in val_loader:
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # encode images
        image_features = utils.get_model(model).encode_image(images, ema=True)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_image = image_features @ text_features.t()

        all_outputs.append(logits_per_image)
        all_targets.append(target)

        #print(logits_per_image.size(), target.size()) # [58,12]  [58]
        acc = accuracy(logits_per_image, target, topk=(1,12))
        all_acc.append(acc)

top1 = torch.tensor([a[0] for a in all_acc]).mean()
top5 = torch.tensor([a[1] for a in all_acc]).mean()


print('top1 :', top1)
print('top5 :', top5)

=> encoding captions
top1 : tensor(2.8736)
top5 : tensor(31.6092)
