In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import models
import torch
from collections import OrderedDict
import json
import os
import torchvision.transforms as transforms
from tokenizer import SimpleTokenizer
import datasets
import utils
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.colors as pc
import plotly.io as pio
import random
import kaleido
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Subset
from collections import Counter
from pathlib import Path

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Creating model
#
# The architecture name is read from the checkpoint itself (ckpt['args'].model),
# so the model is instantiated exactly once instead of first building a
# hard-coded variant that is immediately thrown away.
ckpt_path = 'checkpoint_best.pt'

# NOTE(review): weights_only=False unpickles arbitrary Python objects (needed
# here because the checkpoint stores an `args` object). Only load checkpoints
# from a trusted source.
ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)

# Drop the 'module.' fragments from parameter names so the keys match the
# freshly created model (load_state_dict below is strict and will raise on
# any mismatch).
state_dict = OrderedDict()
for k, v in ckpt['state_dict'].items():
    state_dict[k.replace('module.', '')] = v

old_args = ckpt['args']
print("=> creating model: {}".format(old_args.model))
model = getattr(models, old_args.model)()
model.cuda()
model.load_state_dict(state_dict, strict=True)
print("=> loaded resume checkpoint '{}' (epoch {})".format(ckpt_path, ckpt['epoch']))

	Creating MAE projection head
	MAE projection head created
	Creating IBOT projection head


  WeightNorm.apply(module, name, dim)


	keys have been loaded for ibot head with status: <All keys matched successfully>
	IBOT projection head created
	DetailCLIP model created
=> creating model: ICLIP_VITB16
	Creating MAE projection head
	MAE projection head created
	Creating IBOT projection head
	keys have been loaded for ibot head with status: <All keys matched successfully>
	IBOT projection head created
	DetailCLIP model created
=> loaded resume checkpoint 'checkpoint_best.pt' (epoch 48)


In [None]:
# Directory that holds the dataset catalog, the prompt templates and the labels.
cwd = '/home/onyxia/work/DetailCLIP'

catalog_path = os.path.join(cwd, 'dataset_catalog.json')
templates_path = os.path.join(cwd, 'templates.json')
labels_path = os.path.join(cwd, 'labels.json')

# Each file is parsed into a plain Python object and kept under its own name.
with open(catalog_path) as f:
    catalog = json.load(f)
with open(templates_path) as f:
    all_templates = json.load(f)
with open(labels_path) as f:
    all_labels = json.load(f)

In [None]:
print("=> creating dataset")
tokenizer = SimpleTokenizer()

# Per-channel mean / std applied after conversion to a tensor.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Evaluation pipeline: resize, centre-crop to 224, force three RGB channels,
# convert to a tensor, then normalise.
val_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    lambda x: x.convert('RGB'),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])

=> creating dataset


In [None]:
# Class names (matched against `val_dataset.classes`) retained for the analysis.
classes_to_keep = [
    '001.Black_footed_Albatross',
    '002.Laysan_Albatross',
    '005.Crested_Auklet',
    '012.Yellow_headed_Blackbird',
    '016.Painted_Bunting',
    '017.Cardinal',
    '019.Gray_Catbird',
    '036.Northern_Flicker',
    '044.Frigatebird',
    '045.Northern_Fulmar',
    '056.Pine_Grosbeak',
    '069.Rufous_Hummingbird',
]

In [None]:
d='handmade'
print('Evaluating {}'.format(d))
val_dataset = datasets.get_downstream_dataset(catalog, name=d, is_train=False, transform=val_transform)

class_to_idx = {cls: i for i, cls in enumerate(val_dataset.classes)}

# Keep only the samples whose class name is listed in classes_to_keep
# (set lookup instead of scanning the list for every sample).
kept_classes = set(classes_to_keep)
indices = [i for i, (_, label) in enumerate(val_dataset.samples)
           if val_dataset.classes[label] in kept_classes]

subset = Subset(val_dataset, indices) # dataset with selected classes

# shuffle=False keeps the loader order identical to `indices`.
val_loader = torch.utils.data.DataLoader(
    subset, batch_size=256, shuffle=False,
    num_workers=10, pin_memory=True, drop_last=False)

labels = all_labels[d]
true_labels = all_labels['cub200']

model.eval()

root = Path(val_dataset.root)

# Count images per class from the samples that were actually selected, so the
# counts always agree with what the loader yields (a '*.jpg' glob on disk can
# differ from the dataset's own file list, e.g. other extensions).
selected_class_counts = Counter(
    val_dataset.classes[val_dataset.samples[i][1]] for i in indices)
counts = {cls: selected_class_counts[cls] for cls in classes_to_keep}
img_counts = list(counts.values()) # number of images per class, in classes_to_keep order

Evaluating handmade


In [None]:
# Text embeddings: one vector per entry of `labels`.

results = []
print('=> encoding captions')
text_features = []
with torch.no_grad():
    for label in labels:
        # Tokenise the caption(s) of this class and force a (-1, 77) layout.
        tokens = tokenizer(label).cuda(non_blocking=True)
        tokens = tokens.view(-1, 77).contiguous()

        # Encode each caption and L2-normalise it.
        caption_embeddings = utils.get_model(model).encode_text(tokens, ema=True)
        caption_embeddings = caption_embeddings / caption_embeddings.norm(dim=-1, keepdim=True)

        # Average over the captions, then re-normalise the mean vector.
        class_embeddings = caption_embeddings.mean(dim=0)
        class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)
        text_features.append(class_embeddings)

    text_features = torch.stack(text_features, dim=0)

text_features.size()

=> encoding captions


torch.Size([12, 512])

In [None]:
# Image embeddings: one list entry per batch, with the matching batch targets.

image_features_tot = []
target_list = []
with torch.no_grad():
    for images, target in val_loader:
        images, target = images.cuda(non_blocking=True), target.cuda(non_blocking=True)

        # Encode the batch and L2-normalise every embedding.
        raw_features = utils.get_model(model).encode_image(images, ema=True)
        image_features = raw_features / raw_features.norm(dim=-1, keepdim=True)

        image_features_tot.append(image_features)
        target_list.append(target)

In [None]:
# Stack the per-batch embeddings along the first dimension into one tensor.
image_features_cat = torch.cat(image_features_tot, 0)

In [None]:
# Fusion of text and image embeddings.
#
# Each modality is standardised on its own: the scaler is re-fitted by every
# fit_transform call, so after this cell it holds the image statistics.
scaler = StandardScaler()
text_features_tsne0 = torch.Tensor(scaler.fit_transform(text_features.cpu()))
image_features_tsne0 = torch.Tensor(scaler.fit_transform(image_features_cat.cpu()))

# Text rows come first, image rows follow.
text_cat_image = torch.cat([text_features_tsne0, image_features_tsne0], dim=0)

In [None]:
# One dataset label per image, in loader order.
original_labels = torch.cat(target_list).tolist()
unique_labels = sorted(set(original_labels))

# Re-index the labels to consecutive integers for colour balancing.
label_map = dict((old_label, new_label) for new_label, old_label in enumerate(unique_labels))
normalized_labels = list(map(label_map.__getitem__, original_labels))  # image colour indexes
normalized_unique_labels = list(range(12))  # text colour indexes

# Fixed-seed permutation of the colour indexes.
random.seed(3)
shuffled_classes = list(normalized_unique_labels)  # text colour indexes, shuffled
random.shuffle(shuffled_classes)

shuffling_dic = dict(zip(normalized_unique_labels, shuffled_classes))

# Apply the same permutation to every image.
shuffled_colors = [shuffling_dic[idx] for idx in normalized_labels]  # image colour indexes, shuffled

In [None]:
# Display names used as hover text in the scatter plot.
class_names = [
    'Black_footed_Albatross',
    'Laysan_Albatross',
    'Crested_Auklet',
    'Yellow_headed_Blackbird',
    'Painted_Bunting',
    'Cardinal',
    'Gray_Catbird',
    'Northern_Flicker',
    'Frigatebird',
    'Northern_Fulmar',
    'Pine_Grosbeak',
    'Rufous_Hummingbird',
]

In [None]:
# t-SNE of the merged tensor
#
# t-SNE is stochastic; fixing random_state makes the 2-D layout (and thus the
# saved figure) reproducible across kernel restarts.
tsne = TSNE(n_components=2, verbose=0, perplexity=40, max_iter=300, random_state=3)
image_text_tsne = tsne.fit_transform(text_cat_image.to('cpu'))

In [None]:
# Scatter plot of the joint t-SNE: the first rows of image_text_tsne are the
# text embeddings (one per class name), the remaining rows are the images.
n_text = len(class_names)

# Larger markers for text points, smaller ones for images.
sizes = np.array([15 if i < n_text else 5 for i in range(len(image_text_tsne))])

# Hover text for the images: each class name repeated once per image of that
# class, then flattened into a single list aligned with the image points.
image_hover_list = [[class_names[i] for _ in range(img_counts[i])] for i in range(n_text)]
# Flatten the per-class lists (iterating class_names here would yield single
# characters instead of names).
flatten_image_hover_list = [name for sublist in image_hover_list for name in sublist]

fig = go.Figure()

# Text (caption) embeddings.
fig.add_trace(go.Scatter(
    x=image_text_tsne[:n_text, 0],
    y=image_text_tsne[:n_text, 1],
    mode='markers',
    name='Detailed caption embeddings',
    text = class_names,
    hoverinfo='text',
    hovertemplate='%{text}<extra></extra>',
    marker=dict(
        size=sizes[:n_text],
        color=shuffled_classes,
        opacity=0.7,
        line=dict(width=0)
    ),
    showlegend=True
))

# Image embeddings, coloured with the same (shuffled) class colour indexes.
fig.add_trace(go.Scatter(
    x=image_text_tsne[n_text:, 0],
    y=image_text_tsne[n_text:, 1],
    mode='markers',
    name='Image embeddings',
    text = flatten_image_hover_list,
    hoverinfo='text',
    hovertemplate='%{text}<extra></extra>',
    marker=dict(
        size=sizes[n_text:],
        color=shuffled_colors,
        opacity=0.7,
        line=dict(width=0)
    ),
    showlegend=True
))

fig.update_layout(
    legend=dict(
        title='Types de données',
        x=0.01,
        y=0.99,
        bgcolor='rgba(255,255,255,0.8)',
        bordercolor='black',
        borderwidth=1
    )
)

# Both traces already set marker.line.width=0, so no extra update_traces call
# is needed.

path='/home/onyxia/work/DetailCLIP/images/text + image/image.png'
fig.write_image(path, format='png', engine='kaleido')
fig.show()



In [None]:
def accuracy(output, target, topk=(1,)):
    """Return the top-k accuracy (in percent) for every k listed in `topk`.

    `output` is scored along dimension 1 and `target` holds one index per row.
    The result is a list with one single-element tensor per requested k.
    """
    with torch.no_grad():
        n_samples = target.size(0)
        largest_k = max(topk)

        # Indices of the best `largest_k` scores per row, best first,
        # transposed so that row r holds the rank-r prediction of every sample.
        ranked = output.topk(largest_k, dim=1, largest=True, sorted=True).indices.t()
        hits = ranked.eq(target.reshape(1, -1).expand_as(ranked))

        # A sample counts for k if its target appears within the first k ranks.
        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(100.0 / n_samples)
            for k in topk
        ]

In [None]:
# Classification
#
# Zero-shot top-1 classification over every image embedding collected by the
# image-encoding loop, instead of only the last batch left over in the loop
# variables.
all_image_features = torch.cat(image_features_tot, dim=0)

# normalized_labels re-indexes the dataset labels to consecutive integers so
# they can be compared with row indexes of text_features.
# NOTE(review): assumes the rows of text_features follow the same class order
# as the sorted dataset labels — confirm against all_labels[d].
all_targets = torch.tensor(normalized_labels, device=all_image_features.device)

# Similarity of every image with every class text embedding.
logits_per_image = all_image_features @ text_features.t()

pred = logits_per_image.argmax(dim=1)
correct = pred.eq(all_targets).sum()

# Counters start from zero so the cell runs on a fresh kernel and gives the
# same totals when re-run.
total_top1 = 0
total_images = 0
total_top1 += correct.item()
total_images += all_targets.size(0)