In [19]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import models
import torch
from collections import OrderedDict
import json
import os
import torchvision.transforms as transforms
from tokenizer import SimpleTokenizer
import datasets
import utils
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.colors as pc
import plotly.io as pio
import random
import kaleido
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

In [20]:
model = getattr(models, 'ICLIP_VITB16')()
model.cuda()

# Creating model
ckpt_path = 'checkpoint_best.pt'

ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
state_dict = OrderedDict()
for k, v in ckpt['state_dict'].items():
    state_dict[k.replace('module.', '')] = v

old_args = ckpt['args']
print("=> creating model: {}".format(old_args.model))
model = getattr(models, old_args.model)()
model.cuda()
model.load_state_dict(state_dict, strict=True)
print("=> loaded resume checkpoint '{}' (epoch {})".format(ckpt_path, ckpt['epoch']))

	Creating MAE projection head
	MAE projection head created
	Creating IBOT projection head
	keys have been loaded for ibot head with status: <All keys matched successfully>
	IBOT projection head created
	DetailCLIP model created
=> creating model: ICLIP_VITB16
	Creating MAE projection head
	MAE projection head created
	Creating IBOT projection head
	keys have been loaded for ibot head with status: <All keys matched successfully>
	IBOT projection head created
	DetailCLIP model created
=> loaded resume checkpoint 'checkpoint_best.pt' (epoch 48)


In [54]:
cwd = '/home/onyxia/work/DetailCLIP'
with open(os.path.join(cwd, 'dataset_catalog.json')) as f:
    catalog = json.load(f)

with open(os.path.join(cwd, 'templates.json')) as f:
    all_templates = json.load(f)

with open(os.path.join(cwd, 'labels.json')) as f:
    all_labels = json.load(f)

In [22]:
# Data loading code
print("=> creating dataset")
tokenizer = SimpleTokenizer()
val_transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        lambda x: x.convert('RGB'),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                std=[0.229, 0.224, 0.225])
    ])

=> creating dataset


In [98]:
d='blip'
print('Evaluating {}'.format(d))
val_dataset = datasets.get_downstream_dataset(catalog, name=d, is_train=False, transform=val_transform)

val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=256, shuffle=False,
    num_workers=10, pin_memory=True, drop_last=False)

templates = all_templates[d]
labels = all_labels[d]
true_labels = all_labels['cub200'][:9]

model.eval()

Evaluating blip


ICLIP(
  (visual): MaskVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (drop1): Dropout(p=0.0, inplace=False)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop2): Dropout

In [81]:
#Text embeddings

results = []
print('=> encoding captions')
with torch.no_grad():
    text_features = []
    print(len(labels))
    for label in labels:
        texts = label
        texts = tokenizer(texts).cuda(non_blocking=True)
        texts = texts.view(-1, 77).contiguous()
        class_embeddings = utils.get_model(model).encode_text(texts, ema=True)
        class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)
        class_embeddings = class_embeddings.mean(dim=0)
        class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)
        text_features.append(class_embeddings)

    text_features = torch.stack(text_features, dim=0)

print(text_features.size())

=> encoding captions
270
torch.Size([270, 512])


In [74]:
# t-SNE

color = [[i for j in range(9)] for i in range(30)]
flat_color = np.array(color).flatten().tolist()

text_features_tsne = text_features
tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300)
text_tsne = tsne.fit_transform(text_features_tsne.to('cpu'))

fig = px.scatter(x=text_tsne[:, 0], y=text_tsne[:, 1], hover_name=[all_labels[d] for d in catalog][0], color=flat_color)
fig.update_layout(
    title="t-SNE class names embeddings",
    xaxis_title="First Principal Component",
    yaxis_title="Second Principal Component",
)
cwd = '/home/onyxia/work/tsne_plot/'
path = os.path.join(cwd, 't-SNE class names embeddings.png')
fig.write_image(path, format='png', engine='kaleido')
fig.show()


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



In [65]:
# Image embeddings

with torch.no_grad():
    image_features_tot = []
    target_list = []
    for images, target in val_loader:
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # encode images
        image_features = utils.get_model(model).encode_image(images, ema=True)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        image_features_tot.append(image_features)
        target_list.append(target)

In [75]:
image_features_cat = torch.cat(image_features_tot, dim=0)

In [59]:
# fusion of text and image embeddings
scaler = StandardScaler()

text_features_tsne0 = scaler.fit_transform(text_features.cpu())
image_features_tsne0 = scaler.fit_transform(image_features_cat.cpu())

text_features_tsne0 = torch.Tensor(text_features_tsne0)
image_features_tsne0 = torch.Tensor(image_features_tsne0)

text_cat_image = torch.cat((text_features_tsne0, image_features_tsne0), dim=0)
print(text_cat_image.size())

torch.Size([6190, 512])


In [104]:
txt_img_color = flat_color + torch.cat(target_list).tolist()[:270]
print(txt_img_color)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 0, 0, 0, 

In [None]:
# t-SNE of the merged tensor

class_labels = np.array([[label for _ in range(30)] for label in true_labels]).flatten().tolist()

unique_labels = sorted(set(class_labels))
label_to_int = {label: i for i, label in enumerate(unique_labels)}

color_list = [label_to_int[label] for label in class_labels]

text_labels = []
for i,label in enumerate(class_labels):
    text_labels.append(label + ': "' + labels[i] + '"')

tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300)
image_text_tsne = tsne.fit_transform(text_cat_image.to('cpu'))

sizes = np.array([15 if i < 270 else 5 for i in range(len(image_text_tsne))])

hover_labels = text_labels+ class_labels

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=image_text_tsne[:540, 0],
    y=image_text_tsne[:540, 1],
    mode='markers',
    text=hover_labels,
    hoverinfo = 'text',
    hovertemplate='%{text}<extra></extra>',
    marker=dict(
        size=sizes,
        color=color_list + color_list,
        opacity=0.7,
        line=dict(width=0),
    ),
    showlegend=False
))

fig.update_layout(
legend=dict(
    title='Types de données',
    x=0.01,
    y=0.99,
    bgcolor='rgba(255,255,255,0.8)',
    bordercolor='black',
    borderwidth=1
    )
)
fig.data[0].name = "Image embeddings"

fig.update_traces(
    marker=dict(
                line=dict(width=0,)
                ))

path='/home/onyxia/work/DetailCLIP/images/text + image/image.png'
fig.write_image(path, format='png', engine='kaleido')
fig.show()

['Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Black footed Albatross', 'Laysan Albatross', 'Laysan Albatross', 'Laysan Albatross', 'Laysan Albatross', 'Laysan Albatross', 'Laysan Albatross', 'Laysan Albatross', 'Laysan Albatross', 'Laysan Albatross', 'Laysan Albatross', 'Laysan Albatross',


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



[15 15 15 ...  5  5  5]
