In [9]:
from transformers import CLIPTokenizerFast, CLIPModel
import torch
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose, ToTensor, Lambda, Resize, Normalize
from PIL import Image, ImageDraw
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import accuracy_score,f1_score

import numpy as np
import pandas as pd
from tqdm import tqdm

In [10]:
DIRECTROY = 'data'
MODEL_PATH = 'models'
BATCH_SIZE = 32
IMG_SIZE = 224

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
df_train = pd.read_csv(f'{DIRECTROY}/reduced_train.csv') 
df_test = pd.read_csv(f'{DIRECTROY}/reduced_test.csv') 
num_classes = len(df_train['newid'].unique())
classes = df_train['label'].unique().tolist()

In [12]:
class2label = {c:l for c, l in zip(df_train['newid'], df_train['label'])}

In [13]:
len(class2label), num_classes

(640, 640)

In [8]:
len(df_train['label'].unique())

640

In [14]:
df_test_public = df_test[df_test['Usage'] == 'Public']
df_test_private = df_test[df_test['Usage'] == 'Private']

In [None]:
class CustomDataset(Dataset):
    def __init__(self, df, transforms, directory):
        self.tokenizer =  CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch16")
        self.df = df
        self.transforms = transforms
        self.directory = directory
        self.labels = torch.Tensor(df['newid'].values).long()
        self.imgs = [self.resize_img(Image.open(f'{DIRECTROY}/{self.directory}/{x}')).convert('RGB') for x in tqdm(df['name'].values)]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img = self.imgs[idx]
        label = self.labels[idx]
        return img, label

In [15]:
from transformers import CLIPProcessor

In [16]:
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

In [20]:
processor.image_processor.image_mean, processor.image_processor.image_std

([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711])

In [23]:
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch16")

In [24]:
tokenizer("a photo of a cat", padding=True, truncation=True, return_tensors="pt")

{'input_ids': tensor([[49406,   320,  1125,   539,   320,  2368, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [22]:
processor(text=["a photo of a cat"], images=Image.open("data/train/1.jpg"))

{'input_ids': [[49406, 320, 1125, 539, 320, 2368, 49407]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 'pixel_values': [array([[[1.9303361, 1.9303361, 1.9303361, ..., 1.9303361, 1.9303361,
         1.9303361],
        [1.9303361, 1.9303361, 1.9303361, ..., 1.9303361, 1.9303361,
         1.9303361],
        [1.9303361, 1.9303361, 1.9303361, ..., 1.9303361, 1.9303361,
         1.9303361],
        ...,
        [1.9303361, 1.9303361, 1.9303361, ..., 1.9303361, 1.9303361,
         1.9303361],
        [1.9303361, 1.9303361, 1.9303361, ..., 1.9303361, 1.9303361,
         1.9303361],
        [1.9303361, 1.9303361, 1.9303361, ..., 1.9303361, 1.9303361,
         1.9303361]],

       [[2.0748837, 2.0748837, 2.0748837, ..., 2.0748837, 2.0748837,
         2.0748837],
        [2.0748837, 2.0748837, 2.0748837, ..., 2.0748837, 2.0748837,
         2.0748837],
        [2.0748837, 2.0748837, 2.0748837, ..., 2.0748837, 2.0748837,
         2.0748837],
        ...,
        [2.0748837, 2.0748837, 2.0748837, .

In [7]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch16")

config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [8]:
model.parameters

<bound method Module.parameters of CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_n

In [9]:
prompts = tokenizer(classes, return_tensors="pt", padding=True, truncation=True)
criteria = torch.nn.CrossEntropyLoss()

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
true_labels = []
pred_labels = []
test_loss = 0
len_dataset = 0
for i in range(2):
    dataset = torch.load(f'{DIRECTROY}/test_public_dataset/train_dataset_{i}.pth')
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    for i, (inputs, labels) in tqdm(enumerate(dataset)):
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            logis_per_image, logis_per_text = model(inputs, labels=prompts.input_ids)
            
            loss = criteria(logis_per_image, labels)
            
            pred = torch.argmax(logis_per_image,1).flatten().cpu().numpy()
            labels = labels.flatten().cpu().numpy()
            
            true_labels.extend(pred)
            pred_labels.extend(labels)
        
            test_loss += loss.item()
    len_dataset += len(dataset)
        
       
print(f'Loss: {test_loss/len_dataset}')
print(f'Accuracy: {accuracy_score(true_labels, pred_labels)}')
print(f'F1 Score Weighted: {f1_score(true_labels, pred_labels, average="weighted")}')
print(f'F1 Score Macro: {f1_score(true_labels, pred_labels, average="macro")}')
    