In [None]:
from transformers import CLIPTokenizerFast, CLIPModel
import torch
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose, ToTensor, Lambda, Resize, Normalize
from PIL import Image, ImageDraw
from torch.utils.data import DataLoader, Dataset

from sklearn.metrics import accuracy_score,f1_score

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Filesystem locations.
DIRECTROY = 'data'      # prefix used when building CSV and image paths
MODEL_PATH = 'models'   # not referenced by the visible cells

# Data-loading settings.
IMG_SIZE = 224          # images are resized to IMG_SIZE x IMG_SIZE
BATCH_SIZE = 32

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Read the train and test tables from the data directory.
df_train = pd.read_csv(f'{DIRECTROY}/train.csv')
df_test = pd.read_csv(f'{DIRECTROY}/test_kaggletest.csv')

# Distinct values of the 'class' column, in order of first appearance.
# Series.unique() already returns an array (it has no `.values` attribute),
# so convert it straight to a plain Python list.
classes = df_train['class'].unique().tolist()
num_classes = len(classes)

In [None]:
# Partition the test table by the value of its 'Usage' column.
df_test_public = df_test.loc[df_test['Usage'].eq('Public')]
df_test_private = df_test.loc[df_test['Usage'].eq('Private')]

In [None]:
# Per-channel statistics of the OpenAI CLIP image preprocessing. The pretrained
# CLIP checkpoint expects inputs normalized with these values rather than the
# ImageNet mean/std.
CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

# PIL image -> normalized float tensor of size IMG_SIZE x IMG_SIZE.
# NOTE(review): Resize to a fixed square does not preserve aspect ratio; the
# reference CLIP preprocessing resizes the short side and center-crops instead.
image_transforms = Compose([
    Resize((IMG_SIZE, IMG_SIZE)),
    ToTensor(),
    Normalize(mean=CLIP_MEAN, std=CLIP_STD),
])


In [None]:
class CustomDataset(Dataset):
    """Image dataset driven by a DataFrame with one row per image.

    Column 0 of ``df`` is read as the image file name and column 1 as the
    label. Images are opened from ``{DIRECTROY}/{directory}/<file name>``.

    Args:
        df: table describing the samples (file name in column 0, label in column 1).
        transforms: callable applied to the loaded PIL image.
        directory: sub-folder of ``DIRECTROY`` that holds the image files.
    """

    def __init__(self, df, transforms, directory):
        self.df = df
        self.transforms = transforms
        self.directory = directory

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image of row ``idx`` and return ``(transformed_image, label)``."""
        path = f'{DIRECTROY}/{self.directory}/{self.df.iloc[idx, 0]}'
        # Close the file handle deterministically, and force three channels so
        # grayscale / RGBA / palette images work with a 3-channel Normalize.
        with Image.open(path) as raw_img:
            img = raw_img.convert('RGB')
        img = self.transforms(img)
        label = self.df.iloc[idx, 1]
        return img, label

In [None]:
# Datasets: training rows read from the 'train' folder, public test rows from 'test'.
train_dataset = CustomDataset(
    df=df_train,
    transforms=image_transforms,
    directory='train',
)
test_dataset = CustomDataset(
    df=df_test_public,
    transforms=image_transforms,
    directory='test',
)

# Batched loaders; only the training loader shuffles.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Load the tokenizer and the model from the same pretrained CLIP checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch16"
model = CLIPModel.from_pretrained(clip_checkpoint)
tokenizer = CLIPTokenizerFast.from_pretrained(clip_checkpoint)

In [None]:
# Tokenize the class names; they act as the text side of the image/text comparison.
prompts = tokenizer(
    classes,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

# Cross-entropy over the per-image logits.
criteria = torch.nn.CrossEntropyLoss()

In [None]:
# Zero-shot evaluation on the training split: every image is scored against the
# tokenized class names and the highest-scoring prompt is taken as the prediction.
true_labels = []
pred_labels = []

# Column j of logits_per_image belongs to classes[j], so targets must be the
# position of each label inside `classes`.
# NOTE(review): assumes the dataset label (column 1 of the DataFrame) holds the
# same values as df_train['class'] — confirm against the CSV layout.
class_to_index = {name: idx for idx, name in enumerate(classes)}

# Model, text prompts and image batches all have to live on the same device.
model.to(device)
model.eval()
prompt_ids = prompts["input_ids"].to(device)
prompt_mask = prompts["attention_mask"].to(device)

total_loss = 0.0
num_samples = 0

with torch.no_grad():
    # Iterate the DataLoader (not the raw Dataset) so inputs carry a batch dimension.
    for inputs, labels in tqdm(train_dataloader):
        inputs = inputs.to(device)

        # Default collation yields a tensor for numeric labels and a list for strings.
        raw_labels = labels.tolist() if isinstance(labels, torch.Tensor) else list(labels)
        targets = torch.tensor(
            [class_to_index[label] for label in raw_labels],
            dtype=torch.long,
            device=device,
        )

        outputs = model(
            input_ids=prompt_ids,
            attention_mask=prompt_mask,
            pixel_values=inputs,
        )
        logits_per_image = outputs.logits_per_image  # (batch, number of prompts)

        # Accumulate a sample-weighted sum so the reported loss is the mean over
        # the whole split rather than the value of the last batch.
        batch_size = targets.size(0)
        total_loss += criteria(logits_per_image, targets).item() * batch_size
        num_samples += batch_size

        # Hand plain Python ints to sklearn (no tensors, no GPU memory).
        true_labels.extend(targets.cpu().tolist())
        pred_labels.extend(logits_per_image.argmax(dim=1).cpu().tolist())

loss = total_loss / max(num_samples, 1)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy_score(true_labels, pred_labels)}')
print(f'F1 Score Weighted: {f1_score(true_labels, pred_labels, average="weighted")}')
print(f'F1 Score Macro: {f1_score(true_labels, pred_labels, average="macro")}')
    