In [None]:
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from PIL import Image, ImageDraw
from sklearn.metrics import accuracy_score, f1_score
# ImageFolder is provided by torchvision.datasets; torch.utils.data does not export it.
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose, ToTensor, Lambda, Resize, Normalize
from tqdm import tqdm
from transformers import CLIPTokenizerFast, CLIPModel

In [None]:
# --- Experiment configuration ---------------------------------------------
# Filesystem locations, given as relative paths.
DIRECTROY = 'data'      # dataset root; spelling kept because other cells use this name
MODEL_PATH = 'models'

# Input geometry and optimisation hyper-parameters.
IMG_SIZE = 224
BATCH_SIZE = 32
EPOCHS = 100
LR = 1e-4

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Read the training metadata table from the dataset root.
df_train = pd.read_csv(f'{DIRECTROY}/train.csv')

# Series.unique() already returns an array (which has no `.values` attribute),
# so convert it to a list directly.
# unique() yields first-appearance order, whereas torchvision's ImageFolder
# numbers classes by sorted folder name; sorting the stringified names keeps
# position i of `classes` aligned with label index i.
# NOTE(review): assumes the class folder names equal the values of the 'class'
# column -- confirm against the directory layout.
classes = sorted(str(name) for name in df_train['class'].unique().tolist())
num_classes = len(classes)

In [None]:
# Per-channel normalisation statistics of the OpenAI CLIP checkpoints. The
# pretrained vision tower expects these, not the ImageNet mean/std.
CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

# Preprocessing applied to every image: resize to a square of IMG_SIZE pixels,
# convert to a float tensor, then normalise each channel.
image_transforms = Compose([
    Resize((IMG_SIZE, IMG_SIZE)),
    ToTensor(),
    Normalize(mean=CLIP_MEAN, std=CLIP_STD)
])

# ImageFolder yields (image, class_index) pairs from the sub-folders of the
# train directory; the loader batches and reshuffles them on each pass.
data = ImageFolder(f'{DIRECTROY}/train', transform=image_transforms)
train_dataset = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# NOTE(review): this binds `test_dataset` to the very same DataLoader object as
# `train_dataset`; no separate held-out split is created here, so anything that
# iterates `test_dataset` sees the training images and the reported metrics are
# not a measure of generalisation. TODO: point this at real validation data.
test_dataset = train_dataset

In [None]:
# Load the pretrained CLIP weights and place the model on the selected device,
# so it lives on the same device the loops move the image batches to.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch16")

# Tokenise the class names once, padded to a common length, and keep the
# resulting tensors on the same device as the model.
prompts = tokenizer(classes, return_tensors="pt", padding=True, truncation=True).to(device)

In [None]:
# Cross-entropy between predicted logits and integer class labels.
criterion = torch.nn.CrossEntropyLoss()

# Adam over every parameter of the model, starting at learning rate LR.
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# Scale the learning rate linearly from 1.0x down to 0.1x of LR across
# EPOCHS calls to scheduler.step().
scheduler = optim.lr_scheduler.LinearLR(
    optimizer,
    start_factor=1.0,
    end_factor=0.1,
    total_iters=EPOCHS,
)

In [None]:
def _class_logits(pixel_batch, text_batch):
    """Score a batch of images against every class prompt.

    CLIPModel needs both modalities passed by keyword: the images as
    `pixel_values` and the tokenised prompts as `input_ids` /
    `attention_mask`. Its output has no `.logits` field; the image-to-text
    similarities are in `logits_per_image`, one row per image and one column
    per prompt.

    Args:
        pixel_batch: image tensor batch, already on the model's device.
        text_batch: mapping holding the 'input_ids' and 'attention_mask'
            tensors of the class prompts, already on the model's device.

    Returns:
        The `logits_per_image` tensor of the model output.
    """
    clip_out = model(
        pixel_values=pixel_batch,
        input_ids=text_batch['input_ids'],
        attention_mask=text_batch['attention_mask'],
    )
    return clip_out.logits_per_image


# Make sure the model and the prompt tensors sit on the device the batches are
# moved to. Both calls are no-ops when they are already there.
model.to(device)
text_inputs = {key: tensor.to(device) for key, tensor in prompts.items()}

for epoch in range(EPOCHS):
    # ---- Training pass ----------------------------------------------------
    model.train()
    train_loss = 0.0

    for inputs, labels in tqdm(train_dataset):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = _class_logits(inputs, text_inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # The scheduler is stepped once per epoch, matching total_iters=EPOCHS.
    scheduler.step()
    train_loss /= len(train_dataset)
    print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {train_loss}')

    # ---- Evaluation pass --------------------------------------------------
    model.eval()
    eval_loss = 0.0
    true_labels = []
    pred_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(test_dataset):
            inputs = inputs.to(device)
            labels = labels.to(device)

            logits = _class_logits(inputs, text_inputs)
            loss = criterion(logits, labels)
            eval_loss += loss.item()

            # Collect plain Python ints on the host: sklearn cannot consume
            # lists of (possibly CUDA) 0-dim tensors.
            true_labels.extend(labels.flatten().cpu().tolist())
            pred_labels.extend(torch.argmax(logits, 1).flatten().cpu().tolist())

    # Report the mean loss over all evaluation batches rather than the loss
    # tensor of whichever batch happened to come last.
    eval_loss /= len(test_dataset)
    print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {eval_loss}')
    print(f'Accuracy: {accuracy_score(true_labels, pred_labels)}')
    print(f'F1 Score Weighted: {f1_score(true_labels, pred_labels, average="weighted")}')
    print(f'F1 Score Macro: {f1_score(true_labels, pred_labels, average="macro")}')
            