In [15]:
import pandas as pd
import numpy as np
from transformers import CLIPVisionModel, CLIPFeatureExtractor, AutoConfig
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import lightning.pytorch as pl
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, accuracy_score
from PIL import Image

from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor, EarlyStopping
from lightning.pytorch.loggers import CSVLogger
import os
import torchvision.transforms as T
from torchvision.transforms import Compose
from torch.utils.data import Dataset

# Seed the run through Lightning so that results are repeatable.
SEED = 1234542
pl.seed_everything(SEED, workers=True)

# Pre-computed train / validation / test splits.
df_train = pd.read_csv('../../data/splitted/train.csv')
df_validation = pd.read_csv('../../data/splitted/validation.csv')
df_test = pd.read_csv('../../data/splitted/test.csv')

# Size of the classifier output: number of distinct label ids seen in training.
NUM_CLASSES = len(pd.unique(df_train['labels']))

# Root folder of the images belonging to each split.
TRAIN_IMAGES_PATH = '../../images/train'
VALIDATION_IMAGES_PATH = '../../images/validation'
TEST_IMAGES_PATH = '../../images/test'

Global seed set to 1234542


In [16]:
# Hub checkpoint shared by the backbone, its configuration and the preprocessor.
MODEL_NAME = 'openai/clip-vit-base-patch32'

# Prefer the GPU whenever PyTorch can see one.
# NOTE(review): `device` is not referenced by the other cells shown here -- confirm it is still needed.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Vision tower only; the text-tower weights of the checkpoint are not loaded into it.
pretrained_model = CLIPVisionModel.from_pretrained(MODEL_NAME)

# The vision sub-config provides `hidden_size` for the classification head.
config = AutoConfig.from_pretrained(MODEL_NAME)
vision_config = config.vision_config

# Preprocessor that turns PIL images into `pixel_values` tensors.
image_processor = CLIPFeatureExtractor.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers

In [None]:
# Sanity scan of the training images: report every file that is not stored as
# RGB and check that it can be converted. Nothing is written back to disk.
for i, (label_text, image_id) in enumerate(zip(df_train['labels_text'], df_train['image_id'])):
    img_path = os.path.join(TRAIN_IMAGES_PATH, label_text, image_id) + '.jpg'
    # Image.open keeps the underlying file open; the context manager closes it
    # so the scan does not accumulate open handles.
    with Image.open(img_path) as image:
        original_mode = image.mode
        if original_mode != 'RGB':
            # The converted copy is discarded on purpose: this only checks that
            # the file decodes and converts without error.
            image.convert('RGB')
            # Report the mode found on disk (the converted copy is always 'RGB').
            print(f'Converted: {i} {original_mode}')

In [17]:
class CustomImageDataset(Dataset):
    """Map-style dataset yielding ``(RGB PIL image, label)`` pairs.

    Each sample is read from ``<img_dir>/<labels_text>/<image_id>.jpg``.

    Args:
        df: DataFrame providing the ``labels_text``, ``image_id`` and
            ``labels`` columns.
        img_dir: Root directory that contains one sub-folder per
            ``labels_text`` value.
    """

    def __init__(self, df, img_dir):
        self.df = df
        self.img_dir = img_dir

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            tuple: ``(image, label)`` where ``image`` is an RGB PIL image that
            is fully decoded in memory and ``label`` is the value of the
            ``labels`` column for that row.
        """
        label_text = self.df['labels_text'].iloc[idx]
        image_id = self.df['image_id'].iloc[idx]
        img_path = os.path.join(self.img_dir, label_text, image_id) + '.jpg'

        # Image.open is lazy and keeps the file open. convert() decodes the
        # pixels and returns an independent copy, so the handle can be closed
        # immediately instead of leaking until garbage collection.
        with Image.open(img_path) as source:
            image = source.convert('RGB')

        label = self.df['labels'].iloc[idx]
        return image, label
    
# One dataset per split, each pointing at its own image root.
train_dataset = CustomImageDataset(df=df_train, img_dir=TRAIN_IMAGES_PATH)
validation_dataset = CustomImageDataset(df=df_validation, img_dir=VALIDATION_IMAGES_PATH)
test_dataset = CustomImageDataset(df=df_test, img_dir=TEST_IMAGES_PATH)

In [19]:
class VisionlCollator:
    """Batch collator: optional train-time augmentation, then preprocessing.

    Args:
        processor: Callable invoked as ``processor(images=..., return_tensors='pt')``.
        augment_mode: ``'hard'`` or ``'soft'``; any other value disables
            augmentation.
        split: Augmentation is applied only when this equals ``'train'``.
    """

    # Augmenters are built once at class level and shared by every instance.
    HARD_IMG_AUGMENTER = T.RandAugment(num_ops=6, magnitude=9)
    SOFT_IMG_AUGMENTER = Compose([
        T.RandomPerspective(.1, p=.5),
        T.RandomHorizontalFlip(p=.5),
    ])

    def __init__(self, processor=image_processor, augment_mode='hard', split='train'):
        self.processor = processor
        self.split = split
        self.augment_mode = augment_mode

    def __call__(self, batch):
        """Collate a list of ``(image, label)`` pairs into one model-ready batch.

        Returns:
            The processor's encoding with an extra ``'labels'`` tensor.
        """
        images, labels = list(zip(*batch))

        # Choose the augmenter, if any, for this split / mode combination.
        augmenter = None
        if self.split == 'train':
            if self.augment_mode == 'hard':
                augmenter = self.HARD_IMG_AUGMENTER
            elif self.augment_mode == 'soft':
                augmenter = self.SOFT_IMG_AUGMENTER

        if augmenter is not None:
            images = [augmenter(picture) for picture in images]

        encoding = self.processor(images=images, return_tensors='pt')
        encoding['labels'] = torch.tensor(labels)
        return encoding

In [20]:
BATCH_SIZE = 8

# Only the training collator augments; the 'val' and 'test' splits are passed through unchanged.
collator_train = VisionlCollator(split='train')
collator_val = VisionlCollator(split='val')
collator_test = VisionlCollator(split='test')

# Shuffle the training data only, so evaluation order stays fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collator_train,
)
validation_loader = DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collator_val,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collator_test,
)

In [21]:
class ImageClassifier(pl.LightningModule):
    """Vision backbone followed by a two-layer classification head.

    The backbone's ``pooler_output`` goes through ``Linear -> GELU -> Linear``
    and yields ``NUM_CLASSES`` logits.

    Training metrics are the mean of the per-batch values (a mean of means).
    Validation accuracy and macro-F1 are computed once per epoch over all
    accumulated predictions, which is more precise than averaging batches.

    Args:
        model: Backbone called as ``model(pixel_values=...)`` whose output
            exposes ``pooler_output``.
        lr: Learning rate passed to AdamW.
    """

    def __init__(self, model=pretrained_model,  lr=2e-5):
        super().__init__()
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr

        # Per-batch training metrics, averaged and cleared at every epoch end.
        self.train_loss = []
        self.train_accs = []
        self.train_f1s = []

        # Validation: per-batch losses plus every label / prediction of the
        # epoch, so accuracy and F1 are computed over the full set.
        self.val_loss = []
        self.all_val_y_true = []
        self.all_val_y_pred = []

        self.model = model
        self.fc1 = nn.Linear(vision_config.hidden_size, 512)
        self.activation1 = nn.GELU()
        self.output = nn.Linear(512, NUM_CLASSES)

    def compute_outputs(self, pixel_values):
        """Return classification logits for a batch of ``pixel_values``."""
        backbone_out = self.model(pixel_values=pixel_values)
        pooled = backbone_out.pooler_output
        hidden = self.activation1(self.fc1(pooled))
        return self.output(hidden)

    def forward(self, batch):
        """Return logits for a collated batch (reads ``batch['pixel_values']``)."""
        return self.compute_outputs(batch['pixel_values'])

    def _shared_step(self, batch):
        """Run the model on ``batch`` and return ``(loss, labels, preds)``."""
        labels = batch['labels']
        logits = self.compute_outputs(batch['pixel_values'])
        loss = self.criterion(logits, labels)
        preds = torch.argmax(logits, dim=-1)
        return loss, labels, preds

    def training_step(self, batch, batch_idx):
        """Compute the training loss and record per-batch loss / accuracy / F1."""
        loss, labels, preds = self._shared_step(batch)
        y_true, y_pred = labels.tolist(), preds.tolist()

        # Store a detached loss: keeping the live tensor would hold a reference
        # to its autograd graph for the whole epoch.
        self.train_loss.append(loss.detach())
        self.train_accs.append(accuracy_score(y_true=y_true, y_pred=y_pred))
        self.train_f1s.append(f1_score(y_true=y_true, y_pred=y_pred, average='macro'))

        return loss

    def on_train_epoch_end(self):
        """Log the epoch means of the per-batch training metrics and reset them."""
        # Nothing to average if no training batch ran (avoids ZeroDivisionError).
        if not self.train_loss:
            return

        mean_loss = torch.stack(self.train_loss).mean()
        mean_acc = sum(self.train_accs) / len(self.train_accs)
        mean_f1 = sum(self.train_f1s) / len(self.train_f1s)

        self.log("train_loss", mean_loss)
        self.log("train_acc", mean_acc)
        self.log("train_f1", mean_f1)

        self.train_loss = []
        self.train_accs = []
        self.train_f1s = []

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and accumulate labels / predictions."""
        loss, labels, preds = self._shared_step(batch)

        self.val_loss.append(loss.detach())
        self.all_val_y_true.extend(labels.tolist())
        self.all_val_y_pred.extend(preds.tolist())
        return loss

    def on_validation_epoch_end(self):
        """Log mean validation loss and whole-epoch accuracy / macro-F1, then reset."""
        # Nothing to report if no validation batch ran.
        if not self.val_loss:
            return

        mean_loss = torch.stack(self.val_loss).mean()
        acc = accuracy_score(y_true=self.all_val_y_true, y_pred=self.all_val_y_pred)
        f1 = f1_score(y_true=self.all_val_y_true, y_pred=self.all_val_y_pred, average='macro')

        self.log("val_loss", mean_loss)
        self.log("val_acc", acc)
        self.log("val_f1", f1)

        self.val_loss = []
        self.all_val_y_true = []
        self.all_val_y_pred = []

    def configure_optimizers(self):
        """AdamW plus a ReduceLROnPlateau scheduler driven by ``val_loss``."""
        optimizer = optim.AdamW(self.parameters(), lr=self.lr, amsgrad=True, weight_decay=0.01)
        scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.1, patience=5)
        return {
                "optimizer": optimizer,
                "lr_scheduler": {
                    "scheduler": scheduler,
                    "monitor": "val_loss",
                    },
                }

In [22]:
# NOTE(review): MODEL_NAME contains a '/', so this name presumably creates a nested
# 'openai/' folder under both the checkpoint and the log directory -- confirm that is intended.
experiment_name = f'{MODEL_NAME}_only'

# Callbacks: keep the checkpoint with the best validation F1, record the
# learning rate once per epoch, and stop once validation F1 stops improving.
checkpoint_callback = ModelCheckpoint(
    dirpath='../../model_ckpts/Unimodal/Image',
    filename=experiment_name,
    monitor='val_f1',
    mode='max',
)
lr_monitor = LearningRateMonitor(logging_interval='epoch')
early_stopping = EarlyStopping(monitor='val_f1', patience=10, mode='max')

# Metrics are written as CSV files under the experiment's log directory.
logger = CSVLogger(save_dir="../../logs/Unimodal/Image", name=experiment_name)

my_model = ImageClassifier(pretrained_model)

# Single-GPU, mixed-precision run; gradients are accumulated over 2 batches.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    deterministic=True,
    max_epochs=40,
    logger=logger,
    precision='16-mixed',
    accumulate_grad_batches=2,
    callbacks=[lr_monitor, early_stopping, checkpoint_callback],
)
trainer.fit(
    model=my_model,
    train_dataloaders=train_loader,
    val_dataloaders=validation_loader,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name        | Type             | Params
-------------------------------------------------
0 | criterion   | CrossEntropyLoss | 0     
1 | model       | CLIPVisionModel  | 87.5 M
2 | fc1         | Linear           | 393 K 
3 | activation1 | GELU             | 0     
4 | output      | Linear           | 12.3 K
-------------------------------------------------
87.9 M    Trainable params


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0:  67%|######7   | 4063/6023 [28:02<13:31,  2.41it/s, v_num=0]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
