# Imports, Installs and Definitions

In [1]:
!pip install transformers --quiet
!pip install neptune-client --quiet
!pip install datasets --quiet
# !pip install torchvision --quiet
!pip install ipywidgets --quiet
# !pip install adabelief-pytorch
# !pip install wandb -qqq
# !pip install wandb --upgrade

In [2]:
import os
import math
import glob
import torch
import json
import random
import requests
import numpy as np
import pandas as pd
from PIL import Image
import torch.nn as nn
from torch.optim import Adam
import neptune.new as neptune
from torchvision import datasets
from datasets import load_dataset
from torchvision import transforms
from torch.utils.data import IterableDataset
from transformers import AutoConfig, AutoModel
from torch.utils.data import Dataset,DataLoader
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer,CLIPFeatureExtractor
# from adabelief_pytorch import AdaBelief


from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads
from transformers import AutoModel  # or BertModel, for BERT without pretraining heads
from transformers import CLIPVisionModel

# it makes PIL ignore truncated blocks within a image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# import wandb

In [3]:
torch.cuda.is_available()

False

In [4]:
import torch
torch.zeros(1).cuda()

AssertionError: Torch not compiled with CUDA enabled

In [3]:
run = neptune.init(
    project="alyssonbm/CLIP-PTBR",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJjNDgzMzU0OS01N2JmLTQyN2YtODQ5My01ZTE1MDg1ODRiNmQifQ==",
)  # your credentials

https://app.neptune.ai/alyssonbm/CLIP-PTBR/e/CLIP3-48
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [4]:
# wandb.login()

wandb: Currently logged in as: alyssonbm. Use `wandb login --relogin` to force relogin


True

In [5]:
# wandb.finish()

In [6]:
# wandb.init(project="CLIP-PTBR", entity="alyssonbm")

In [7]:
params = {
    'BATCH_SIZE': 16, 
    "learning_rate": 1e-5, 
    "optimizer": "Adam", 
    'weight_decay':0.001,
    'n_epochs' : 200,
    'patience':10,
    'logs':'all',
    'padding_size' : 95,
    'accumulate_grad_batches': 32,
    'frozen_time' : 10,
}

re_do_csv_pracegover_dataset = False

run["parameters"] = params
# wandb.config = params

## PraCegoVer Data

In [8]:
image_folder = '../../../datasets/pracegover/images'
f = open('../../../datasets/pracegover/pracegover_173k/pracegover_dataset.json')
dataset = json.load(f)
f.close()

f = open('../../../datasets/pracegover/pracegover_173k/pracegover_captions_val2014.json')
dataset_valid = json.load(f)
f.close()

In [9]:
df_pracegover = []
for data in dataset['images']:
    d = {
        'image_name' : data['filename'],  # some formula for obtaining values
        'filepath' : data['filepath'],
        'comment' : data['sentences'][0]['raw'],
    }
    df_pracegover.append(d)

df_pracegover = pd.DataFrame(df_pracegover)

In [10]:
df_pracegover.head()

Unnamed: 0,image_name,filepath,comment
0,i-00081339.jpg,train2014,"Imagem de um notebook aberto em uma planilha, ..."
1,i-00202456.jpg,train2014,imagem com fundo roxo com plantas de cannabis....
2,i-00202458.jpg,train2014,Na imagem uma ilustração bem forte de caveiras...
3,i-00202455.jpg,train2014,"Imagem ilustrativa de um desenho de época, da ..."
4,i-00202457.jpg,train2014,"Ilustração com um homem em estrada de asfalto,..."


In [11]:
valid_annotations = []
for data in dataset_valid['annotations']:
    d = {
        'image_id' : data['image_id'],  # some formula for obtaining values
        'comment' : data['caption'].replace('\n'," "),
    }
    valid_annotations.append(d)

valid_annotations = pd.DataFrame(valid_annotations)

In [12]:
valid_file_names = []
for data in dataset_valid['images']:
    d = {
        'image_id' : data['id'],  # some formula for obtaining values
        'image_name' : data['file_name'],
    }
    valid_file_names.append(d)

valid_file_names = pd.DataFrame(valid_file_names)

In [13]:
df_pracegover_valid = pd.merge(valid_annotations, valid_file_names, how='inner')

In [14]:
df_pracegover_valid.head()

Unnamed: 0,image_id,comment,image_name
0,391760,versos em fotografia de um pássaro com os pés ...,i-00391760.jpg
1,208391,poema em página de livro § time-lapse § a flor...,i-00208391.jpg
2,382172,versos sobre fotografia de um pássaro pousado ...,i-00382172.jpg
3,382501,versos sobre fotografia de pegadas sobre a nev...,i-00382501.jpg
4,191147,poema em página de livro § tocata e fuga § que...,i-00191147.jpg


## Dataset and Dataloaders

In [15]:
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [16]:
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

In [17]:
from typing import Tuple

class image_caption_dataset(Dataset):
    def __init__(self, df_flickr,flicker_image_folder):

        self.dataset = df_flickr
        self.flicker_image_folder = flicker_image_folder

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset['comment'][idx], (Image.open(self.flicker_image_folder+ '/' + self.dataset['image_name'][idx]))

image_text_dataset = image_caption_dataset(df_pracegover,image_folder)
image_text_dataset_valid = image_caption_dataset(df_pracegover_valid,image_folder)


class collate_func_text_img:
    def __init__(self, clip_processor,betimbau_tokenizer):
        self.clip_processor = clip_processor
        self.betimbau_tokenizer = betimbau_tokenizer
    def __call__(self, batch: list) -> Tuple[torch.Tensor, torch.Tensor]:
        text = []
        image = []
        for i in batch:
            text_value, image_value = i
            text.append(text_value)
            image.append(image_value)
            image_input = self.clip_processor(images=image, return_tensors="pt", padding=True, truncation=True)
        text_input = self.betimbau_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=params['padding_size'])
        
        return image_input,text_input

dataloader_img_txt = DataLoader(dataset=image_text_dataset, batch_size=params['BATCH_SIZE'], collate_fn=collate_func_text_img(processor,tokenizer), drop_last=True)
dataloader_img_txt_valid = DataLoader(dataset=image_text_dataset_valid, batch_size=params['BATCH_SIZE'], collate_fn=collate_func_text_img(processor,tokenizer), drop_last=True)

In [18]:
one_batch_exemple = next(iter(dataloader_img_txt))

In [21]:
one_batch_exemple[0]

torch.Size([16, 3, 224, 224])

# Training

In [20]:
class CLIPTBR(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.projection_dim = 512
        self.model_clip = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
        self.model_bertimbau = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased').to(device)
        self.model_clip.gradient_checkpointing_enable()
        self.model_bertimbau.gradient_checkpointing_enable()
        self.visual_projection = nn.Linear(self.model_clip.vision_model.post_layernorm.normalized_shape[0], self.projection_dim, bias=False).to(device)
        self.text_projection = nn.Linear(self.model_bertimbau.pooler.dense.in_features, self.projection_dim, bias=False).to(device)

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)).to(device)

    def encode_visual(self, visual_inputs):
        outputs = self.model_clip(visual_inputs.to(device))
        hidden_states = outputs.pooler_output

        return self.visual_projection(hidden_states)

    def encode_text(self, text_inputs):
        outputs = self.model_bertimbau(**text_inputs.to(device))
        return self.text_projection(outputs.pooler_output)

    def forward(self, data):
        image_input, text_input = data
        image_features = self.encode_visual(image_input['pixel_values'])
        text_features = self.encode_text(text_input)
        
        return image_features, text_features

    def compute_logits(self, image_features, text_features,fixed_logit):
        # normalized features
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)

        # cosine similarity as logits
        if (fixed_logit) >= 0:
            logit_scale = self.logit_scale.exp()
        else:
            logit_scale = 20
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        # shape: [batch_size, batch_size]
        return logits_per_image, logits_per_text
    
    
    def model_requires_grad(self, status = True):
        for param in self.model_clip.parameters():
            param.requires_grad = status
        for param in self.model_bertimbau.parameters():
            param.requires_grad = status

In [15]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [16]:
device

device(type='cpu')

In [22]:
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))

def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.T)
    return (caption_loss + image_loss) / 2.0

In [23]:
clip_ptbr = CLIPTBR()

clip_ptbr= nn.DataParallel(clip_ptbr)
clip_ptbr.to(device)

Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.

DataParallel(
  (module): CLIPTBR(
    (model_clip): CLIPVisionModel(
      (vision_model): CLIPVisionTransformer(
        (embeddings): CLIPVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
          (position_embedding): Embedding(50, 768)
        )
        (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (encoder): CLIPEncoder(
          (layers): ModuleList(
            (0): CLIPEncoderLayer(
              (self_attn): CLIPAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): Linear(in_features=768, out_features=768, bias=True)
                (q_proj): Linear(in_features=768, out_features=768, bias=True)
                (out_proj): Linear(in_features=768, out_features=768, bias=True)
              )
              (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (mlp): CLIPMLP(
                (ac

In [None]:
best_batch_text_accuracy = 0
optimizer = Adam(clip_ptbr.parameters(), lr=params["learning_rate"],betas=(0.9,0.98),eps=1e-6,weight_decay=params['weight_decay']) #Params from paper
# optimizer = AdaBelief(clip_ptbr.parameters(), params["learning_rate"], eps=0.001, betas=(0.9,0.999), weight_decouple = True, rectify = False)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=params["patience"], factor = 0.9)

for epoch in range(params["n_epochs"]): 
    clip_ptbr.train()
    accumulated_loss = 0
    accumulated_batch_image_accuracy = 0
    accumulated_batch_text_accuracy = 0
    accumulated_text_image = 0
    image_feature_list = []
    text_feature_list = []
    
    # if epoch <= params['frozen_time']:
    #     clip_ptbr.module.model_requires_grad(False)
    # else:
    #     clip_ptbr.module.model_requires_grad()
        
    for batch_idx, data in enumerate(dataloader_img_txt):
        
        mb_image_features, mb_text_features = clip_ptbr(data)
        image_feature_list.append(mb_image_features)
        text_feature_list.append(mb_text_features)
        
        if (batch_idx + 1) % params["accumulate_grad_batches"] == 0:
            optimizer.zero_grad()
            
            image_features = torch.concat(image_feature_list, dim=0)
            text_features = torch.concat(text_feature_list, dim=0)
            logits_per_image, logits_per_text = clip_ptbr.module.compute_logits(image_features, text_features,fixed_logit=-1)
            loss = clip_loss(logits_per_text)
            
            loss.backward() 
            optimizer.step()

            ground_truth = torch.arange((params['BATCH_SIZE'] * params["accumulate_grad_batches"])).to(device)
            accumulated_loss += loss
                    
            #Accuracy Calcs
            preds_image = logits_per_image.argmax(dim=1)
            preds_text = logits_per_text.argmax(dim=1)        
            

            batch_image_accuracy = ((preds_image == ground_truth).sum()) / (params['BATCH_SIZE'] * params["accumulate_grad_batches"])            
            batch_text_accuracy = ((preds_text == ground_truth).sum()) / (params['BATCH_SIZE'] * params["accumulate_grad_batches"])
            batch_text_image = (batch_image_accuracy + batch_text_accuracy)/2
            accumulated_batch_image_accuracy += (preds_image == ground_truth).sum()
            accumulated_batch_text_accuracy += (preds_text == ground_truth).sum()
            accumulated_text_image += accumulated_batch_image_accuracy + accumulated_batch_text_accuracy
            
            # Neptune Logs
            run["loss"].log(loss)
            run["image/accuracy"].log(batch_image_accuracy)
            run["text/accuracy"].log(batch_text_accuracy)
            run["text+image/accuracy"].log(batch_text_image)
            run["learning_rate"].log(optimizer.param_groups[0]["lr"])
            # wandb.log({"loss": loss})
            # wandb.log({"image/accuracy": batch_image_accuracy})
            # wandb.log({"text/accuracy": batch_text_accuracy})
            # wandb.log({"text+image/accuracy": batch_text_image})
            # wandb.log({"learning_rate": optimizer.param_groups[0]["lr"]})
            
            image_feature_list = []
            text_feature_list = []
            
            if(batch_text_accuracy > best_batch_text_accuracy):
                torch.save(clip_ptbr.state_dict(), 'clip_ptbr_weights')
        
    train_loss = accumulated_loss / (params['BATCH_SIZE'] * params["accumulate_grad_batches"])
    train_accumulated_batch_image_accuracy = accumulated_batch_image_accuracy / len(dataloader_img_txt.dataset)
    train_accumulated_batch_text_accuracy = accumulated_batch_text_accuracy / len(dataloader_img_txt.dataset)
    train_accumulated_text_image = accumulated_text_image / (len(dataloader_img_txt.dataset) * 2)
    
    run['train/loss'].log(train_loss)
    run['train/batch_image_accuracy'].log(train_accumulated_batch_image_accuracy)
    run['train/batch_text_accuracy'].log(train_accumulated_batch_text_accuracy)
    run['train/batch_text_image'].log(train_accumulated_text_image)
    # wandb.log({"train/loss": train_loss})
    # wandb.log({"train/batch_image_accuracy": train_accumulated_batch_image_accuracy})
    # wandb.log({"train/batch_text_accuracy": train_accumulated_batch_text_accuracy})
    # wandb.log({"train/batch_text_image": train_accumulated_text_image})
    
    clip_ptbr.eval()
    # Laço de Validação, um a cada época.
    accumulated_loss = 0
    accumulated_batch_image_accuracy = 0
    accumulated_batch_text_accuracy = 0
    accumulated_text_image = 0
    image_feature_list = []
    text_feature_list = []
    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader_img_txt_valid):
            mb_image_features, mb_text_features = clip_ptbr(data)
            image_feature_list.append(mb_image_features)
            text_feature_list.append(mb_text_features)
            
            if (batch_idx + 1) % params["accumulate_grad_batches"] == 0:
                
                image_features = torch.concat(image_feature_list, dim=0)
                text_features = torch.concat(text_feature_list, dim=0)
                logits_per_image, logits_per_text = clip_ptbr.module.compute_logits(image_features, text_features,fixed_logit=-1)
                
                loss = clip_loss(logits_per_text)
                accumulated_loss += loss
            
                ground_truth = torch.arange((params['BATCH_SIZE'] * params["accumulate_grad_batches"])).to(device)                
            
                #Accuracy Calcs
                preds_image = logits_per_image.argmax(dim=1)
                preds_text = logits_per_text.argmax(dim=1)        

                batch_image_accuracy = ((preds_image == ground_truth).sum()) / (params['BATCH_SIZE'] * params["accumulate_grad_batches"])            
                batch_text_accuracy = ((preds_text == ground_truth).sum()) / (params['BATCH_SIZE'] * params["accumulate_grad_batches"])
                batch_text_image = (batch_image_accuracy + batch_text_accuracy)/2
                accumulated_batch_image_accuracy += (preds_image == ground_truth).sum()
                accumulated_batch_text_accuracy += (preds_text == ground_truth).sum()
                accumulated_text_image += accumulated_batch_image_accuracy + accumulated_batch_text_accuracy

                # Neptune Logs
                run["valid/loss"].log(loss)
                run["valid/image/accuracy"].log(batch_image_accuracy)
                run["valid/text/accuracy"].log(batch_text_accuracy)
                run["valid/text+image/accuracy"].log(batch_text_image)
                # wandb.log({"valid/loss": loss})
                # wandb.log({"valid/image/accuracy": batch_image_accuracy})
                # wandb.log({"valid/text/accuracy": batch_text_accuracy})
                # wandb.log({"valid/text+image/accuracy": batch_text_image})
                
                image_feature_list = []
                text_feature_list = []

    valid_loss = accumulated_loss / (params['BATCH_SIZE'] * params["accumulate_grad_batches"])
    valid_accumulated_batch_image_accuracy = accumulated_batch_image_accuracy / len(dataloader_img_txt_valid.dataset)
    valid_accumulated_batch_text_accuracy = accumulated_batch_text_accuracy / len(dataloader_img_txt_valid.dataset)
    valid_accumulated_text_image = accumulated_text_image / (len(dataloader_img_txt_valid.dataset) * 2)

    run['valid/loss'].log(valid_loss)
    run['valid/batch_image_accuracy'].log(valid_accumulated_batch_image_accuracy)
    run['valid/batch_text_accuracy'].log(valid_accumulated_batch_text_accuracy)
    run['valid/batch_text_image'].log(valid_accumulated_text_image)
    # wandb.log({"valid/total/loss": valid_loss})
    # wandb.log({"valid/total/batch_image_accuracy": valid_accumulated_batch_image_accuracy})
    # wandb.log({"valid/total/batch_text_accuracy": valid_accumulated_batch_text_accuracy})
    # wandb.log({"valid/total/batch_text_image": valid_accumulated_text_image})
    
    scheduler.step(loss)
        
    # wandb.watch(clip_ptbr)

wandb: Network error (ReadTimeout), entering retry loop.
wandb: Network error (ReadTimeout), entering retry loop.


In [None]:
run.stop()
# wandb.finish()

In [None]:
# Logit scale = 20
# Max_num_tokens = 95 
# Add Validation
# Linear layer = fixed
# More than one GPU
# Wraper class
# Batch_size = 512


## TODO
# Optimizer = 'adabeleave + some things'

In [None]:
# for para in clip_ptbr.parameters():
#     print(para.requires_grad)