In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import VisualBertForPreTraining, BertTokenizer, VisualBertModel
from transformers import VisualBertConfig

import pickle
import pandas as pd


In [None]:
class Model(nn.Module):
    """Binary classifier: a VisualBERT encoder followed by a single-unit head.

    The encoder is built from ``config`` alone (``VisualBertModel(config)``),
    so this class does not load any checkpoint weights by itself.
    """

    def __init__(self, config):
        """Create the encoder, the dropout layer and the 1-unit linear head.

        Args:
            config: configuration object; ``hidden_dropout_prob`` and
                ``hidden_size`` are read from it.
        """
        super().__init__()
        self.config = config
        self.visual_bert = VisualBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.linear = nn.Linear(config.hidden_size, 1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        visual_embeds=None,
        visual_attention_mask=None,
        visual_token_type_ids=None,
    ):
        """Return the sigmoid of the head output, i.e. values in (0, 1).

        Every argument is forwarded unchanged, by keyword, to the encoder.
        """
        encoder_kwargs = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            visual_embeds=visual_embeds,
            visual_attention_mask=visual_attention_mask,
            visual_token_type_ids=visual_token_type_ids,
        )
        # Element 1 of the encoder output is used as the pooled representation.
        pooled = self.visual_bert(**encoder_kwargs)[1]
        logits = self.linear(self.dropout(pooled))
        return torch.sigmoid(logits)


In [None]:
# Define the training dataset
class MyDataset(Dataset):
    """Map-style dataset yielding ``(text, label, embedded)`` triples.

    ``data`` must support ``len()``, ``.keys()`` and ``data[column][i]`` for
    the columns 'text', 'label' and 'embedded'.
    """

    def __init__(self, data):
        """Store ``data``, build the position list and print a short summary."""
        self.data = data
        self.indices = [*range(len(data))]  # positions 0 .. len(data) - 1
        print(self.data.keys())  
        print(f"Number of indices: {len(self.indices)}")

    def __len__(self):
        """Number of items, taken from the wrapped container."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(text, label, embedded)`` triple stored at ``index``."""
        row = self.indices[index]  # translate the position through self.indices
        text, label, embedded = (
            self.data[column][row] for column in ('text', 'label', 'embedded')
        )
        return text, label, embedded



In [None]:
# Define the pre-trained Visual-Bert model

# Checkpoint that provides both the architecture (config) and the encoder weights.
VISUALBERT_CHECKPOINT = 'uclanlp/visualbert-nlvr2-coco-pre'

config = VisualBertConfig.from_pretrained(VISUALBERT_CHECKPOINT)
model = Model(config)
# Model(config) builds VisualBertModel(config), which only copies the
# architecture: its weights are randomly initialised. Swap in the encoder with
# the checkpoint weights so training really starts from the pre-trained model.
# The linear classification head stays freshly initialised.
model.visual_bert = VisualBertModel.from_pretrained(VISUALBERT_CHECKPOINT, config=config)
# from_pretrained() returns the module in eval mode (dropout disabled);
# put the whole model back in training mode, which the training loop expects.
model.train()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [None]:
# Define the training loop

def train(model, tokenizer, train_dataset, optimizer, criterion, device, batch_size, epochs):
    """Train ``model`` on ``train_dataset`` and print one metrics line per epoch.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            visual_embeds=...)``; its output is compared with the labels after
            rounding, so it must return probabilities.
        tokenizer: callable invoked as ``tokenizer(text, return_tensors='pt',
            padding=True, truncation=True)``; the result must expose
            'input_ids' and 'attention_mask' tensors.
        train_dataset: dataset whose items are ``(text, label, embedded)``.
        optimizer: optimizer already bound to the model parameters.
        criterion: loss called as ``criterion(outputs, label)``.
        device: device the model and every batch are moved to.
        batch_size: number of samples per mini-batch.
        epochs: number of passes over the dataset.

    A batch that raises an ``Exception`` is skipped (best effort), but the
    skip is counted and reported at the end of the epoch instead of being
    silently ignored. Metrics are computed over the batches/samples that
    were actually processed.
    """
    model.to(device)
    # Make the mode explicit: dropout must be active while fitting.
    model.train()
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
    for epoch in range(epochs):
        running_loss = 0.0
        correct_preds = 0
        seen_samples = 0
        processed_batches = 0
        skipped_batches = 0
        first_error = None
        for batch in train_loader:
            try:
                text, label, embedded = batch
                optimizer.zero_grad()
                text_encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
                input_ids = text_encoded['input_ids'].to(device)
                attention_mask = text_encoded['attention_mask'].to(device)
                visual_embeds = embedded.to(device)
                label = label.float().unsqueeze(1).to(device)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, visual_embeds=visual_embeds)
                loss = criterion(outputs, label)
                loss.backward()
                optimizer.step()
            except Exception as exc:
                # Keep the best-effort behaviour, but remember what went wrong.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                skipped_batches += 1
                if first_error is None:
                    first_error = exc
                continue

            # Round the probabilities to obtain the predicted labels.
            pred_labels = torch.round(outputs.detach())
            correct_preds += (pred_labels == label).sum().item()
            # Count real samples: the last batch may hold fewer than batch_size.
            seen_samples += label.size(0)
            running_loss += loss.item()
            processed_batches += 1

        if skipped_batches:
            print('Epoch [%d] - skipped %d of %d batches; first error: %r'
                  % (epoch + 1, skipped_batches, processed_batches + skipped_batches, first_error))

        # Average only over what was really processed; NaN when nothing was.
        epoch_loss = running_loss / processed_batches if processed_batches else float('nan')
        epoch_accuracy = correct_preds / seen_samples if seen_samples else float('nan')
        print('Epoch [%d] - loss: %.4f - accuracy: %.4f' % (epoch+1, epoch_loss, epoch_accuracy))
            



In [None]:

# Define the optimizer and loss function

# Fine-tuning a BERT-sized encoder needs a small step size. With lr=1e-3 the
# recorded runs never improve: the loss stays around ln(2) ~ 0.69 and the
# accuracy is flat. Values in the 2e-5 .. 5e-5 range are the usual choice.
LEARNING_RATE = 2e-5
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# Model.forward already applies a sigmoid, so the loss must take probabilities
# (BCELoss), not raw logits.
criterion = nn.BCELoss()


In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the files
# read further down (under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Both input files live in the same Drive folder; keep its location in one place.
DATA_DIR = "/content/drive/MyDrive/ENSAE/Informatique/Statapp/data"

# One JSON record per line.
df = pd.read_json(f"{DATA_DIR}/train.jsonl", lines=True)

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising;
# only load file.pkl if it was produced by a trusted source.
with open(f"{DATA_DIR}/file.pkl", "rb") as f:
    l = pickle.load(f)  # the with-block closes the file; no explicit close() needed

# Presumably one entry per row of train.jsonl, in the same order - TODO confirm.
df.loc[:, 'embedded'] = l

In [None]:
# Values that cannot be parsed as a number are coerced to NaN, so `mask` is
# True for rows whose 'embedded' entry is non-numeric (or missing). Only those
# rows are kept.
# NOTE(review): presumably numeric entries are placeholders for images without
# an embedding - confirm against the code that produced the pickle.
mask = pd.isna(pd.to_numeric(df['embedded'], errors='coerce'))
df = df.loc[mask]
print(len(df))

8464


In [None]:
# The filter above leaves gaps in the index. MyDataset looks items up with
# data[column][i] for i in range(len(data)), so give the frame a fresh
# consecutive index; drop=True discards the old one instead of keeping it
# as a column.
df = df.reset_index(drop=True)


In [None]:
# Wrap the filtered, re-indexed frame; the constructor prints the column keys
# and the number of indices as a quick sanity check.
train_dataset = MyDataset(df)
print(len(train_dataset))  # number of items available for training

Index(['id', 'img', 'label', 'text', 'embedded'], dtype='object')
Number of indices: 8464
8464


In [None]:
# Launch training: mini-batches of 8 samples for 10 epochs.
# train() prints one loss/accuracy line per epoch.
train(model, tokenizer, train_dataset, optimizer, criterion, device, batch_size=8, epochs=10)

Epoch [1] - loss: 0.6856 - accuracy: 0.6231
Epoch [2] - loss: 0.7033 - accuracy: 0.6265
Epoch [3] - loss: 0.7121 - accuracy: 0.6243
Epoch [4] - loss: 0.7070 - accuracy: 0.6257
Epoch [5] - loss: 0.7092 - accuracy: 0.6259
Epoch [6] - loss: 0.7079 - accuracy: 0.6272
Epoch [7] - loss: 0.7075 - accuracy: 0.6282
Epoch [8] - loss: 0.7062 - accuracy: 0.6268
Epoch [9] - loss: 0.7113 - accuracy: 0.6256
Epoch [10] - loss: 0.7094 - accuracy: 0.6257


Epoch [1] - loss: 0.6736 - accuracy: 0.6287
Epoch [2] - loss: 0.6910 - accuracy: 0.6282
Epoch [3] - loss: 0.6976 - accuracy: 0.6298
Epoch [4] - loss: 0.6989 - accuracy: 0.6325
Epoch [5] - loss: 0.6991 - accuracy: 0.6285
Epoch [6] - loss: 0.6977 - accuracy: 0.6298
Epoch [7] - loss: 0.6991 - accuracy: 0.6320
Epoch [8] - loss: 0.6973 - accuracy: 0.6323
Epoch [9] - loss: 0.6972 - accuracy: 0.6263
Epoch [10] - loss: 0.6978 - accuracy: 0.6278