In [137]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import VisualBertForPreTraining, BertTokenizer, VisualBertModel
from transformers import VisualBertConfig

import pickle
import pandas as pd


In [180]:
class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.visual_bert = VisualBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.linear = nn.Linear(config.hidden_size, 1)
        
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        visual_embeds=None,
        visual_attention_mask=None,
        visual_token_type_ids=None,
    ):
        outputs = self.visual_bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            visual_embeds=visual_embeds,
            visual_attention_mask=visual_attention_mask,
            visual_token_type_ids=visual_token_type_ids,
        )
        pooled_output = self.dropout(outputs[1])
        linear_output = self.linear(pooled_output)
        output=torch.sigmoid(linear_output)
        return output


In [59]:
# Define the training dataset
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.indices = list(range(len(data)))  # set indices attribute
        print(self.data.keys())  
        print(f"Number of indices: {len(self.indices)}")


    def __getitem__(self, index):
        index = self.indices[index]  # get the actual index from self.indices
        text = self.data['text'][index]
        label = self.data['label'][index]
        embedded = self.data['embedded'][index]
        
        return text, label, embedded

    def __len__(self):
        return len(self.data)



In [132]:
def initialize_model( feature_extract, use_pretrained=True):
    # Initialize these variables which will be set in this if statement. Each of these
    #   variables is model specific.
    model_ft = None
    model_ft = VisualBertModel('uclanlp/visualbert-nlvr2-coco-pre', num_labels=2)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, 1)
    set_parameter_requires_grad(model_ft, feature_extract)
    return model_ft

In [181]:
# Define the pre-trained Visual-Bert model

#model=initialize_model(feature_extract= True) : ajouter une couche linéaire ?????
config= VisualBertConfig.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre')

model = Model(config)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [193]:
# Define the training loop

def train(model, tokenizer, train_dataset, optimizer, criterion, device, batch_size, epochs):
    model.to(device)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
    for epoch in range(epochs):
        running_loss = 0.0
        running_accuracy = 0.0
        for text,label,embedded in train_loader:
            optimizer.zero_grad()
            text_encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
            #image = embedded.to(device)
            text_encoded = {k: v.to(device) for k, v in text_encoded.items()}
            label = label.to(device)
            #print(text_encoded['input_ids'].shape)
            #print(embedded.shape)
            #print(label.shape)
            label = label.float().unsqueeze(1).to(device)
            print(label)
            inputs_ids=text_encoded['input_ids']
            attention_mask = text_encoded['attention_mask']
            #visual_attention_mask = embedded(embedded.shape[:-1], dtype=torch.long)
            #visual_token_type_ids = embedded(embedded.shape[:-1], dtype=torch.long)
            #token_type_ids = text_encoded["token_type_ids"]
            outputs = model(input_ids=inputs_ids, attention_mask=attention_mask, visual_embeds=embedded)
            #outputs=model(input_ids=inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, visual_embeds=embedded, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)           
            print(outputs)
            loss = criterion(outputs, label)
            loss.backward()
            optimizer.step()
            pred_labels = torch.round(outputs) # round the probabilities to obtain predicted labels
            print(pred_labels)
            correct_preds = (pred_labels == label).sum().item() # count the number of correct predictions
            accuracy = correct_preds / batch_size # calculate accuracy
            running_loss += loss.item()
            running_accuracy += accuracy
            
        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = running_accuracy / len(train_loader)
        print('Epoch [%d] - loss: %.4f - accuracy: %.4f' % (epoch+1, epoch_loss, epoch_accuracy))
            

#torch.from_numpy(embedded)

In [173]:

# Define the optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [41]:
df=pd.read_json("data/train.jsonl",lines=True)

l = []

with open(r"C:\Users\arman\OneDrive\Bureau\data\file.pkl","rb") as f:
    l = pickle.load(f)
f.close()
df.loc[:, 'embedded'] = l

In [189]:
essai=df.head(80)
train_dataset = MyDataset(essai)
print(len(train_dataset))  # should print the length of your train data

Index(['id', 'img', 'label', 'text', 'embedded'], dtype='object')
Number of indices: 80
80


In [194]:
train(model, tokenizer, train_dataset, optimizer, criterion, device, batch_size=8, epochs=3)

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])
tensor([[0.4622],
        [0.4647],
        [0.4015],
        [0.4731],
        [0.4492],
        [0.4524],
        [0.4803],
        [0.4446]], grad_fn=<SigmoidBackward0>)
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], grad_fn=<RoundBackward0>)
tensor([[1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])
tensor([[0.4194],
        [0.4832],
        [0.4665],
        [0.4793],
        [0.4358],
        [0.4791],
        [0.4321],
        [0.4399]], grad_fn=<SigmoidBackward0>)
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], grad_fn=<RoundBackward0>)
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])
tensor([[0.4380],
        [0.4731],
 