In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False
    else : 
        for param in model.parameters():
            param.requires_grad = True

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import VisualBertForPreTraining, BertTokenizer, VisualBertModel, VisualBertForVisualReasoning
from transformers import VisualBertConfig

import numpy as np
import pickle
import pandas as pd


In [4]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


Définition des classes dont on a besoin

In [5]:
class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.visual_bert = VisualBertModel(config=config)
        self.dropout = nn.Dropout(0.5) #config.hidden_dropout_prob
        self.linear1 = nn.Linear(config.hidden_size,int(config.hidden_size/4))
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(int(config.hidden_size/4), 2)  # Ajout de la seconde couche linéaire  
        self.attentions = None  # Add an `attentions` attribute to the class
        self.pooler_output  = None
        self.last_hidden_state = None
        set_parameter_requires_grad(self.visual_bert, False) #true pour entrainer que la dernière couche

        #for param in self.visual_bert.parameters():
         #   param.requires_grad = False
        #for param in self.dropout.parameters():
         #   param.requires_grad = False
        
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        visual_embeds=None,
        visual_attention_mask=None,
        visual_token_type_ids=None,
    ):
        outputs  = self.visual_bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            visual_embeds=visual_embeds,
            visual_attention_mask=visual_attention_mask,
            visual_token_type_ids=visual_token_type_ids,
            output_attentions=True
        )
        otp=outputs.pooler_output 
        pooled_output = self.dropout(otp)
        linear_output1 = self.linear1(pooled_output)
        relu_output = self.relu(linear_output1)
        linear_output2 = self.linear2(relu_output)  # Passage par la seconde couche linéaire      
        Soutput=torch.softmax(linear_output2, dim=-1)
        attentions = outputs.attentions # extract attention scores from the model outputs
        

        return Soutput, attentions


In [6]:
# Define the training dataset
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.indices = list(range(len(data)))  # set indices attribute
        print(self.data.keys())  
        print(f"Number of indices: {len(self.indices)}")


    def __getitem__(self, index):
        index = self.indices[index]  # get the actual index from self.indices
        text = self.data['text'][index]
        label = self.data['label'][index]
        embedded = self.data['embedded'][index]
        
        return text, label, embedded

    def __len__(self):
        return len(self.data)



Chargement du modèle pré entrainé et customisation du modèle

In [7]:
# Define the pre-trained Visual-Bert model
config = VisualBertConfig.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre', output_attentions=True)
model=  VisualBertModel.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre', config=config)
model = Model(config)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at uclanlp/visualbert-nlvr2-coco-pre were not used when initializing VisualBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
#for layer in model.children():
 #   if hasattr(layer, 'reset_parameters'):
  #      layer.reset_parameters()

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [10]:
model = model.to(device)


print("Params to learn:")
params_to_update = []
for name,param in model.named_parameters():
    if param.requires_grad == True:
        params_to_update.append(param)
        print("\t",name)


# Observe that all parameters are being optimized

Params to learn:
	 visual_bert.embeddings.word_embeddings.weight
	 visual_bert.embeddings.position_embeddings.weight
	 visual_bert.embeddings.token_type_embeddings.weight
	 visual_bert.embeddings.LayerNorm.weight
	 visual_bert.embeddings.LayerNorm.bias
	 visual_bert.embeddings.visual_token_type_embeddings.weight
	 visual_bert.embeddings.visual_position_embeddings.weight
	 visual_bert.embeddings.visual_projection.weight
	 visual_bert.embeddings.visual_projection.bias
	 visual_bert.encoder.layer.0.attention.self.query.weight
	 visual_bert.encoder.layer.0.attention.self.query.bias
	 visual_bert.encoder.layer.0.attention.self.key.weight
	 visual_bert.encoder.layer.0.attention.self.key.bias
	 visual_bert.encoder.layer.0.attention.self.value.weight
	 visual_bert.encoder.layer.0.attention.self.value.bias
	 visual_bert.encoder.layer.0.attention.output.dense.weight
	 visual_bert.encoder.layer.0.attention.output.dense.bias
	 visual_bert.encoder.layer.0.attention.output.LayerNorm.weight
	 visual_

Définition des fonctions d'entrainement et de test

In [22]:
# Define the training loop
def train(model, tokenizer, train_dataset, dev_dataset, optimizer, criterion, device, batch_size, epochs, patience):
    model.to(device)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    patience_count = 0
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        running_accuracy = 0.0
        running_TP = 0.0
        running_FP = 0.0
        running_TN = 0.0
        running_FN = 0.0
        for batch in train_loader:
            pred_labels=[0]*batch_size
            pred_prob=[0]*batch_size
            text, label, embedded = batch
            optimizer.zero_grad()
            text_encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
            text_encoded = {k: v.to(device) for k, v in text_encoded.items()}
            label = label.float().to(device)
            inputs_ids=text_encoded['input_ids'].to(device)
            visual_embeds = embedded.to(device)
            attention_mask = text_encoded['attention_mask'].to(device)
            outputs, attentions = model(input_ids=inputs_ids, attention_mask=attention_mask, visual_embeds=visual_embeds)
            for j in range(len(outputs[:])) :
              if outputs[j][0].item()>outputs[j][1].item() :
                pred_labels[j] = 0
                pred_prob[j] = outputs[j][0].item()
              else : 
                pred_labels[j]=1
                pred_prob[j] = outputs[j][1].item()
            loss = criterion(torch.tensor(pred_prob,  requires_grad=True).to(device), label)
            loss.backward()
            optimizer.step()
            scheduler.step()
            correct_preds = 0
            TP=FP=TN=FN=0
            for i in range(len(pred_labels)) :
              if pred_labels[i]==label[i] :
                correct_preds += 1
       
            accuracy = correct_preds / batch_size # calculate accuracy
            running_loss += loss.item()
            running_accuracy += accuracy

            for i in range(len(pred_labels)) :
                if pred_labels[i]== 1 and label[i]==1 : TP+=1
                elif pred_labels[i]== 1 and label[i]==0 : FP+=1
                elif pred_labels[i]== 0 and label[i]==0 : TN+=1
                elif pred_labels[i]== 0 and label[i]==1 : FN+=1


                
            # Calculate TP, FP, TN, FN
          # TP = ((pred_labels == 1) & (label == 1)).sum().item()
           #FP = ((pred_labels == 1) & (label == 0)).sum().item()
            #TN = ((pred_labels == 0) & (label == 0)).sum().item()
            #FN = ((pred_labels == 0) & (label == 1)).sum().item()
            running_TP += TP
            running_FP += FP
            running_TN += TN
            running_FN += FN
       
          
        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = running_accuracy / len(train_loader)
        epoch_recall = running_TP / (running_TP + running_FN) # recall
        epoch_specificity = running_TN / (running_TN + running_FP) # specificity
        print('Epoch [%d] - loss: %.4f - accuracy: %.4f - recall: %.4f - specificity: %.4f' % (epoch+1, epoch_loss, epoch_accuracy, epoch_recall, epoch_specificity))

        # Evaluate on dev_dataset
        model.eval()
        dev_running_loss = 0.0
        dev_running_accuracy = 0.0
        dev_running_TP = 0.0
        dev_running_FP = 0.0
        dev_running_TN = 0.0
        dev_running_FN = 0.0

        with torch.no_grad():
            for dev_batch in dev_loader:
                dev_pred_labels = [0] * batch_size
                dev_pred_prob = [0] * batch_size
                dev_text, dev_label, dev_embedded = dev_batch
                dev_text_encoded = tokenizer(dev_text, return_tensors='pt', padding=True, truncation=True)
                dev_text_encoded = {k: v.to(device) for k, v in dev_text_encoded.items()}
                dev_label = dev_label.float().to(device)
                dev_inputs_ids = dev_text_encoded['input_ids'].to(device)
                dev_visual_embeds = dev_embedded.to(device)
                dev_attention_mask = dev_text_encoded['attention_mask'].to(device)
                dev_outputs, dev_attentions = model(input_ids=dev_inputs_ids, attention_mask=dev_attention_mask, visual_embeds=dev_visual_embeds)
                for j in range(len(dev_outputs[:])):
                    if dev_outputs[j][0].item() > dev_outputs[j][1].item():
                        dev_pred_labels[j] = 0
                        dev_pred_prob[j] = dev_outputs[j][0].item()
                    else:
                        dev_pred_labels[j] = 1
                        dev_pred_prob[j] = dev_outputs[j][1].item()
                dev_loss = criterion(torch.tensor(dev_pred_prob, requires_grad=True).to(device), dev_label)
                loss = criterion(torch.tensor(pred_prob,  requires_grad=True).to(device), label)

                dev_correct_preds = 0
                dev_TP = dev_FP = dev_TN = dev_FN = 0

                for i in range(len(dev_pred_labels)):
                    if dev_pred_labels[i] == dev_label[i]:
                        dev_correct_preds += 1

                dev_accuracy = dev_correct_preds / batch_size
                dev_running_loss += dev_loss.item()
                dev_running_accuracy += dev_accuracy

                for i in range(len(dev_pred_labels)):
                    if dev_pred_labels[i] == 1 and dev_label[i] == 1:
                        dev_TP += 1
                    elif dev_pred_labels[i] == 1 and dev_label[i] == 0:
                        dev_FP += 1
                    elif dev_pred_labels[i] == 0 and dev_label[i] == 0:
                        dev_TN += 1
                    elif dev_pred_labels[i] == 0 and dev_label[i] == 1:
                        dev_FN += 1

                dev_running_TP += dev_TP
                dev_running_FP += dev_FP
                dev_running_TN += dev_TN
                dev_running_FN += dev_FN

        dev_epoch_loss = dev_running_loss / len(dev_loader)
        dev_epoch_accuracy = dev_running_accuracy / len(dev_loader)
        dev_epoch_recall = dev_running_TP / (dev_running_TP + dev_running_FN)
        dev_epoch_specificity = dev_running_TN / (dev_running_TN + dev_running_FP)
        print('Epoch [%d] - Dev - loss: %.4f - accuracy: %.4f - recall: %.4f - specificity: %.4f' % (
            epoch + 1, dev_epoch_loss, dev_epoch_accuracy, dev_epoch_recall, dev_epoch_specificity))

     

In [12]:

def test(model, tokenizer, test_dataset, criterion, device, batch_size):
    model.eval()
    model.to(device)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    running_loss = 0.0
    running_accuracy = 0.0
    running_TP = 0.0
    running_FP = 0.0
    running_TN = 0.0
    running_FN = 0.0
    with torch.no_grad():
        for batch in test_loader:
            pred_labels=[0]*batch_size
            pred_prob=[0]*batch_size
            text, label, embedded = batch
            text_encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
            text_encoded = {k: v.to(device) for k, v in text_encoded.items()}
            label = label.float().to(device)
            inputs_ids=text_encoded['input_ids'].to(device)
            visual_embeds = embedded.to(device)
            attention_mask = text_encoded['attention_mask'].to(device)
            outputs , attentions = model(input_ids=inputs_ids, attention_mask=attention_mask, visual_embeds=visual_embeds)
            for j in range(len(outputs[:])) :
                if outputs[j][0].item()>outputs[j][1].item() :
                  pred_labels[j] = 0
                  pred_prob[j] = outputs[j][0].item()
                else : 
                  pred_labels[j]=1
                  pred_prob[j] = outputs[j][1].item()
            loss = criterion(torch.tensor(pred_prob,  requires_grad=True).to(device), label)
            correct_preds = 0
            TP=FP=TN=FN=0
            for i in range(len(pred_labels)) :
                if pred_labels[i]==label[i] :
                  correct_preds += 1

            for i in range(len(pred_labels)) :
                if pred_labels[i]== 1 and label[i]==1 : TP+=1
                elif pred_labels[i]== 1 and label[i]==0 : FP+=1
                elif pred_labels[i]== 0 and label[i]==0 : TN+=1
                elif pred_labels[i]== 0 and label[i]==1 : FN+=1

      # Calculate TP, FP, TN, FN
            
            running_TP += TP
            running_FP += FP
            running_TN += TN
            running_FN += FN

            accuracy = correct_preds / batch_size # calculate accuracy
            running_loss += loss.item()
            running_accuracy += accuracy

              
                
         
    test_loss = running_loss / len(test_loader)
    test_accuracy = running_accuracy / len(test_loader)
    test_recall = running_TP/ (running_TP + running_FN) # recall
    test_specificity = running_TN / (running_TN + running_FP) # specificity
    print('Test loss: %.4f - Test accuracy: %.4f- recall: %.4f - specificity: %.4f' % (test_loss, test_accuracy,test_recall,test_specificity))


In [13]:

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.00005) #, weight_decay=0.1
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 2000)




In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
df_test=pd.read_json("/content/drive/MyDrive/data/dev.jsonl",lines=True)

m = []

with open(r"/content/drive/MyDrive/data/file_val.pkl","rb") as g:
    m = pickle.load(g)
g.close()
df_test.loc[:, 'embedded'] = m

In [16]:
df_train=pd.read_json("/content/drive/MyDrive/data/train.jsonl",lines=True)

l = []

with open(r"/content/drive/MyDrive/data/file.pkl","rb") as f:
    l = pickle.load(f)
f.close()
df_train.loc[:, 'embedded'] = l

In [17]:
mask_train = pd.to_numeric(df_train['embedded'], errors='coerce').isna()
df_train = df_train[mask_train]
print(len(df_train))

8464


In [18]:
from sklearn.model_selection import train_test_split

train_df, dev_df = train_test_split(df_train, test_size=1000, random_state=42)

# Verify the sizes of the datasets
print(f'Train dataset size: {len(train_df)}')
print(f'Validation dataset size: {len(dev_df)}')

Train dataset size: 7464
Validation dataset size: 1000


In [19]:
train_df = train_df.reset_index(drop=True)
dev_df = dev_df.reset_index(drop=True)


In [20]:
train_dataset = MyDataset(train_df)
print(len(train_dataset))
dev_dataset = MyDataset(dev_df)
print(len(dev_dataset))

Index(['id', 'img', 'label', 'text', 'embedded'], dtype='object')
Number of indices: 7464
7464
Index(['id', 'img', 'label', 'text', 'embedded'], dtype='object')
Number of indices: 1000
1000


In [23]:
train(model, tokenizer, train_dataset, dev_dataset, optimizer, criterion, device, batch_size=16, epochs=10, patience=5)

Epoch [1] - loss: 0.8077 - accuracy: 0.4072 - recall: 0.8447 - specificity: 0.1645
Epoch [1] - Dev - loss: 0.7997 - accuracy: 0.3730 - recall: 1.0000 - specificity: 0.0000
Epoch [2] - loss: 0.8075 - accuracy: 0.3998 - recall: 0.8350 - specificity: 0.1583
Epoch [2] - Dev - loss: 0.7997 - accuracy: 0.3730 - recall: 1.0000 - specificity: 0.0000
Epoch [3] - loss: 0.8081 - accuracy: 0.4084 - recall: 0.8378 - specificity: 0.1707
Epoch [3] - Dev - loss: 0.7986 - accuracy: 0.3750 - recall: 1.0000 - specificity: 0.0000
Epoch [4] - loss: 0.8081 - accuracy: 0.4087 - recall: 0.8454 - specificity: 0.1667
Epoch [4] - Dev - loss: 0.7991 - accuracy: 0.3740 - recall: 1.0000 - specificity: 0.0000
Epoch [5] - loss: 0.8078 - accuracy: 0.4014 - recall: 0.8378 - specificity: 0.1596
Epoch [5] - Dev - loss: 0.7986 - accuracy: 0.3750 - recall: 1.0000 - specificity: 0.0000
Epoch [6] - loss: 0.8078 - accuracy: 0.4068 - recall: 0.8458 - specificity: 0.1634
Epoch [6] - Dev - loss: 0.8003 - accuracy: 0.3720 - recal

In [None]:
mask_test = pd.to_numeric(df_test['embedded'], errors='coerce').isna()
df_test = df_test[mask_test]
print(len(df_test))

In [None]:
df_test = df_test.reset_index(drop=True)


In [None]:
test_dataset = MyDataset(df_test)
print(len(test_dataset))  # should print the length of your train data

In [None]:
test(model, tokenizer, test_dataset, criterion, device, batch_size=32)

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score


test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, drop_last=False)

model.eval()
with torch.no_grad():
    y_pred = []
    y_true = []
   
    for batch in test_loader:
          text, label, embedded = batch
          text_encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
          text_encoded = {k: v.to(device) for k, v in text_encoded.items()}
          label = label.float().unsqueeze(1).to(device)
          inputs_ids=text_encoded['input_ids'].to(device)
          visual_embeds = embedded.to(device)
          attention_mask = text_encoded['attention_mask'].to(device)
          outputs, attentions = model(input_ids=inputs_ids, attention_mask=attention_mask, visual_embeds=visual_embeds)
          outputs_cpu=outputs[0:].cpu()
          label_cpu=label[0:].cpu()
          #print(outputs[0:])
          y_pred.extend(outputs_cpu.numpy())
          y_true.extend(label_cpu.numpy())
          pred_labels = torch.round(outputs[0]) # round the probabilities to obtain predicted labels
          correct_preds = (pred_labels == label).sum().item() # count the number of correct predictions


print(len(y_pred))
print(len(y_true))
# Calculer l'AUC-ROC

auc_roc = roc_auc_score(y_true, y_pred)
print("AUC-ROC : {:.4f}".format(auc_roc))

# Calculer la courbe ROC
fpr, tpr, _ = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)

# Tracer la courbe ROC
plt.figure(figsize=(4,4))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.4f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
  model.to(device)
  with torch.no_grad():
  
    for batch in test_loader:
    
              text, label, embedded = batch
              text_encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
              text_encoded = {k: v.to(device) for k, v in text_encoded.items()}
              label = label.float().unsqueeze(1).to(device)
              inputs_ids=text_encoded['input_ids'].to(device)
              visual_embeds = embedded.to(device)
              attention_mask = text_encoded['attention_mask'].to(device)
              outputs, attentions = model(input_ids=inputs_ids, attention_mask=attention_mask, visual_embeds=visual_embeds)
     

              # Extraire les scores d'attention par couche
              #attentions = outputs.attentions
              for layer, attention in enumerate(attentions):
                print(f"Layer {layer+1} attention shape: {attention.shape}")
              # Extraire les scores d'attention par tête et par couche
              #multi_head_attention = outputs.multi_head_attention_outputs
              #for layer, attention_layer in enumerate(multi_head_attention):
                 #print(f"Layer {layer+1} attention shape: {[attention.shape for attention in attention_layer]}")
              



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import cv2


test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False)
for batch in test_loader  : 

  # Load the image


    text, label, embedded = batch
    text_encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    text_encoded = {k: v.to(device) for k, v in text_encoded.items()}
    label = label.float().unsqueeze(1).to(device)
    inputs_ids=text_encoded['input_ids'].to(device)
    visual_embeds = embedded.to(device)
    attention_mask = text_encoded['attention_mask'].to(device)
    outputs, attentions = model(input_ids=inputs_ids, attention_mask=attention_mask, visual_embeds=visual_embeds)
    break
  #Resize the image to match the input size of the model
  #img = cv2.resize(img, (224, 224))
  #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

img_path = df_test['img'][1]
path='/content/drive/MyDrive/data/'+img_path
#img = cv2.imread('/content/drive/MyDrive/data/'+img_path)
img= cv2.imread('/content/drive/MyDrive/data/img/01235.png')
# Get the attention scores for the image
attention_scores = attentions[0][0][0].detach().cpu().numpy()

attention_scores = cv2.resize(attention_scores, (img.shape[1], img.shape[0]))
attention_scores = (attention_scores - attention_scores.min()) / (attention_scores.max() - attention_scores.min())
heatmap = cv2.applyColorMap(np.uint8(255*attention_scores), cv2.COLORMAP_JET)

result = cv2.addWeighted(img, 0.6, heatmap, 0.4, 0)

plt.imshow(result)
plt.axis('off')
plt.show()


In [None]:
  valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    best_valid_loss = float('inf')  
  
  valid_loss = 0.0
        valid_accuracy = 0.0


        for batch in valid_loader:
            pred_labels=[0]*batch_size
            pred_prob=[0]*batch_size
            text, label, embedded = batch
            text_encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
            text_encoded = {k: v.to(device) for k, v in text_encoded.items()}
            label = label.float().unsqueeze(1).to(device)
            inputs_ids = text_encoded['input_ids'].to(device)
            visual_embeds = embedded.to(device)
            attention_mask = text_encoded['attention_mask'].to(device)
            outputs, attentions = model(input_ids=inputs_ids, attention_mask=attention_mask, visual_embeds=visual_embeds)
            for j in range(len(outputs[:])) :
              if outputs[j][0].item()>outputs[j][1].item() :
                pred_labels[j] = 0
                pred_prob[j] = outputs[j][0].item()
              else : 
                pred_labels[j]=1
                pred_prob[j] = outputs[j][1].item()
      
            loss = criterion(torch.tensor(pred_prob,  requires_grad=True).unsqueeze(1).to(device), label)
          
            correct_preds = 0
            TP=FP=TN=FN=0
            for i in range(len(pred_labels)) :
              if pred_labels[i]==label[i] :
                correct_preds += 1

            for i in range(len(pred_labels)) :
                if pred_labels[i]== 1 and label[i]==1 : TP+=1
                elif pred_labels[i]== 1 and label[i]==0 : FP+=1
                elif pred_labels[i]== 0 and label[i]==0 : TN+=1
                elif pred_labels[i]== 0 and label[i]==1 : FN+=1

    
            valid_loss += loss.item()
            valid_accuracy += accuracy
          
        valid_loss /= len(valid_loader)
        valid_accuracy /= len(valid_loader)


           if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            patience_count = 0
        else:
            patience_count += 1
            if patience_count >= patience:
                print("Validation loss did not improve for %d epochs. Training stopped early." % patience)
                break


