In [160]:
# Mount Google Drive under /content/drive (Colab-only API); the data files and
# checkpoints read later in this notebook live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [161]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [162]:
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
import os
import copy
import pandas as pd
from transformers import AutoTokenizer, AutoModel, BertModel
import nltk
from nltk.corpus import stopwords
import string
# Fetch the NLTK stop-word corpus that stop_words() reads via stopwords.words().
nltk.download('stopwords')
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)


PyTorch Version:  2.0.0+cu118
Torchvision Version:  0.15.1+cu118
cuda:0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [163]:
# Number of epochs passed to the training function.
num_epochs = 5

# Mini-batch size used by the DataLoaders.
batch_size = 8

# Length (in tokens) every text is padded or truncated to by the tokenizer.
max_len = 336
# Load the BERT tokenizer (only the tokenizer is created here; the BERT model
# itself is built in a later cell).
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)



In [164]:
# Read the train / validation splits (JSON Lines: one record per line) and
# discard the 'id' column of each frame.
train = pd.read_json("/content/drive/MyDrive/data/train.jsonl", lines=True).drop(columns=['id'])
val = pd.read_json("/content/drive/MyDrive/data/val.jsonl", lines=True).drop(columns=['id'])
def stop_words(df, column, new_column):
  """Write a stop-word-free version of ``df[column]`` into ``df[new_column]``.

  Each text is split on whitespace, tokens found in NLTK's English stop-word
  list are dropped, and the remaining tokens are re-joined with single spaces.
  The comparison is case-sensitive, exactly as a plain membership test is.

  Args:
    df: DataFrame to modify in place.
    column: name of the column holding the input strings.
    new_column: name of the column that receives the filtered strings
      (may be equal to ``column``).

  Returns:
    The same ``df`` object, for call chaining.
  """
  # Build the lookup once. Calling stopwords.words() inside the comprehension
  # rebuilt the list and scanned it linearly for every single token.
  english_stop_words = frozenset(stopwords.words('english'))
  df[new_column] = df[column].apply(
      lambda text: ' '.join(token for token in text.split() if token not in english_stop_words))
  return df

def punctuation(df, column, new_column):
  """Copy ``df[column]`` into ``df[new_column]`` with punctuation removed.

  Every character listed in ``string.punctuation`` is deleted; all other
  characters are kept in their original order.

  Args:
    df: DataFrame to modify in place.
    column: name of the column holding the input strings.
    new_column: name of the destination column (may be equal to ``column``).

  Returns:
    The same ``df`` object, for call chaining.
  """
  banned = frozenset(string.punctuation)

  def _strip(text):
    return "".join(filter(lambda ch: ch not in banned, text))

  df[new_column] = df[column].apply(_strip)
  return df

In [165]:
# Training texts: strip stop words, then punctuation, writing the result to
# 'cleaned_text'; the raw 'text' column is dropped afterwards.
cleaned_train = punctuation(
    stop_words(train, 'text', 'cleaned_text'),
    'cleaned_text',
    'cleaned_text',
)
del cleaned_train['text']

In [166]:
# Validation texts: same cleaning as for the training split (stop words, then
# punctuation); the raw 'text' column is dropped afterwards.
cleaned_val = punctuation(
    stop_words(val, 'text', 'cleaned_text'),
    'cleaned_text',
    'cleaned_text',
)
del cleaned_val['text']

In [167]:
import torch
from PIL import Image

class MyDataset(torch.utils.data.Dataset):
    """PyTorch dataset yielding ``(tokenized_text, image, label)`` triples.

    The rows come from the module-level ``cleaned_train`` / ``cleaned_val``
    data frames; all texts are tokenized once, when the dataset is created.
    """

    def __init__(self, transforms = None, root_dir = '/content/drive/MyDrive/data', mode = 'train', tokenizer = tokenizer):
        """Pick the split and pre-tokenize its texts.

        Args:
            transforms: optional callable applied to every loaded PIL image.
            root_dir: directory prepended to the values of the ``img`` column.
            mode: ``'train'`` selects ``cleaned_train``; any other value
                selects ``cleaned_val``.
            tokenizer: callable used to encode the ``cleaned_text`` column.
        """
        self.df = cleaned_train if mode == 'train' else cleaned_val
        self.labels = self.df.label
        self.image_names = self.df.img
        self.transforms = transforms
        self.root_dir = root_dir
        # Every sentence is padded / truncated to max_len tokens.
        self.texts = [
            tokenizer(
                sentence,
                padding='max_length',
                max_length = max_len,
                truncation=True,
                return_tensors="pt",
            )
            for sentence in self.df['cleaned_text']
        ]

    def __len__(self):
        """Return the total number of examples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as ``(encoding, image, label)``."""
        picture = Image.open(f"{self.root_dir}/{self.image_names.iloc[idx]}").convert('RGB')
        if self.transforms:
            picture = self.transforms(picture)
        return self.texts[idx], picture, self.labels.iloc[idx]
  


In [168]:
class Dataset(torch.utils.data.Dataset):
    """Text-only dataset pairing tokenized sentences with their labels."""

    def __init__(self, df):
        """Copy ``df['label']`` into a list and tokenize ``df['cleaned_text']``.

        Uses the module-level ``tokenizer`` and ``max_len``.
        """
        self.labels = list(df['label'])
        self.texts = [
            tokenizer(
                sentence,
                padding='max_length',
                max_length = max_len,
                truncation=True,
                return_tensors="pt",
            )
            for sentence in df['cleaned_text']
        ]

    def classes(self):
        """Return the list of labels."""
        return self.labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` as a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenized text(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenized_text, label_array)`` for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [169]:
# Image pipelines: the training split gets random crop + horizontal flip
# augmentation, the validation split only a deterministic resize / center crop.
# Both end with the same tensor conversion and per-channel normalization.
input_size = 224
data_transforms = dict(
    train=transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
    val=transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
)

print("Initializing Datasets and Dataloaders...")

# One dataset per split, each with its own image transform.
image_datasets = {
    'train': MyDataset(transforms = data_transforms['train'], mode = 'train'),
    'val': MyDataset(transforms = data_transforms['val'], mode = 'val'),
}

# One shuffling DataLoader per split, keyed like image_datasets.
dataloaders_dict = {
    split: torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    for split, dataset in image_datasets.items()
}

# Use the GPU when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Initializing Datasets and Dataloaders...




In [170]:
# Display the dict of datasets to check that both splits were created.
image_datasets

{'train': <__main__.MyDataset at 0x7fc041f75cc0>,
 'val': <__main__.MyDataset at 0x7fc041f76710>}

In [171]:
def set_parameter_requires_grad(model, feature_extracting):
    """Freeze or unfreeze every parameter of ``model``.

    Args:
        model: object exposing ``parameters()``.
        feature_extracting: when truthy, all parameters get
            ``requires_grad = False``; otherwise they get ``True``.
    """
    trainable = False if feature_extracting else True
    for weight in model.parameters():
        weight.requires_grad = trainable

def initialize_model( feature_extract, use_pretrained=True):
    """Build a ResNet-18 whose final layer is a single-output linear head.

    Args:
        feature_extract: when truthy, the backbone parameters are frozen and
            only the freshly created ``fc`` head stays trainable; when falsy,
            every parameter is trainable.
        use_pretrained: forwarded to ``models.resnet18`` as ``pretrained``.

    Returns:
        The configured torchvision ResNet-18 module.
    """
    model_ft = models.resnet18(pretrained=use_pretrained)

    # Freeze / unfreeze the backbone BEFORE swapping the head. Doing it after
    # the swap also froze the new layer, leaving nothing to train when
    # feature_extract is True.
    set_parameter_requires_grad(model_ft, feature_extract)

    # A new nn.Linear has requires_grad=True by default.
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, 1)
    return model_ft

In [172]:
from torch import nn
from transformers import BertForSequenceClassification
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

class BertClassifier(nn.Module):
    """Two-label BERT sequence classifier that returns raw logits."""

    def __init__(self, dropout=0.5):
        """Load ``bert-base-uncased`` with a two-label classification head.

        Args:
            dropout: probability of the ``self.dropout`` layer. The layer is
                kept so the constructor signature and module layout stay
                unchanged, but ``forward`` does not apply it.
        """
        super(BertClassifier, self).__init__()

        self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_id, mask):
        """Return the classification logits for a batch.

        Args:
            input_id: token ids, passed to BERT as ``input_ids``.
            mask: attention mask, passed to BERT as ``attention_mask``.

        Returns:
            The ``logits`` attribute of the BERT output, unmodified.
        """
        bert_output = self.bert(input_ids = input_id, attention_mask = mask)
        # The previous code also computed self.dropout(logits) but discarded
        # the result and returned the plain logits; the dead computation is
        # removed so the returned values are the same as before.
        return bert_output.logits

In [173]:
# Build the BERT text classifier and restore its saved weights.
bert_model = BertClassifier()
bert_model.load_state_dict(state_dict = torch.load('/content/drive/MyDrive/data/best_model_sigm (1).pth', map_location=torch.device('cpu')))

# Build the ResNet-18 image classifier and restore its saved weights.
# map_location is given here as well (as for the BERT checkpoint) so a
# checkpoint saved from a GPU can still be loaded on a CPU-only runtime; the
# model is moved to the target device later.
resnet18_model = initialize_model( feature_extract= True )
resnet18_model.load_state_dict(torch.load('/content/drive/MyDrive/data/best_model.pth', map_location=torch.device('cpu')))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [174]:
# Display the module tree of the BERT classifier (long output).
bert_model

BertClassifier(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, 

In [175]:
# Late-fusion model: the outputs of the two frozen models are concatenated
# (not averaged) and combined by a small trainable linear layer.
class ConcatBertModel(torch.nn.Module):
    """Fuse a text classifier and an image classifier with one linear layer.

    Both sub-models are frozen (``requires_grad = False``) and kept in
    evaluation mode; only ``self.fc`` is trained.
    """

    def __init__(self, bert_model, resnet18_model):
        """Store the two sub-models, freeze them and create the fusion layer.

        Args:
            bert_model: text model called as ``bert_model(input_id=..., mask=...)``.
            resnet18_model: image model called as ``resnet18_model(image)``.
        """
        super(ConcatBertModel, self).__init__()
        self.bert_model = bert_model
        self.resnet18_model = resnet18_model
        # 3 input features = 2 logits from the text model + 1 output from the
        # image model.
        self.fc = torch.nn.Linear(3, 1)
        set_parameter_requires_grad(self.bert_model ,True)
        set_parameter_requires_grad(self.resnet18_model ,True)
        # Frozen feature extractors must not run dropout or update BatchNorm
        # running statistics.
        self.bert_model.eval()
        self.resnet18_model.eval()

    def train(self, mode=True):
        """Switch the fusion layer's mode but keep the frozen backbones in eval.

        ``requires_grad = False`` does not stop dropout from firing or
        BatchNorm running statistics from being updated while a module is in
        training mode, so the backbones are forced back to eval mode here.
        """
        super(ConcatBertModel, self).train(mode)
        self.bert_model.eval()
        self.resnet18_model.eval()
        return self

    def forward(self, input_ids, attention_mask, image):
        """Return one probability per sample.

        The text and image outputs are concatenated along dim 1, passed
        through ``self.fc`` and squashed with a sigmoid, so the result is in
        [0, 1] and must be paired with a loss that expects probabilities.
        """
        bert_output = self.bert_model(input_id=input_ids, mask=attention_mask)

        resnet_output = self.resnet18_model(image)

        concat_output = torch.cat((bert_output, resnet_output), dim=1)
        output = self.fc(concat_output)

        output = torch.sigmoid(output)
        return output


In [176]:
# Create the fusion model from the two loaded models, move it to the selected
# device and print its module tree.
concat_bert = ConcatBertModel(bert_model, resnet18_model).to(device)
print(concat_bert)

ConcatBertModel(
  (bert_model): BertClassifier(
    (bert): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): Linear(in_features=768, out_features=768, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                )
                (output): BertSelfOu

In [177]:
def train_concat_bert(model, dataloaders, criterion, optimizer, num_epochs,
                      initial_best_acc=0.552,
                      checkpoint_path="/content/drive/MyDrive/data/best_model_concat.pth"):
    """Train ``model`` and keep the weights with the best validation accuracy.

    Each epoch runs a 'train' phase followed by a 'val' phase and prints loss,
    accuracy, recall and specificity for both. Predictions are obtained by
    thresholding the model output at 0.5, i.e. the model is expected to
    return probabilities.

    Args:
        model: module called as ``model(input_ids, attention_mask, images)``.
        dataloaders: dict with 'train' and 'val' loaders yielding
            ``(encoding, images, labels)`` where ``encoding`` has the keys
            'input_ids' and 'attention_mask'.
        criterion: loss called as ``criterion(outputs, targets)`` with float
            targets of shape (batch, 1).
        optimizer: optimizer stepping the trainable parameters of ``model``.
        num_epochs: number of epochs to run.
        initial_best_acc: validation accuracy a checkpoint has to exceed to be
            kept. The default reproduces the previously hard-coded threshold.
        checkpoint_path: file the best ``state_dict`` is saved to.

    Returns:
        ``(model, val_acc_history)``: the model with its best weights loaded,
        and the per-epoch validation accuracies (0-dim float64 tensors).
    """
    def _ratio(numerator, denominator):
        # A split without positives (or negatives) would otherwise raise
        # ZeroDivisionError; report 0.0 for the undefined metric instead.
        return numerator / denominator if denominator else 0.0

    since = time.time()
    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = initial_best_acc

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            running_TP = running_FP = running_TN = running_FN = 0

            for encodings, images, labels in tqdm(dataloaders[phase]):
                images = images.to(device)
                # Each sample was tokenized with return_tensors="pt", which
                # adds a leading axis of size 1, so collated batches carry an
                # extra dim 1. Drop it for the mask as well as the token ids.
                input_id = encodings['input_ids'].squeeze(1).to(device)
                mask = encodings['attention_mask'].squeeze(1).to(device)
                targets = labels.to(device).float().unsqueeze(1)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    # Use the model that was passed in, not a notebook global.
                    outputs = model(input_id, mask, images)
                    loss = criterion(outputs, targets)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Vectorized confusion-matrix counts for this batch.
                preds = (outputs > 0.5).float()
                pred_pos = preds == 1
                pred_neg = preds == 0
                true_pos = targets == 1
                true_neg = targets == 0

                running_loss += loss.item() * images.size(0)
                running_corrects += int((preds == targets).sum().item())
                running_TP += int((pred_pos & true_pos).sum().item())
                running_FP += int((pred_pos & true_neg).sum().item())
                running_TN += int((pred_neg & true_neg).sum().item())
                running_FN += int((pred_neg & true_pos).sum().item())

            dataset_size = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / dataset_size
            # Kept as a tensor so history entries support .cpu() / .item().
            epoch_acc = torch.tensor(running_corrects / dataset_size, dtype=torch.float64)
            epoch_recall = _ratio(running_TP, running_TP + running_FN)
            epoch_specificity = _ratio(running_TN, running_TN + running_FP)
            print('{} Loss: {:.4f} Acc: {:.4f} Recall: {:.4f} Specificity: {:.4f}'.format(phase, epoch_loss, epoch_acc, epoch_recall, epoch_specificity))

            # Keep (and persist) the weights of the best validation epoch.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), checkpoint_path)

            if phase == 'val':
                val_acc_history.append(epoch_acc)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, val_acc_history

In [178]:
# Loss function and optimizer.
# ConcatBertModel.forward already ends with torch.sigmoid, so its outputs are
# probabilities. BCEWithLogitsLoss would apply a second sigmoid on top of
# them; BCELoss is the loss that takes probabilities directly.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(concat_bert.parameters(), lr=5e-5)

In [179]:
# List the parameters that still require gradients, and collect them:
# params_to_update used to be created but never filled.
print("Params to learn:")
params_to_update = []
for name,param in concat_bert.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t",name)

Params to learn:
	 fc.weight
	 fc.bias


In [None]:
# Run the training loop. train_concat_bert reloads the best validation weights
# before returning; hist holds the validation accuracy of each epoch.
concat_model, hist = train_concat_bert(concat_bert, dataloaders_dict, criterion, optimizer, num_epochs)

Epoch 0/4
----------


 92%|█████████▏| 973/1063 [03:36<00:18,  4.78it/s]

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score



# Predict probabilities on the validation split (no separate test loader is
# defined in this notebook).
concat_model.eval()
y_pred = []
y_true = []
with torch.no_grad():
    for input_ids, inputs, labels in dataloaders_dict['val']:
        inputs = inputs.to(device)
        # Drop the size-1 axis added by return_tensors="pt" at tokenization
        # time, for the attention mask as well as the token ids.
        input_id = input_ids['input_ids'].squeeze(1).to(device)
        mask = input_ids['attention_mask'].squeeze(1).to(device)

        # Evaluate the same object that was put in eval mode above.
        outputs = concat_model(input_id, mask, inputs)

        # The model's forward already applies a sigmoid, so the outputs are
        # probabilities; a second sigmoid would compress them into
        # [0.5, 0.73]. Flatten (batch, 1) -> (batch,) to get one score each.
        y_pred.extend(outputs.reshape(-1).cpu().numpy())
        y_true.extend(labels.numpy())


# ROC AUC computed directly from the scores.
auc_roc = roc_auc_score(y_true, y_pred)
print("AUC-ROC : {:.4f}".format(auc_roc))

# ROC curve points, and the area under that curve for the legend.
fpr, tpr, _ = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the diagonal reference line.
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.4f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic curve')
ax.legend(loc="lower right")
plt.show()