# Approche deep learning - Transformers

## Chargement des données

In [None]:
from pathlib import Path
from dataset import load_data_film
import numpy as np

DATASET_PATH_TRAIN = Path("data/aclImdb/train")
DATASET_PATH_TEST = Path("data/aclImdb/test")

NB_DOC_MAX_Tr = 1000 # par classe
NB_DOC_MAX_Te = 1000 # par classe
IMDB_CLASSES  = ['neg','pos']
VOC_SIZE = 10000 # utilisé dans le tokenizer de l'approche custom en fin du ntbk
BATCH_SIZE = 8 # à modifier en fonction des capacité du PC


train_txt, files_train, train_labels = load_data_film(DATASET_PATH_TRAIN, max_size = NB_DOC_MAX_Tr)
txts_test, files_test, labels_test = load_data_film(DATASET_PATH_TEST, max_size = NB_DOC_MAX_Te)

#val/test split
#70/30 ratio
import random as rd

rd.seed(42)
test_idx = rd.sample([i for i in range(2*NB_DOC_MAX_Te)], k = int(0.3 * NB_DOC_MAX_Te * 2))

test_txt = txts_test[test_idx]; val_txt = np.delete(txts_test,test_idx)
test_label = labels_test[test_idx]; val_label = np.delete(labels_test,test_idx) 


On fait le choix de réaliser un split val/test sut le le dataset de test

In [54]:
print(f" Train set : {len(train_txt)} examples\n" )
print(f"Example : \t {train_labels[0]} {train_txt[0][:100]}")
print(f"Example : \t {train_labels[1]} {train_txt[1][:100]} \n")
print(f" Val set :  {len(val_txt)} Examples\n")
print(f" Example : \t {val_label[0]} {val_txt[0][:100]}")
print(f" Example : \t {val_label[1]} {val_txt[1][:100]} \n")
print(f" Test set : {len(test_txt)} examples \n ")
print(f"Example : \t {test_label[0]} {test_txt[0][:100]} ")
print(f"Example : \t {test_label[1]} {test_txt[1][:100]} \n")


 Train set : 2000 examples

Example : 	 0 So, I know that I voted 1 out of 10 but really this deserves no more than half of a star. I hated it
Example : 	 0 Alex D. Linz replaces Macaulay Culkin as the central figure in the third movie in the Home Alone emp 

 Val set :  1400 Examples

 Example : 	 0 There are just so many things wrong with this movie.<br /><br />Jeff Bridges weird accent.<br /><br 
 Example : 	 0 Viewers of independent films know that once or twice a year they are going to see stories about dysf 

 Test set : 600 examples 
 
Example : 	 1 It's one of my favorite movies as much because of the location and music as the story line. Don't ma 
Example : 	 0 I gotta go with my boy Allen (who also reviewed this film)...ZOMBIE GANGBANGERS (as my copy is entit 



# Approche on the shelf - Modèle HuggingFace

On va utiliser le modèle suivant, il s'agit d'un modèle bert, finetuné pour de la classification de sentiment
https://huggingface.co/tabularisai/multilingual-sentiment-analysis
Seulement ce modèle a été entrainé sur 5 classes. Il faut légèrement modifier la fonction de prédiction pour mapper les niveaux 0/1 vers 0 et 3/4 vers 1.

In [55]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_sentiment(texts):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}
    return [sentiment_map[p] for p in torch.argmax(probabilities, dim=-1).tolist()]


In [None]:
pred = predict_sentiment(train_txt[0])
print(f"Prediction : {pred[0]} \nVérité terrain : {train_labels[0]} \n \n {train_txt[0]}")

Prediction : Very Negative 
 Vérité terrain : 0 
 
 So, I know that I voted 1 out of 10 but really this deserves no more than half of a star. I hated it. It was so stupid and unrealistic, I can't believe any of the stars signed on to make this ridiculously absurd project.<br /><br />James G. and Cathrine O'Hara were excellent in their characters and Ben Affleck and Christina Applegate were just as good too, but the story sucked and I encourage anyone who sees this in the video store to not even bother picking it up and reading the back cover, but to just walk away...I don't even want to get into what the movie is about, because it is too stupid to pontificate about.<br /><br />Don't rent this! It's horrible! Horrible!


## 1 DataLoader

Par la suite on va utiliser des datasets et dataloaders, on les définit ici.

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import sys
import re


class TextDataset(Dataset):
    def __init__(self, texts: list, labels):
        self.labels = labels
        self.phrasesnum = texts

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return self.phrasesnum[i], torch.tensor(self.labels[i])

## create  datasets
ds_train = TextDataset(train_txt,train_labels)
ds_val = TextDataset(val_txt,val_label)
ds_test  = TextDataset(test_txt,test_label)


def collate_fn(batch):
    sequences, labels = zip(*batch)
    batch_encoding = tokenizer(sequences, return_tensors="pt", truncation=True, padding=True, max_length=512)
    batch_encoding['labels'] = torch.tensor(labels)
    return batch_encoding


## create dataloader
train_loader = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(ds_val, batch_size=BATCH_SIZE, shuffle=False , collate_fn=collate_fn)
test_loader = DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False ) #, collate_fn=collate_fn)


In [7]:
next(iter(train_loader))

{'input_ids': tensor([[  101, 11301,   107,  ...,     0,     0,     0],
        [  101, 10117, 18077,  ...,     0,     0,     0],
        [  101, 10747, 10124,  ..., 10189,   106,   102],
        ...,
        [  101, 10747, 10458,  ...,     0,     0,     0],
        [  101, 12489, 44936,  ...,     0,     0,     0],
        [  101, 11723, 10301,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 1, 1, 1, 1, 0, 0])}

Ce modèle présente 5 classes de sortie on doit en avoir 2. On va légèrement modifier la fonction predict_sentiment

In [57]:
def predict_sentiment_2C(inputs):
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_map = {0: 0, 1: 0, 2: "Neutral", 3: 1, 4: 1}
    return [sentiment_map[p] if p!= 2 else rd.randint(0,1) for p in torch.argmax(probabilities, dim=-1).tolist()]


In [58]:
y = predict_sentiment_2C(next(iter(val_loader)))
print(y, next(iter(val_loader)).labels)

[0, 1, 0, 1, 1, 0, 1, 1] tensor([0, 0, 0, 0, 0, 0, 0, 0])


## 2 Evaluation du modèle on the shelf

On évalue les performances du modèle modifié sur le jeu de données de validation.

In [None]:
from tqdm import tqdm


def eval_on_the_shelf(val_loader, mod=predict_sentiment_2C):
    y_pred = []
    cum_correct = 0
    epoch_n_example = len(val_loader)*BATCH_SIZE # formule à corriger si len(batch[-1]) < BATCH_SIZE

    for batch in tqdm(val_loader):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        output = mod(batch)
        cum_correct += (torch.tensor(output) == batch['labels']).sum()
        y_pred += output

    return torch.tensor(y_pred), cum_correct/epoch_n_example


In [None]:
from pickle import dump
from pickle import load
import os

if os.path.isfile('eval/2_dl_on_shelf.pkl'):
    with open('eval/2_dl_onshelf.pkl') as f:
        df = load(f)
else :  
    y_pred, acc = eval_on_the_shelf(val_loader, predict_sentiment_2C)
    df = {'ytrue' : ds_val.labels, 'ypred' : y_pred, 'accuracy' : acc} 
    with open('eval/2_dl_onshelf.pkl', 'wb') as f:
        dump(df,f)
        

In [86]:
print(data['ypred'])

[[0, 1, 0, 0, 0, 1, 0, 1], [0, 1, 1, 0, 0, 1, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0]]


## 3 Finetuning

On va maintenant finetuner (i.e. réentrainer le modèle en activant le gradient) avec le jeu de données de train. Puis on évaluera les perfomances sur le jeu de validation. On s'attend à priori à ce que ce modèle soit meilleur que le premier car il va mieux capturer la distribution de nos données, sans doute différente de celle de son jeu d'entrainement.

In [59]:
model_name = "tabularisai/multilingual-sentiment-analysis"


tokenizer = AutoTokenizer.from_pretrained(model_name)
model_FT = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, torch_dtype="auto", ignore_mismatched_sizes=True)
model_FT.to(DEVICE)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at tabularisai/multilingual-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
from torch.optim import AdamW
from transformers import get_scheduler


optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
model_FT.train()

train_loss = []
test_loss = []

for epoch in range(num_epochs):
    
    cum_train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model_FT(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        cum_train_loss += loss.item()
        print(f"Training - epoch {epoch} - batch train loss {loss}")
    
    train_loss.append(cum_train_loss)

    cum_test_loss = 0
    for btach in tqdm(val_loader):
        batch = {k: v for k, v in batch.items()}
        outputs = model_FT(**batch)
        test_loss = outputs.loss
        cum_test_loss += test_loss.item() 
    print(f"Test - epoch {epoch} - test loss {cum_test_loss}")

    test_loss.append(cum_test_loss)


  0%|          | 0/750 [00:07<?, ?it/s]


Training - epoch 0 - batch train loss 0.7270489931106567


In [None]:
## sauvegarde du modèle et des poids

# avec hugging face ou torch ??

## 4 Évaluation du modèle adjusté

In [None]:
def predict(inputs):
    with torch.no_grad():
        outputs = model_FT(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return torch.argmax(probabilities, dim=-1).tolist()

In [74]:
from pickle import dump
from pickle import load
import os

if os.path.isfile('eval/2_dl_onshelf_FT.pkl'):
    with open('eval/2_dl_onshelf_FT.pkl') as f:
        df = load(f)
else :  
    y_pred,acc = eval_on_the_shelf(val_loader, predict)
    df = {'ytrue' : ds_val.labels, 'ypred' : y_pred, 'accuracy' : acc} 
    with open('2_dl_onshelf_FT.pkl', 'wb') as f:
        dump(df,f)

  1%|          | 1/175 [00:15<45:45, 15.78s/it]

[0, 2, 2, 2, 2, 1, 2, 2] tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor(1)
[]


  1%|          | 1/175 [00:24<1:10:25, 24.28s/it]


KeyboardInterrupt: 

## 5 Optimisation classifier - avec modèles tranformers freezé

In [None]:
model_name = "tabularisai/multilingual-sentiment-analysis"


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, torch_dtype="auto", ignore_mismatched_sizes=True)


class MLPSentClassifier(nn.module):

    def __init__(self, hidden_dim, output_class):
        super.__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_class
        self.h2o = nn.sequential(nn.linear(self.hidden_dim,self.output_dim))

    def foward(self, logits):
        output = self.h20(logits)
        return output



Création d'un nouveau dataset avec les données déjà tranformée avec de tabularisai. On entraine seulement la tête de régression (sans activer le gradient sur le reste du réseau). On peut faire passer toute les données à travers le réseau pour gagner du temps.

In [84]:
ds_train = TextDataset(train_txt,train_labels)
ds_val = TextDataset(val_txt,val_label)
ds_test  = TextDataset(test_txt,test_label)


def pre_process(txt):
    batch_encoding = tokenizer(txt, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**batch_encoding)
    return outputs.logits

In [None]:
pre_process(train_txt[:2])

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

# Approche custom - Modèle maison

## 1 Tokenizer

On construit un tokenizer à l'aides de la librairie tokenizers d'HuggingFace. On utilise sensiblement les même paramètres que pour le tokenizer de Bert en changeant la taille de vocabulaire.

In [None]:
from tokenizers import Tokenizer

import os

if os.path.isfile('bert-wiki.json'):
    tokenizer = Tokenizer.from_file("bert-wiki.json")
else :  

    from tokenizers.models import WordPiece
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, Lowercase, StripAccents
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=VOC_SIZE, 
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        )

    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files, trainer)
    tokenizer.save("bert-wiki.json")

PAD = tokenizer.encode('[CLS]').ids[0]

## 2 Data loader

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import sys
import re


class TextDataset(Dataset):
    def __init__(self, texts: list, labels):
        self.labels = labels
        self.phrasesnum = texts

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return self.phrasesnum[i], torch.tensor(self.labels[i])

## create  datasets
ds_train = TextDataset(txts,filelabels)
ds_test  = TextDataset(txts_test,filelabels_test)


def collate_fn(batch):
    sequences, labels = zip(*batch)
    sequences  = [torch.tensor(tokenizer.encode('[CLS] ' + seq).ids) for seq in sequences]
    lengths = [len(seq) for seq in sequences]
    padded_sequences = pad_sequence(sequences, batch_first=False)
    return padded_sequences, torch.tensor(lengths), torch.tensor(labels)


## create dataloader
train_loader = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_fn)
test_loader = DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)



In [None]:
batch =next(iter( train_loader))
padded_sequences, lengths, labels = batch

print("Padded sequences:", padded_sequences)

print(padded_sequences.size(), lengths.max())
print("Labels:", labels)


Padded sequences: tensor([[   1,    1,    1,  ...,    1,    1,    1],
        [ 235, 2201,   45,  ...,  250,  209,  250],
        [ 517,  776, 7095,  ...,  276, 4860,  233],
        ...,
        [5126,    0,    0,  ...,    0,    0,    0],
        [  20,    0,    0,  ...,    0,    0,    0],
        [ 209,    0,    0,  ...,    0,    0,    0]])
torch.Size([249, 8]) tensor(249)
Labels: tensor([1, 1, 1, 1, 0, 0, 1, 1])


## 3 Model definition

In [None]:
## see the file models.py for model definition

import models


In [None]:
import time

## Network creation
emb_size = 128
voc_size = VOC_SIZE
num_layers = 4
num_heads = 4
hidden_size_mlp = 128
output_size = 2

# build network
sent_clf = models.SentModel( emb_size, voc_size, num_layers, num_heads, hidden_size_mlp , output_size, PAD)
sent_clf.name = "SentModel-"+time.asctime()



## 4 Train loop

In [None]:
import os

def save_model(model,fichier): 
      """ sauvegarde du modèle dans fichier """
      state = {'model_state': model.state_dict()}
      torch.save(state,fichier) 
 
def load_model(fichier,model):
      """ Si le fichier existe, on charge le modèle  """
      if os.path.isfile(fichier):
          state = torch.load(fichier)
          model.load_state_dict(state['model_state'])
      else:
           print("Erreur de chargement du fichier")

In [None]:
# définition de la métrique d'évaluation
def accuracy(yhat,y):
    # y encode les indexes, s'assurer de la bonne taille de tenseur
    assert len(y.shape)==1 or y.size(1)==1
    return (torch.argmax(yhat,1).view(y.size(0),-1)== y.view(-1,1)).float().mean()

In [None]:
from tqdm import tqdm
device = "cpu"

def train(model,epochs,train_loader,test_loader):
    #writer = SummaryWriter(f"{TB_PATH}/{model.name}")
    optim = torch.optim.Adam(model.parameters(),lr=5e-4)    # choix optimizer
    model = model.to(device)
    print(f"running {model.name}")
    loss = nn.CrossEntropyLoss()                            # choix loss
    # 
    # loss = nn.CrossEntropyLoss(weight=cl_weight.to(device))                            # choix loss
    for epoch in tqdm(range(epochs)):
        cumloss, cumacc, count = 0, 0, 0
        model.train()
        for x, lengths, y in tqdm(train_loader):                            # boucle sur les batchs
            optim.zero_grad()
            x,y = x.to(device), y.to(device)                # y doit être un tensor (pas un int)
            yhat = model(x)
            l = loss(yhat,y)
            l.backward()
            optim.step()
            cumloss += l*len(x)                             # attention, il peut y avoir un batch + petit (le dernier)
            cumacc += accuracy(yhat,y)*len(x)
            count += len(x)
        #writer.add_scalar('loss/train',cumloss/count,epoch)
        #writer.add_scalar('accuracy/train',cumacc/count,epoch)
        if epoch % 2 == 0:
            model.eval()
            with torch.no_grad():
                cumloss, cumacc, count = 0, 0, 0
                for x, lengths, y in test_loader:
                    x,y = x.to(device), y.to(device)
                    yhat = model(x)
                    cumloss += loss(yhat,y)*len(x)
                    cumacc += accuracy(yhat,y)*len(x)
                    count += len(x)
                #writer.add_scalar(f'loss/test',cumloss/count,epoch)
                #writer.add_scalar('accuracy/test',cumacc/count,epoch)

In [None]:
n_epoch = 1
train(sent_clf, n_epoch, train_loader, test_loader)

running SentModel-Sun Feb 16 22:11:59 2025


  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([259, 8, 128])




torch.Size([246, 8, 128])




torch.Size([249, 8, 128])




torch.Size([255, 8, 128])


  1%|          | 3/250 [00:11<15:46,  3.83s/it]
  0%|          | 0/1 [00:11<?, ?it/s]


KeyboardInterrupt: 

## 5 Évaluation du modèle Custom

## 6 HyperParameter tuning

In [None]:
from optuna import Study


def objective(trial):
    return


study = 