# Approche deep learning - Transformers

## Chargement des données

In [4]:
from pathlib import Path
from dataset import load_data_film


DATASET_PATH_TRAIN = Path("data/aclImdb/train")
DATASET_PATH_TEST = Path("data/aclImdb/test")

NB_DOC_MAX = 1000 # par classe
IMDB_CLASSES  = ['neg','pos']
VOC_SIZE = 10000
BATCH_SIZE = 8


txts, files, filelabels = load_data_film(DATASET_PATH_TRAIN, max_size = NB_DOC_MAX)
txts_test, files_test, filelabels_test = load_data_film(DATASET_PATH_TEST, max_size = NB_DOC_MAX)

On va utiliser le modèle suivant, il s'agot d'un modèle bert, finetuné pour de la classification de sentiment
https://huggingface.co/tabularisai/multilingual-sentiment-analysis

# Approche on the shelf - Modèle HuggingFace

In [21]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_sentiment(texts):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}
    return [sentiment_map[p] for p in torch.argmax(probabilities, dim=-1).tolist()]


In [None]:
predict_sentiment(txts[0])

So, I know that I voted 1 out of 10 but really this deserves no more than half of a star. I hated it. It was so stupid and unrealistic, I can't believe any of the stars signed on to make this ridiculously absurd project.<br /><br />James G. and Cathrine O'Hara were excellent in their characters and Ben Affleck and Christina Applegate were just as good too, but the story sucked and I encourage anyone who sees this in the video store to not even bother picking it up and reading the back cover, but to just walk away...I don't even want to get into what the movie is about, because it is too stupid to pontificate about.<br /><br />Don't rent this! It's horrible! Horrible!


['Very Negative']

## 1 Eval

## 2 Finetuning

# Approche custom - Modèle maison

## 1 Tokenizer

On construit un tokenizer à l'aides de la librairie tokenizers d'HuggingFace. On utilise sensiblement les même paramètres que pour le tokenizer de Bert en changeant la taille de vocabulaire.

In [6]:
from tokenizers import Tokenizer

import os

if os.path.isfile('bert-wiki.json'):
    tokenizer = Tokenizer.from_file("bert-wiki.json")
else :  

    from tokenizers.models import WordPiece
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, Lowercase, StripAccents
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=VOC_SIZE, 
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        )

    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files, trainer)
    tokenizer.save("bert-wiki.json")

PAD = tokenizer.encode('[CLS]').ids[0]

## 2 Data loader

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import sys
import re


class TextDataset(Dataset):
    def __init__(self, texts: list, labels):
        self.labels = labels
        self.phrasesnum = texts

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return self.phrasesnum[i], torch.tensor(self.labels[i])

## create  datasets
ds_train = TextDataset(txts,filelabels)
ds_test  = TextDataset(txts_test,filelabels_test)


def collate_fn(batch):
    sequences, labels = zip(*batch)
    sequences  = [torch.tensor(tokenizer.encode('[CLS] ' + seq).ids) for seq in sequences]
    lengths = [len(seq) for seq in sequences]
    padded_sequences = pad_sequence(sequences, batch_first=False)
    return padded_sequences, torch.tensor(lengths), torch.tensor(labels)


## create dataloader
train_loader = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_fn)
test_loader = DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)



In [8]:
batch =next(iter( train_loader))
padded_sequences, lengths, labels = batch

print("Padded sequences:", padded_sequences)

print(padded_sequences.size(), lengths.max())
print("Labels:", labels)


Padded sequences: tensor([[   1,    1,    1,  ...,    1,    1,    1],
        [ 235, 2201,   45,  ...,  250,  209,  250],
        [ 517,  776, 7095,  ...,  276, 4860,  233],
        ...,
        [5126,    0,    0,  ...,    0,    0,    0],
        [  20,    0,    0,  ...,    0,    0,    0],
        [ 209,    0,    0,  ...,    0,    0,    0]])
torch.Size([249, 8]) tensor(249)
Labels: tensor([1, 1, 1, 1, 0, 0, 1, 1])


## 3 Model definition

In [9]:
## see the file models.py for model definition

import models


In [10]:
import time

## Network creation
emb_size = 128
voc_size = VOC_SIZE
num_layers = 4
num_heads = 4
hidden_size_mlp = 128
output_size = 2

# build network
sent_clf = models.SentModel( emb_size, voc_size, num_layers, num_heads, hidden_size_mlp , output_size, PAD)
sent_clf.name = "SentModel-"+time.asctime()



## 4 Train loop

In [11]:
import os

def save_model(model,fichier): 
      """ sauvegarde du modèle dans fichier """
      state = {'model_state': model.state_dict()}
      torch.save(state,fichier) 
 
def load_model(fichier,model):
      """ Si le fichier existe, on charge le modèle  """
      if os.path.isfile(fichier):
          state = torch.load(fichier)
          model.load_state_dict(state['model_state'])
      else:
           print("Erreur de chargement du fichier")

In [12]:
# définition de la métrique d'évaluation
def accuracy(yhat,y):
    # y encode les indexes, s'assurer de la bonne taille de tenseur
    assert len(y.shape)==1 or y.size(1)==1
    return (torch.argmax(yhat,1).view(y.size(0),-1)== y.view(-1,1)).float().mean()

In [13]:
from tqdm import tqdm
device = "cpu"

def train(model,epochs,train_loader,test_loader):
    #writer = SummaryWriter(f"{TB_PATH}/{model.name}")
    optim = torch.optim.Adam(model.parameters(),lr=5e-4)    # choix optimizer
    model = model.to(device)
    print(f"running {model.name}")
    loss = nn.CrossEntropyLoss()                            # choix loss
    # 
    # loss = nn.CrossEntropyLoss(weight=cl_weight.to(device))                            # choix loss
    for epoch in tqdm(range(epochs)):
        cumloss, cumacc, count = 0, 0, 0
        model.train()
        for x, lengths, y in tqdm(train_loader):                            # boucle sur les batchs
            optim.zero_grad()
            x,y = x.to(device), y.to(device)                # y doit être un tensor (pas un int)
            yhat = model(x)
            l = loss(yhat,y)
            l.backward()
            optim.step()
            cumloss += l*len(x)                             # attention, il peut y avoir un batch + petit (le dernier)
            cumacc += accuracy(yhat,y)*len(x)
            count += len(x)
        #writer.add_scalar('loss/train',cumloss/count,epoch)
        #writer.add_scalar('accuracy/train',cumacc/count,epoch)
        if epoch % 2 == 0:
            model.eval()
            with torch.no_grad():
                cumloss, cumacc, count = 0, 0, 0
                for x, lengths, y in test_loader:
                    x,y = x.to(device), y.to(device)
                    yhat = model(x)
                    cumloss += loss(yhat,y)*len(x)
                    cumacc += accuracy(yhat,y)*len(x)
                    count += len(x)
                #writer.add_scalar(f'loss/test',cumloss/count,epoch)
                #writer.add_scalar('accuracy/test',cumacc/count,epoch)

In [14]:
n_epoch = 1
train(sent_clf, n_epoch, train_loader, test_loader)

running SentModel-Sun Feb 16 22:11:59 2025


  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([259, 8, 128])




torch.Size([246, 8, 128])




torch.Size([249, 8, 128])




torch.Size([255, 8, 128])


  1%|          | 3/250 [00:11<15:46,  3.83s/it]
  0%|          | 0/1 [00:11<?, ?it/s]


KeyboardInterrupt: 

## 5 Performance evaluation

## 6 HyperParameter tuning

In [None]:
from optuna import Study


def objective(trial):
    return


study = 