In [1]:
# Import des bibliothèques

import os
from pathlib import Path
import numpy as np
import pandas as pd
import sys
import time
import gc
import random as rd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from tokenizers import Tokenizer
# from transformers import AutoTokenizer
from transformers import BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace


In [2]:
# Détermination du Path

DATASET_PATH = Path("./data/text")


In [3]:
# Récupération des données textuelles

VOC_SIZE = 1000

def load_data(datapath, max_size=None):
    texts_files = list(datapath.glob("*.txt"))
    texts = []  
    for files in texts_files:
        with open(files, "r", encoding='utf8') as files:
            text = files.readlines()
            texts += text
    texts = list(set(texts))
    
    return texts

texts = load_data(DATASET_PATH)

In [4]:
# Chargement du modèle CamemBERTav2 et tokenisation du texte

model_checkpoint = "almanach/camembertav2-base"

tokenizerCamemBERTaV2_FT = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
modelCamemBERTaV2_FT = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

inputs = tokenizerCamemBERTaV2_FT(texts, return_tensors='pt', max_length=100, 
                   truncation=True, padding='max_length')

inputs['labels'] = inputs.input_ids.detach().clone()

print(inputs.tokens(1))


Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at almanach/camembertav2-base and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['[CLS]', 'Fres', '##h', 'for', '##ages', 'Gi', '##ant', 'sensi', '##tive', '(', 'Mim', '##osa', 'inv', '##isa', '),', 'le', '##aves', ',', 'ste', '##ms', 'and', 'flow', '##ers', ',', 'fres', '##h', 'Gi', '##ant', 'sensi', '##tive', '(', 'Mim', '##osa', 'inv', '##isa', 'Coll', '##a', '),', 'le', '##aves', ',', 'ste', '##ms', 'and', 'flow', '##ers', ',', 'fres', '##h', '\n', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [5]:
# Préparation des données pour le Modèle de language maskey

rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)

inputs.input_ids[mask_arr] = 103

sample_idx = [i for i in range(len(inputs.input_ids))]

shuffled_sample_idx = rd.sample(sample_idx, len(sample_idx))

train_idx = shuffled_sample_idx[:int(0.70*len(shuffled_sample_idx))]
val_idx = shuffled_sample_idx[int(0.70*len(shuffled_sample_idx)):int(0.85*len(shuffled_sample_idx))]
test_idx = shuffled_sample_idx[int(0.85*len(shuffled_sample_idx)):]

In [6]:
# Préparation du dataset

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, idx):
        self.encodings = encodings
        self.idx = idx
        self.encodings = {key: [val[i] for i in self.idx] for key, val in self.encodings.items()}
        
    def __getitem__(self, idx):
        return {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings['input_ids'])

dataset_train = CustomDataset(inputs, train_idx)
dataset_val = CustomDataset(inputs, val_idx)
dataset_test = CustomDataset(inputs, test_idx)

train_dataloaded = torch.utils.data.DataLoader(dataset_train, batch_size=16, shuffle=True)
val_dataloaded = torch.utils.data.DataLoader(dataset_val, batch_size=16, shuffle=True)
test_dataloaded = torch.utils.data.DataLoader(dataset_test, batch_size=16, shuffle=True)

In [7]:
#class MLM_model(nn.Module):
#    def __init__(self, model):
#        super(MLM_model, self).__init__()
#        self.history = {"epochs":[], "test":[]}
#        self.model = model
    
#    def parameters(self):
#        return self.model.parameters()

#    def forward(self, x, attention_mask, labels):
#        return self.model(x, attention_mask, labels)
    
#    def train_log(self, train_batch_losses, val_batch_losses, train_loss, validation_loss):
#        self.history["epochs"].append({"train_batch_losses":train_batch_losses, 
#                                "val_batch_losses":val_batch_losses, 
#                                "train_loss":train_loss, 
#                                "validation_loss":validation_loss})
    
#    def test_log(self, test_batch_losses, test_loss):
#        self.history["test"].append({"test_batch_losses":test_batch_losses,
#                                "test_loss":test_loss})

In [8]:
# Définition du device 

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#model = MLM_model(model)
modelCamemBERTaV2_FT.to(device)
print(device)

cuda


In [9]:
# Apprentissage

def train_step(module, batch, batch_idx, optimizer):
    module.train(True)
    
    inputs_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    
    outputs = module(inputs_ids, attention_mask, labels=labels)
    
    loss = outputs.loss
    print(f"\n\033[1;37mBatch loss {batch_idx+1} : {loss.item()}")
    loss.backward()
    
    torch.nn.utils.clip_grad_norm_(module.parameters(), max_norm=1.0)
    optimizer.step()
    optimizer.zero_grad()
    
    return module, loss

def eval_step(module, batch, batch_idx, optimizer=None, training=True):
    if training == False :
            module.to('cpu')
            
    with torch.no_grad():
            
        inputs_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        outputs = module(inputs_ids, attention_mask, labels=labels)
    
        loss = outputs.loss
         
        if training:
            print(f"\n\033[1;32mValidation Batch loss {batch_idx+1} : {loss.item()}")
            return module, loss
        else:
            print(f"\n\033[1;32mTest Batch loss {batch_idx+1} : {loss.item()}")
            return module, loss, outputs, labels

def train_loop(module, EPOCHS, train_dataset, val_dataset, optimizer, lr_scheduler=None):
    for epoch in range(EPOCHS):
        deb=time.time()
        
        module.train(True)
        
        train_batch_losses = []
        for batch_idx in range(len(train_dataset)):
            batch = next(iter(train_dataset))
            module, loss = train_step(module, batch, batch_idx, optimizer)
            train_batch_losses.append(loss.item())
            
        if lr_scheduler is not None:
          lr_scheduler.step()
        train_loss = np.mean(train_batch_losses)

        module.train(False)
        val_batch_losses = []
        for batch_idx in range(len(val_dataset)):
            batch = next(iter(val_dataset))
            module, loss = eval_step(module, batch, batch_idx)
            val_batch_losses.append(loss.item())
        val_loss = np.mean(val_batch_losses)

#        module.train_log(train_batch_losses, val_batch_losses, train_loss, val_loss)
        print(f"\n\033[1;33mEpoch {epoch+1} :\n\033[1;37mTraining Loss : {train_loss}")
        print(f"\033[1;32mValidation Loss : {val_loss}")
        print(f"\033[1;31mDurée epoch : {time.time()-deb} secondes")
    return module

def evaluate(module, test_dataset):
    module.train(False)
    test_batch_losses = []
    predictions = []
    true_targets = []
    for batch_idx in range(len(test_dataset)):
        batch = next(iter(test_dataset))
        module, loss, outputs, labels = eval_step(module, batch, batch_idx, training=False)

        test_batch_losses.append(loss.item())
        predictions.append(outputs)
        true_targets.append(labels)

    test_loss = np.mean(test_batch_losses)
#    module.test_log(test_batch_losses, test_loss)
    print(f"\nTest Loss : {test_loss}")
    return predictions, true_targets

In [10]:
# Entrainement

if __name__ == "__main__":
    EPOCHS = 1
    LR = 1e-4
    
    optimizer = torch.optim.Adam(modelCamemBERTaV2_FT.parameters(), lr=LR, eps=5e-8)
    module = train_loop(module=modelCamemBERTaV2_FT,
                        EPOCHS=EPOCHS,
                        train_dataset=train_dataloaded, 
                        val_dataset=val_dataloaded,
                        optimizer=optimizer)
    device = 'cpu'
    predictions, true_targets = evaluate(module, 
                                         test_dataloaded)



  return {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}



[1;37mBatch loss 1 : 16.722492218017578

[1;37mBatch loss 2 : 14.218109130859375

[1;37mBatch loss 3 : 11.939130783081055

[1;37mBatch loss 4 : 9.944234848022461

[1;37mBatch loss 5 : 9.848074913024902

[1;37mBatch loss 6 : 7.555875778198242

[1;37mBatch loss 7 : 6.836812973022461

[1;37mBatch loss 8 : 6.171835899353027

[1;37mBatch loss 9 : 5.471433639526367

[1;37mBatch loss 10 : 5.411044120788574

[1;37mBatch loss 11 : 4.816878795623779

[1;37mBatch loss 12 : 3.6743950843811035

[1;37mBatch loss 13 : 3.9125168323516846

[1;37mBatch loss 14 : 3.7844576835632324

[1;37mBatch loss 15 : 3.7364253997802734

[1;37mBatch loss 16 : 3.679506540298462

[1;37mBatch loss 17 : 3.646343946456909

[1;37mBatch loss 18 : 3.3670647144317627

[1;37mBatch loss 19 : 3.4528441429138184

[1;37mBatch loss 20 : 3.236177682876587

[1;37mBatch loss 21 : 3.2011797428131104

[1;37mBatch loss 22 : 2.7212440967559814

[1;37mBatch loss 23 : 3.011889934539795

[1;37mBatch loss 24 : 2.815845


[1;37mBatch loss 191 : 0.41186758875846863

[1;37mBatch loss 192 : 0.5777398943901062

[1;37mBatch loss 193 : 0.3341621458530426

[1;37mBatch loss 194 : 0.45269259810447693

[1;37mBatch loss 195 : 0.3569335639476776

[1;37mBatch loss 196 : 0.5589039325714111

[1;37mBatch loss 197 : 0.43584609031677246

[1;37mBatch loss 198 : 0.42100611329078674

[1;37mBatch loss 199 : 0.40156546235084534

[1;37mBatch loss 200 : 0.6085429191589355

[1;37mBatch loss 201 : 0.5818394422531128

[1;37mBatch loss 202 : 0.4414876699447632

[1;37mBatch loss 203 : 0.49805641174316406

[1;37mBatch loss 204 : 0.7147306203842163

[1;37mBatch loss 205 : 0.5031110644340515

[1;37mBatch loss 206 : 0.8079074621200562

[1;37mBatch loss 207 : 0.48511531949043274

[1;37mBatch loss 208 : 0.6972393989562988

[1;37mBatch loss 209 : 0.3946278393268585

[1;37mBatch loss 210 : 0.4996216893196106

[1;37mBatch loss 211 : 0.3665066957473755

[1;37mBatch loss 212 : 0.4743770658969879

[1;37mBatch loss 213 : 


[1;37mBatch loss 376 : 0.4643150269985199

[1;37mBatch loss 377 : 0.4792516231536865

[1;37mBatch loss 378 : 0.2123057246208191

[1;37mBatch loss 379 : 0.35334545373916626

[1;37mBatch loss 380 : 0.3295416235923767

[1;37mBatch loss 381 : 0.4040817320346832

[1;37mBatch loss 382 : 0.4929461181163788

[1;37mBatch loss 383 : 0.2450319230556488

[1;37mBatch loss 384 : 0.5611326098442078

[1;37mBatch loss 385 : 0.40208011865615845

[1;37mBatch loss 386 : 0.2663513720035553

[1;37mBatch loss 387 : 0.16368921101093292

[1;37mBatch loss 388 : 0.23266801238059998

[1;37mBatch loss 389 : 0.2758752703666687

[1;37mBatch loss 390 : 0.29366835951805115

[1;37mBatch loss 391 : 0.19291557371616364

[1;37mBatch loss 392 : 0.3920094966888428

[1;37mBatch loss 393 : 0.4728275239467621

[1;37mBatch loss 394 : 0.18893557786941528

[1;37mBatch loss 395 : 0.1926065981388092

[1;37mBatch loss 396 : 0.28047120571136475

[1;37mBatch loss 397 : 0.19021488726139069

[1;37mBatch loss 398 


[1;37mBatch loss 561 : 0.12271640449762344

[1;37mBatch loss 562 : 0.2650049328804016

[1;37mBatch loss 563 : 0.1413939744234085

[1;37mBatch loss 564 : 0.28863486647605896

[1;37mBatch loss 565 : 0.2336103320121765

[1;37mBatch loss 566 : 0.2165108621120453

[1;37mBatch loss 567 : 0.2008366733789444

[1;37mBatch loss 568 : 0.21868690848350525

[1;37mBatch loss 569 : 0.2613925635814667

[1;37mBatch loss 570 : 0.3823314607143402

[1;37mBatch loss 571 : 0.29909470677375793

[1;37mBatch loss 572 : 0.3502751886844635

[1;37mBatch loss 573 : 0.21332374215126038

[1;37mBatch loss 574 : 0.14541026949882507

[1;37mBatch loss 575 : 0.35166671872138977

[1;37mBatch loss 576 : 0.22683580219745636

[1;37mBatch loss 577 : 0.2986803948879242

[1;37mBatch loss 578 : 0.16736063361167908

[1;37mBatch loss 579 : 0.47920936346054077

[1;37mBatch loss 580 : 0.25590115785598755

[1;37mBatch loss 581 : 0.25252047181129456

[1;37mBatch loss 582 : 0.396200567483902

[1;37mBatch loss 58


[1;37mBatch loss 746 : 0.2224452942609787

[1;37mBatch loss 747 : 0.21815776824951172

[1;37mBatch loss 748 : 0.6051424145698547

[1;37mBatch loss 749 : 0.12747715413570404

[1;37mBatch loss 750 : 0.11171707510948181

[1;37mBatch loss 751 : 0.09987130761146545

[1;37mBatch loss 752 : 0.19854050874710083

[1;37mBatch loss 753 : 0.15268386900424957

[1;37mBatch loss 754 : 0.14032113552093506

[1;37mBatch loss 755 : 0.16440314054489136

[1;37mBatch loss 756 : 0.19442817568778992

[1;37mBatch loss 757 : 0.14426018297672272

[1;37mBatch loss 758 : 0.20946034789085388

[1;37mBatch loss 759 : 0.15253442525863647

[1;37mBatch loss 760 : 0.18218082189559937

[1;37mBatch loss 761 : 0.16835670173168182

[1;37mBatch loss 762 : 0.19252046942710876

[1;37mBatch loss 763 : 0.15227138996124268

[1;37mBatch loss 764 : 0.13974331319332123

[1;37mBatch loss 765 : 0.6498284339904785

[1;37mBatch loss 766 : 0.157866969704628

[1;32mValidation Batch loss 1 : 0.4209543466567993

[1;32


[1;32mValidation Batch loss 134 : 0.3941113352775574

[1;32mValidation Batch loss 135 : 0.3049771189689636

[1;32mValidation Batch loss 136 : 0.25088730454444885

[1;32mValidation Batch loss 137 : 0.16182729601860046

[1;32mValidation Batch loss 138 : 0.29607540369033813

[1;32mValidation Batch loss 139 : 0.1549631655216217

[1;32mValidation Batch loss 140 : 0.341058611869812

[1;32mValidation Batch loss 141 : 0.20114916563034058

[1;32mValidation Batch loss 142 : 0.1646166294813156

[1;32mValidation Batch loss 143 : 0.1666606068611145

[1;32mValidation Batch loss 144 : 0.20352187752723694

[1;32mValidation Batch loss 145 : 0.24417105317115784

[1;32mValidation Batch loss 146 : 0.1192338690161705

[1;32mValidation Batch loss 147 : 0.1274835169315338

[1;32mValidation Batch loss 148 : 0.22404661774635315

[1;32mValidation Batch loss 149 : 0.2060168832540512

[1;32mValidation Batch loss 150 : 0.38998138904571533

[1;32mValidation Batch loss 151 : 0.2719190716743469

[


[1;32mTest Batch loss 131 : 0.16864356398582458

[1;32mTest Batch loss 132 : 0.32925987243652344

[1;32mTest Batch loss 133 : 0.16733045876026154

[1;32mTest Batch loss 134 : 0.22559458017349243

[1;32mTest Batch loss 135 : 0.36517512798309326

[1;32mTest Batch loss 136 : 0.1668383628129959

[1;32mTest Batch loss 137 : 0.39194488525390625

[1;32mTest Batch loss 138 : 0.18670716881752014

[1;32mTest Batch loss 139 : 0.12083400785923004

[1;32mTest Batch loss 140 : 0.1169017106294632

[1;32mTest Batch loss 141 : 0.21968159079551697

[1;32mTest Batch loss 142 : 0.44437626004219055

[1;32mTest Batch loss 143 : 0.12909828126430511

[1;32mTest Batch loss 144 : 0.15705902874469757

[1;32mTest Batch loss 145 : 0.20313486456871033

[1;32mTest Batch loss 146 : 0.1308278888463974

[1;32mTest Batch loss 147 : 0.1911388635635376

[1;32mTest Batch loss 148 : 0.2127782702445984

[1;32mTest Batch loss 149 : 0.2720476984977722

[1;32mTest Batch loss 150 : 0.17322678864002228

[1;3

In [11]:
print(torch.cuda.memory_stats())

OrderedDict({'active.all.allocated': 1895106, 'active.all.current': 408, 'active.all.freed': 1894698, 'active.all.peak': 1028, 'active.large_pool.allocated': 1189601, 'active.large_pool.current': 152, 'active.large_pool.freed': 1189449, 'active.large_pool.peak': 489, 'active.small_pool.allocated': 705505, 'active.small_pool.current': 256, 'active.small_pool.freed': 705249, 'active.small_pool.peak': 725, 'active_bytes.all.allocated': 9850197817344, 'active_bytes.all.current': 910587904, 'active_bytes.all.freed': 9849287229440, 'active_bytes.all.peak': 5074602496, 'active_bytes.large_pool.allocated': 9740387545088, 'active_bytes.large_pool.current': 909324288, 'active_bytes.large_pool.freed': 9739478220800, 'active_bytes.large_pool.peak': 5065654272, 'active_bytes.small_pool.allocated': 109810272256, 'active_bytes.small_pool.current': 1263616, 'active_bytes.small_pool.freed': 109809008640, 'active_bytes.small_pool.peak': 10062336, 'allocated_bytes.all.allocated': 9850197817344, 'allocate

In [12]:
print(inputs.input_ids.max())
print(inputs.input_ids.min())

tensor(32348)
tensor(0)


In [13]:
# enregistrement du modèle

modelCamemBERTaV2_FT.save_pretrained('./saves/model/CamemBERTaV2_FT')
tokenizerCamemBERTaV2_FT.save_pretrained('./saves/tokenizer/CamemBERTaV2_FT')

('./saves/tokenizer/CamemBERTaV2_FT\\tokenizer_config.json',
 './saves/tokenizer/CamemBERTaV2_FT\\special_tokens_map.json',
 './saves/tokenizer/CamemBERTaV2_FT\\vocab.txt',
 './saves/tokenizer/CamemBERTaV2_FT\\added_tokens.json',
 './saves/tokenizer/CamemBERTaV2_FT\\tokenizer.json')