In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import sys

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import time
import gc

DATASET_PATH = Path("./data/text")

VOC_SIZE = 1000

def load_data(datapath, max_size=None):
    texts_files = list(datapath.glob("*.txt"))
    texts = []  
    for files in texts_files:
        with open(files, "r", encoding='utf8') as files:
            text = files.readlines()
            texts += text
    texts = list(set(texts))
    
    return texts

texts = load_data(DATASET_PATH)

from tokenizers import Tokenizer
# from transformers import AutoTokenizer
from transformers import BertForMaskedLM, AutoTokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace


model_checkpoint = "bert-base-uncased"

tokenizerBERT_FT = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
modelBERT_FT = BertForMaskedLM.from_pretrained(model_checkpoint)

inputs = tokenizerBERT_FT(texts, return_tensors='pt', max_length=100
                   , truncation=True, padding='max_length')

inputs['labels'] = inputs.input_ids.detach().clone()

print(inputs.tokens(1))


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

['[CLS]', 'cr', '##uc', '##ifer', '##es', ',', 'col', '##za', 'flora', '##ison', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [2]:
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [3]:
import random as rd

In [4]:
# apply the [MASK] token with the mask array 
inputs.input_ids[mask_arr] = 103

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, idx):
        self.encodings = encodings
        self.idx = idx
        self.encodings = {key: [val[i] for i in self.idx] for key, val in self.encodings.items()}
        
    def __getitem__(self, idx):
        return {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings['input_ids'])

sample_idx = [i for i in range(len(inputs.input_ids))]

shuffled_sample_idx = rd.sample(sample_idx, len(sample_idx))

train_idx = shuffled_sample_idx[:int(0.70*len(shuffled_sample_idx))]
val_idx = shuffled_sample_idx[int(0.70*len(shuffled_sample_idx)):int(0.85*len(shuffled_sample_idx))]
test_idx = shuffled_sample_idx[int(0.85*len(shuffled_sample_idx)):]
                                
dataset_train = CustomDataset(inputs, train_idx)
dataset_val = CustomDataset(inputs, val_idx)
dataset_test = CustomDataset(inputs, test_idx)

train_dataloaded = torch.utils.data.DataLoader(dataset_train, batch_size=16, shuffle=True)
val_dataloaded = torch.utils.data.DataLoader(dataset_val, batch_size=16, shuffle=True)
test_dataloaded = torch.utils.data.DataLoader(dataset_test, batch_size=16, shuffle=True)

In [5]:
#class MLM_model(nn.Module):
#    def __init__(self, model):
#        super(MLM_model, self).__init__()
#        self.history = {"epochs":[], "test":[]}
#        self.model = model
    
#    def parameters(self):
#        return self.model.parameters()

#    def forward(self, x, attention_mask, labels):
#        return self.model(x, attention_mask, labels)
    
#    def train_log(self, train_batch_losses, val_batch_losses, train_loss, validation_loss):
#        self.history["epochs"].append({"train_batch_losses":train_batch_losses, 
#                                "val_batch_losses":val_batch_losses, 
#                                "train_loss":train_loss, 
#                                "validation_loss":validation_loss})
    
#    def test_log(self, test_batch_losses, test_loss):
#        self.history["test"].append({"test_batch_losses":test_batch_losses,
#                                "test_loss":test_loss})

In [6]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#model = MLM_model(model)
modelBERT_FT.to(device)
print(device)

cuda


In [7]:
def train_step(module, batch, batch_idx, optimizer):
    module.train(True)
    
    inputs_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    
    outputs = module(inputs_ids, attention_mask, labels=labels)
    
    loss = outputs.loss
    print(f"\n\033[1;37mBatch loss {batch_idx+1} : {loss.item()}")
    loss.backward()
    
    torch.nn.utils.clip_grad_norm_(module.parameters(), max_norm=1.0)
    optimizer.step()
    optimizer.zero_grad()
    
    return module, loss

def eval_step(module, batch, batch_idx, optimizer=None, training=True):
    if training == False :
        module.to('cpu')
    with torch.no_grad():
        
        inputs_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        outputs = module(inputs_ids, attention_mask, labels=labels)
    
        loss = outputs.loss
         
        if training:
            print(f"\n\033[1;32mValidation Batch loss {batch_idx+1} : {loss.item()}")
            return module, loss
        else:
            print(f"\n\033[1;32mTest Batch loss {batch_idx+1} : {loss.item()}")
            return module, loss, outputs, labels

def train_loop(module, EPOCHS, train_dataset, val_dataset, optimizer, lr_scheduler=None):
    for epoch in range(EPOCHS):
        deb=time.time()
        
        module.train(True)
        
        train_batch_losses = []
        for batch_idx in range(len(train_dataset)):
            batch = next(iter(train_dataset))
            module, loss = train_step(module, batch, batch_idx, optimizer)
            train_batch_losses.append(loss.item())
            
        if lr_scheduler is not None:
          lr_scheduler.step()
        train_loss = np.mean(train_batch_losses)

        module.train(False)
        val_batch_losses = []
        for batch_idx in range(len(val_dataset)):
            batch = next(iter(val_dataset))
            module, loss = eval_step(module, batch, batch_idx)
            val_batch_losses.append(loss.item())
        val_loss = np.mean(val_batch_losses)

#        module.train_log(train_batch_losses, val_batch_losses, train_loss, val_loss)
        print(f"\n\033[1;33mEpoch {epoch+1} :\n\033[1;37mTraining Loss : {train_loss}")
        print(f"\033[1;32mValidation Loss : {val_loss}")
        print(f"\033[1;31mDurée epoch : {time.time()-deb} secondes")
    return module

def evaluate(module, test_dataset):
    module.train(False)
    test_batch_losses = []
    predictions = []
    true_targets = []
    for batch_idx in range(len(test_dataset)):
        batch = next(iter(test_dataset))
        module, loss, outputs, labels = eval_step(module, batch, batch_idx, training=False)

        test_batch_losses.append(loss.item())
        predictions.append(outputs)
        true_targets.append(labels)

    test_loss = np.mean(test_batch_losses)
#    module.test_log(test_batch_losses, test_loss)
    print(f"\nTest Loss : {test_loss}")
    return predictions, true_targets

In [8]:
if __name__ == "__main__":
    EPOCHS = 1
    LR = 1e-4
    
    optimizer = torch.optim.Adam(modelBERT_FT.parameters(), lr=LR, eps=5e-8)
    module = train_loop(module=modelBERT_FT,
                        EPOCHS=EPOCHS, 
                        train_dataset=train_dataloaded, 
                        val_dataset=val_dataloaded,
                        optimizer=optimizer)
    device = 'cpu'
    predictions, true_targets = evaluate(module, 
                                         test_dataloaded)



  return {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}



[1;37mBatch loss 1 : 10.015976905822754

[1;37mBatch loss 2 : 7.16989803314209

[1;37mBatch loss 3 : 5.97263240814209

[1;37mBatch loss 4 : 5.151910781860352

[1;37mBatch loss 5 : 3.9510135650634766

[1;37mBatch loss 6 : 3.5560452938079834

[1;37mBatch loss 7 : 3.0861096382141113

[1;37mBatch loss 8 : 2.552812099456787

[1;37mBatch loss 9 : 2.1852333545684814

[1;37mBatch loss 10 : 1.7060927152633667

[1;37mBatch loss 11 : 1.5518842935562134

[1;37mBatch loss 12 : 1.2483863830566406

[1;37mBatch loss 13 : 0.9671221375465393

[1;37mBatch loss 14 : 0.7905038595199585

[1;37mBatch loss 15 : 0.658480167388916

[1;37mBatch loss 16 : 0.4967593252658844

[1;37mBatch loss 17 : 0.40568307042121887

[1;37mBatch loss 18 : 0.41909778118133545

[1;37mBatch loss 19 : 0.38154059648513794

[1;37mBatch loss 20 : 0.32720473408699036

[1;37mBatch loss 21 : 0.2920907437801361

[1;37mBatch loss 22 : 0.3172966241836548

[1;37mBatch loss 23 : 0.2509572207927704

[1;37mBatch loss 24 :


[1;37mBatch loss 187 : 0.04398718848824501

[1;37mBatch loss 188 : 0.04692811146378517

[1;37mBatch loss 189 : 0.09959032386541367

[1;37mBatch loss 190 : 0.06682781875133514

[1;37mBatch loss 191 : 0.0643148198723793

[1;37mBatch loss 192 : 0.05941101163625717

[1;37mBatch loss 193 : 0.03663650527596474

[1;37mBatch loss 194 : 0.05901409685611725

[1;37mBatch loss 195 : 0.06762278079986572

[1;37mBatch loss 196 : 0.07380421459674835

[1;37mBatch loss 197 : 0.06400299817323685

[1;37mBatch loss 198 : 0.09313797205686569

[1;37mBatch loss 199 : 0.04881320893764496

[1;37mBatch loss 200 : 0.0973479300737381

[1;37mBatch loss 201 : 0.05228410288691521

[1;37mBatch loss 202 : 0.09519591927528381

[1;37mBatch loss 203 : 0.07802216708660126

[1;37mBatch loss 204 : 0.06779863685369492

[1;37mBatch loss 205 : 0.04498574882745743

[1;37mBatch loss 206 : 0.05348963662981987

[1;37mBatch loss 207 : 0.0867738127708435

[1;37mBatch loss 208 : 0.08564994484186172

[1;37mBatch


[1;37mBatch loss 369 : 0.030529938638210297

[1;37mBatch loss 370 : 0.03899368271231651

[1;37mBatch loss 371 : 0.04775836318731308

[1;37mBatch loss 372 : 0.06384135037660599

[1;37mBatch loss 373 : 0.050309695303440094

[1;37mBatch loss 374 : 0.055152975022792816

[1;37mBatch loss 375 : 0.06687694787979126

[1;37mBatch loss 376 : 0.03596597909927368

[1;37mBatch loss 377 : 0.03613467141985893

[1;37mBatch loss 378 : 0.049278534948825836

[1;37mBatch loss 379 : 0.04847557470202446

[1;37mBatch loss 380 : 0.061841823160648346

[1;37mBatch loss 381 : 0.04260622337460518

[1;37mBatch loss 382 : 0.09594631940126419

[1;37mBatch loss 383 : 0.044280119240283966

[1;37mBatch loss 384 : 0.050975095480680466

[1;37mBatch loss 385 : 0.022749679163098335

[1;37mBatch loss 386 : 0.06224841624498367

[1;37mBatch loss 387 : 0.07056812196969986

[1;37mBatch loss 388 : 0.06944659352302551

[1;37mBatch loss 389 : 0.0303342342376709

[1;37mBatch loss 390 : 0.04486014321446419

[


[1;37mBatch loss 550 : 0.012240168638527393

[1;37mBatch loss 551 : 0.06540932506322861

[1;37mBatch loss 552 : 0.025776639580726624

[1;37mBatch loss 553 : 0.03813769295811653

[1;37mBatch loss 554 : 0.017622720450162888

[1;37mBatch loss 555 : 0.06668172776699066

[1;37mBatch loss 556 : 0.02338511496782303

[1;37mBatch loss 557 : 0.04723621904850006

[1;37mBatch loss 558 : 0.04028474912047386

[1;37mBatch loss 559 : 0.03333281725645065

[1;37mBatch loss 560 : 0.0349687896668911

[1;37mBatch loss 561 : 0.03669969365000725

[1;37mBatch loss 562 : 0.03190365061163902

[1;37mBatch loss 563 : 0.032557740807533264

[1;37mBatch loss 564 : 0.018017416819930077

[1;37mBatch loss 565 : 0.03861093893647194

[1;37mBatch loss 566 : 0.045645929872989655

[1;37mBatch loss 567 : 0.05149345099925995

[1;37mBatch loss 568 : 0.038020577281713486

[1;37mBatch loss 569 : 0.02989261783659458

[1;37mBatch loss 570 : 0.013011613860726357

[1;37mBatch loss 571 : 0.0485173799097538

[1


[1;37mBatch loss 731 : 0.012831399217247963

[1;37mBatch loss 732 : 0.0627390518784523

[1;37mBatch loss 733 : 0.04997320473194122

[1;37mBatch loss 734 : 0.013624458573758602

[1;37mBatch loss 735 : 0.03252684697508812

[1;37mBatch loss 736 : 0.01564124785363674

[1;37mBatch loss 737 : 0.023759357631206512

[1;37mBatch loss 738 : 0.02560504525899887

[1;37mBatch loss 739 : 0.03899998962879181

[1;37mBatch loss 740 : 0.027096092700958252

[1;37mBatch loss 741 : 0.03998586907982826

[1;37mBatch loss 742 : 0.02774680405855179

[1;37mBatch loss 743 : 0.020962834358215332

[1;37mBatch loss 744 : 0.05647410824894905

[1;37mBatch loss 745 : 0.03185613080859184

[1;37mBatch loss 746 : 0.03649086132645607

[1;37mBatch loss 747 : 0.034211646765470505

[1;37mBatch loss 748 : 0.035404909402132034

[1;37mBatch loss 749 : 0.040070995688438416

[1;37mBatch loss 750 : 0.042590152472257614

[1;37mBatch loss 751 : 0.09169220179319382

[1;37mBatch loss 752 : 0.027819547802209854




[1;32mValidation Batch loss 120 : 0.07025668770074844

[1;32mValidation Batch loss 121 : 0.04002432897686958

[1;32mValidation Batch loss 122 : 0.0417574942111969

[1;32mValidation Batch loss 123 : 0.02263101376593113

[1;32mValidation Batch loss 124 : 0.02179848589003086

[1;32mValidation Batch loss 125 : 0.0452536903321743

[1;32mValidation Batch loss 126 : 0.027805175632238388

[1;32mValidation Batch loss 127 : 0.04959941282868385

[1;32mValidation Batch loss 128 : 0.043229278177022934

[1;32mValidation Batch loss 129 : 0.10605429857969284

[1;32mValidation Batch loss 130 : 0.06026044860482216

[1;32mValidation Batch loss 131 : 0.044485971331596375

[1;32mValidation Batch loss 132 : 0.058913879096508026

[1;32mValidation Batch loss 133 : 0.10013172775506973

[1;32mValidation Batch loss 134 : 0.031843315809965134

[1;32mValidation Batch loss 135 : 0.0347464382648468

[1;32mValidation Batch loss 136 : 0.031696390360593796

[1;32mValidation Batch loss 137 : 0.0392737


[1;32mTest Batch loss 113 : 0.04744454473257065

[1;32mTest Batch loss 114 : 0.015269845724105835

[1;32mTest Batch loss 115 : 0.05768264830112457

[1;32mTest Batch loss 116 : 0.07395852357149124

[1;32mTest Batch loss 117 : 0.08158751577138901

[1;32mTest Batch loss 118 : 0.05849819630384445

[1;32mTest Batch loss 119 : 0.08079972118139267

[1;32mTest Batch loss 120 : 0.03491669520735741

[1;32mTest Batch loss 121 : 0.072422094643116

[1;32mTest Batch loss 122 : 0.07711680233478546

[1;32mTest Batch loss 123 : 0.040944747626781464

[1;32mTest Batch loss 124 : 0.05172586068511009

[1;32mTest Batch loss 125 : 0.053630877286195755

[1;32mTest Batch loss 126 : 0.10420077294111252

[1;32mTest Batch loss 127 : 0.019975222647190094

[1;32mTest Batch loss 128 : 0.08123281598091125

[1;32mTest Batch loss 129 : 0.07940315455198288

[1;32mTest Batch loss 130 : 0.018268810585141182

[1;32mTest Batch loss 131 : 0.0802968442440033

[1;32mTest Batch loss 132 : 0.0399785190820694

In [9]:
print(torch.cuda.memory_stats())

OrderedDict({'active.all.allocated': 1087630, 'active.all.current': 406, 'active.all.freed': 1087224, 'active.all.peak': 1024, 'active.large_pool.allocated': 506367, 'active.large_pool.current': 152, 'active.large_pool.freed': 506215, 'active.large_pool.peak': 381, 'active.small_pool.allocated': 581263, 'active.small_pool.current': 254, 'active.small_pool.freed': 581009, 'active.small_pool.peak': 721, 'active_bytes.all.allocated': 4169757224960, 'active_bytes.all.current': 894567424, 'active_bytes.all.freed': 4168862657536, 'active_bytes.all.peak': 3127888384, 'active_bytes.large_pool.allocated': 4083723288064, 'active_bytes.large_pool.current': 893321216, 'active_bytes.large_pool.freed': 4082829966848, 'active_bytes.large_pool.peak': 3116457984, 'active_bytes.small_pool.allocated': 86033936896, 'active_bytes.small_pool.current': 1246208, 'active_bytes.small_pool.freed': 86032690688, 'active_bytes.small_pool.peak': 13091328, 'allocated_bytes.all.allocated': 4169757224960, 'allocated_by

In [10]:
print(inputs.input_ids.max())
print(inputs.input_ids.min())

tensor(29674)
tensor(0)


In [11]:
# enregistrement du modèle

modelBERT_FT.save_pretrained('./saves/model/BERT_FT')
tokenizerBERT_FT.save_pretrained('./saves/tokenizer/BERT_FT')

('./saves/tokenizer/BERT_FT\\tokenizer_config.json',
 './saves/tokenizer/BERT_FT\\special_tokens_map.json',
 './saves/tokenizer/BERT_FT\\vocab.txt',
 './saves/tokenizer/BERT_FT\\added_tokens.json',
 './saves/tokenizer/BERT_FT\\tokenizer.json')