In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import sys

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

DATASET_PATH = Path("./data/text")

VOC_SIZE = 1000

def load_data(datapath, max_size=None):
    texts_files = list(datapath.glob("*.txt"))
    texts = []  
    for files in texts_files:
        with open(files, "r") as files:
            text = files.readlines()
            texts += text
    texts = list(set(texts))
    return texts

texts = load_data(DATASET_PATH)

from tokenizers import Tokenizer
# from transformers import AutoTokenizer
from transformers import BertTokenizer, BertForMaskedLM
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model_checkpoint = "distilgpt2"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenizer.pre_tokenizer = Whitespace()
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

inputs = tokenizer(texts, return_tensors='pt', max_length=100
                   , truncation=True, padding='max_length')

inputs['labels'] = inputs.input_ids.detach().clone()

rand = torch.rand(inputs.input_ids.shape)

In [36]:
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [37]:
import random as rd

In [38]:
selection = []

for i in range(mask_arr.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
        )

selection[:5]

for i in range(mask_arr.shape[0]):
    inputs.input_ids[i, selection[i]] = 103 # application du token [MASK]

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, idx):
        self.encodings = encodings
        self.idx = idx
        self.encodings = {key: [val[i] for i in self.idx] for key, val in self.encodings.items()}
        
    def __getitem__(self, idx):
        return {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings['input_ids'])

sample_idx = [i for i in range(len(inputs.input_ids))]

shuffled_sample_idx = rd.sample(sample_idx, len(sample_idx))

train_idx = shuffled_sample_idx[:int(0.70*len(shuffled_sample_idx))]
val_idx = shuffled_sample_idx[int(0.70*len(shuffled_sample_idx)):int(0.85*len(shuffled_sample_idx))]
test_idx = shuffled_sample_idx[int(0.85*len(shuffled_sample_idx)):]
                                
dataset_train = CustomDataset(inputs, train_idx)
dataset_val = CustomDataset(inputs, val_idx)
dataset_test = CustomDataset(inputs, test_idx)

train_dataloaded = torch.utils.data.DataLoader(dataset_train, batch_size=16, shuffle=True)
val_dataloaded = torch.utils.data.DataLoader(dataset_val, batch_size=16, shuffle=True)
test_dataloaded = torch.utils.data.DataLoader(dataset_test, batch_size=16, shuffle=True)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class MLM_model(nn.Module):
    def __init__(self):
        super(MLM_model, self).__init__()
        self.history = {"epochs":[], "test":[]}

    def forward(self, x):
        return model(x)
    
    def train_log(self, train_batch_losses, val_batch_losses, train_loss, validation_loss):
        self.history["epochs"].append({"train_batch_losses":train_batch_losses, 
                                "val_batch_losses":val_batch_losses, 
                                "train_loss":train_loss, 
                                "validation_loss":validation_loss})
    
    def test_log(self, test_batch_losses, test_loss):
        self.history["test"].append({"test_batch_losses":test_batch_losses,
                                "test_loss":test_loss})

model = MLM_model()
model.to(device)

In [71]:
def train_step(module, batch, batch_idx, optimizer):
    module.train(True)
    
    inputs_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    
    outputs = module(inputs_ids, attention_mask, labels=labels)
    
    loss = outputs.loss
    print(f"\n\033[1;37mBatch loss {batch_idx+1} : {loss.item()}")
    loss.backward()
    
    torch.nn.utils.clip_grad_norm_(module.parameters(), max_norm=1.0)
    optimizer.step()
    optimizer.zero_grad()
    
    return module, loss

def eval_step(module, batch, batch_idx, optimizer=None, training=True):
    with torch.no_grad():
        
        inputs_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        outputs = module(inputs_ids, attention_mask, labels=labels)
    
        loss = outputs.loss
         
        if training:
            print(f"\n\033[1;32mValidation Batch loss {batch_idx+1} : {loss.item()}")
            return module, loss
        else:
            print(f"\n\033[1;32mTest Batch loss {batch_idx+1} : {loss.item()}")
            return module, loss, outputs, labels

def train_loop(module, EPOCHS, train_dataset, val_dataset, optimizer, lr_scheduler=None):
    for epoch in range(EPOCHS):
        
        module.train(True)
        
        train_batch_losses = []
        for batch_idx in range(len(train_dataset)):
            batch = next(iter(train_dataset))
            module, loss = train_step(module, batch, batch_idx, optimizer)
            train_batch_losses.append(loss.item())
            
        if lr_scheduler is not None:
          lr_scheduler.step()
        train_loss = np.mean(train_batch_losses)

        module.train(False)
        val_batch_losses = []
        for batch_idx in range(len(val_dataset)):
            batch = next(iter(val_dataset))
            module, loss = eval_step(module, batch, batch_idx)
            val_batch_losses.append(loss.item())
        val_loss = np.mean(val_batch_losses)

        module.train_log(train_batch_losses, val_batch_losses, train_loss, val_loss)
        print(f"\n\033[1;33mEpoch {epoch+1} :\n\033[1;37mTraining Loss : {train_loss}")
        print(f"\033[1;32mValidation Loss : {val_loss}")
    return module

def evaluate(module, test_dataset):
    module.train(False)
    test_batch_losses = []
    predictions = []
    true_targets = []
    for batch_idx in range(len(test_dataset)):
        batch = next(iter(test_dataset))
        module, loss, outputs, labels = eval_step(module, batch, batch_idx, training=False)

        test_batch_losses.append(loss.item())
        predictions.append(outputs)
        true_targets.append(labels)

    test_loss = np.mean(test_batch_losses)
    module.test_log(test_batch_losses, test_loss)
    print(f"\nTest Loss : {test_loss}")
    return predictions, true_targets

In [None]:
if __name__ == "__main__":
    EPOCHS = 100
    LR = 1e-4
    
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, eps=5e-8)
    module = train_loop(module=model, 
                        EPOCHS=EPOCHS, 
                        train_dataset=train_dataloaded, 
                        val_dataset=val_dataloaded,
                        optimizer=optimizer)
    predictions, true_targets = evaluate(module, 
                                         test_dataloaded, 
                                         nn.SmoothL1Loss(reduction='mean'))



  return {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}



[1;37mBatch loss 1 : 10.61193561553955

[1;37mBatch loss 2 : 7.5935139656066895

[1;37mBatch loss 3 : 6.683854579925537

[1;37mBatch loss 4 : 4.44725227355957

[1;37mBatch loss 5 : 3.9821932315826416

[1;37mBatch loss 6 : 3.674833297729492

[1;37mBatch loss 7 : 3.031693696975708

[1;37mBatch loss 8 : 2.7303740978240967

[1;37mBatch loss 9 : 2.252066135406494

[1;37mBatch loss 10 : 1.5920186042785645

[1;37mBatch loss 11 : 1.526757836341858

[1;37mBatch loss 12 : 1.1759874820709229

[1;37mBatch loss 13 : 0.8581132292747498

[1;37mBatch loss 14 : 0.7906060218811035

[1;37mBatch loss 15 : 0.778357982635498

[1;37mBatch loss 16 : 0.6566455960273743

[1;37mBatch loss 17 : 0.4071978032588959

[1;37mBatch loss 18 : 0.3631610870361328

[1;37mBatch loss 19 : 0.4673421084880829

[1;37mBatch loss 20 : 0.3310314118862152

[1;37mBatch loss 21 : 0.29903626441955566

[1;37mBatch loss 22 : 0.20867100358009338

[1;37mBatch loss 23 : 0.15724527835845947

[1;37mBatch loss 24 : 0.

In [49]:
print(inputs.input_ids.max())
print(inputs.input_ids.min())

tensor(29674)
tensor(0)
