In [1]:
from datasets import load_dataset
import sklearn
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import random
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments

In [2]:
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

# Load the tokenizer and the masked-LM model from the same pretrained checkpoint
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base')

# Freeze everything except the last 3 encoder layers.
#
# The layer index sits in the *middle* of a parameter name
# (e.g. "roberta.encoder.layer.11.output.dense.weight"), and names end in a
# suffix such as ".weight" / ".bias". It therefore has to be read from the
# dotted path component that follows "layer", not from the last character.
NUM_TRAINABLE_ENCODER_LAYERS = 3
# Index of the first layer left trainable (9 for a 12-layer encoder).
first_trainable_layer = model.config.num_hidden_layers - NUM_TRAINABLE_ENCODER_LAYERS

for name, param in model.named_parameters():
    name_parts = name.split('.')
    layer_num = None
    if 'encoder' in name_parts and 'layer' in name_parts:
        index_position = name_parts.index('layer') + 1
        if index_position < len(name_parts) and name_parts[index_position].isdigit():
            layer_num = int(name_parts[index_position])

    # Set the flag explicitly in both directions so re-running the cell is idempotent:
    # only parameters of the last encoder layers stay trainable; embeddings, the LM head
    # and the lower encoder layers are frozen.
    param.requires_grad = layer_num is not None and layer_num >= first_trainable_layer

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [None]:
# Names of the parameters that are still trainable after freezing.
# `named_parameters` is a method: it has to be called, otherwise the cell only
# displays the bound-method repr instead of the parameters.
[name for name, param in model.named_parameters() if param.requires_grad]

In [None]:
# Small hand-written corpus of spoken-English sentences, one string per sentence
list_of_spoken_english_sentences = [
    "I am walking",
    "I love pytorch",
    "I don't know where it is",
]

In [None]:
from typing import List, Tuple

def get_cloze_dataset(sentences: List[str], mask_token: str = "<mask>") -> List[Tuple[str, str, str]]:
    """Build one cloze (fill-in-the-blank) example per word of every sentence.

    Args:
        sentences: Raw sentences. Each one is split on whitespace.
        mask_token: Placeholder written in place of the hidden word. It must be
            the mask token of the tokenizer that will encode the sentences. The
            default is XLM-RoBERTa's ``<mask>``; a BERT-style ``[MASK]`` is not
            a special token for that tokenizer and would never be located
            afterwards.

    Returns:
        A list of ``(masked_sentence, input_sentence, label)`` tuples, where
        ``masked_sentence`` has exactly one word replaced by ``mask_token``,
        ``input_sentence`` is the whitespace-normalised original sentence and
        ``label`` is the word that was hidden. Empty input (or sentences with
        no words) yields no examples.
    """
    cloze_dataset = []
    for sentence in sentences:
        # Plain whitespace split. TODO: use another (proper) tokenizer here.
        tokens = sentence.split()

        # Identical for every example derived from this sentence, so build it once.
        input_sentence = " ".join(tokens)

        for i, token in enumerate(tokens):
            # Copy of the tokens with only position i replaced by the mask.
            masked_tokens = tokens[:i] + [mask_token] + tokens[i + 1:]
            masked_sentence = " ".join(masked_tokens)

            # Tuple order is (masked, original, hidden word).
            cloze_dataset.append((masked_sentence, input_sentence, token))
    return cloze_dataset

In [None]:
# Expand every sentence into one cloze example per whitespace-separated word;
# each entry is a (masked_sentence, original_sentence, masked_word) tuple.
dataset = get_cloze_dataset(list_of_spoken_english_sentences)

In [None]:
def tokenize_sentence(sentence):
    """Encode one sentence with the notebook-level ``tokenizer``.

    Special tokens are added, and the sequence is truncated and padded to a
    fixed length of 128. The result is whatever ``tokenizer.encode_plus``
    returns when asked for PyTorch tensors (callers index it with
    ``['input_ids']`` / ``['attention_mask']``).
    """
    encode_options = dict(
        return_tensors='pt',
        add_special_tokens=True,
        padding='max_length',
        max_length=128,
        truncation=True,
    )
    return tokenizer.encode_plus(sentence, **encode_options)

In [None]:
def get_masked_index(inputs):
    """Return the position of the first mask token in the first sequence.

    Only row 0 of ``inputs['input_ids']`` is inspected. The position is
    returned as a 0-dim tensor. If the row contains no mask token, indexing
    the empty result raises ``IndexError``.
    """
    first_sequence = inputs['input_ids'][0]
    is_mask = first_sequence == tokenizer.mask_token_id
    mask_positions = torch.where(is_mask)[0]
    return mask_positions[0]

In [None]:
def get_masked_word_tensor(masked_word):
    """Return the vocabulary id of ``masked_word`` as a 0-dim tensor.

    The word is run through the tokenizer itself (without special tokens)
    rather than looked up with ``convert_tokens_to_ids``. That lookup expects
    an exact vocabulary piece, so a raw word would resolve to a different piece
    than the one the tokenizer produces in a sentence, or to the unknown token.

    A single mask position can only hold one piece, so when the word is split
    into several sub-word pieces only the first one is returned.

    Raises:
        ValueError: if the tokenizer produces no piece for ``masked_word``
            (e.g. an empty string).
    """
    piece_ids = tokenizer.encode(masked_word, add_special_tokens=False)
    if not piece_ids:
        raise ValueError(f"Cannot tokenize masked word {masked_word!r}")
    masked_word_tensor = torch.tensor(piece_ids[0])
    return masked_word_tensor

In [None]:
def replace_masked_token(inputs, masked_word_tensor):
    """Overwrite the mask position of the first sequence with ``masked_word_tensor``.

    The position comes from ``get_masked_index``. ``inputs['input_ids']`` is
    modified in place and the same ``inputs`` object is returned.
    """
    position = get_masked_index(inputs)
    first_sequence = inputs['input_ids'][0]  # a view: writing to it updates `inputs`
    first_sequence[position] = masked_word_tensor
    return inputs

In [None]:
def create_labels(inputs):
    """Return a copy of the first input sequence with the mask position set to -100.

    ``inputs`` itself is left untouched; the labels are written into a clone of
    ``inputs['input_ids']``. The mask position comes from ``get_masked_index``,
    so ``inputs`` must still contain the mask token when this is called.

    NOTE(review): -100 is presumably meant as the loss "ignore" value. As
    written, the masked position is the one excluded while every other position
    keeps its token id as target. Confirm this is the intended labelling for a
    cloze task.
    """
    label_rows = inputs['input_ids'].clone()
    position = get_masked_index(inputs)
    first_row = label_rows[0]
    first_row[position] = -100
    return first_row

In [None]:
class ClozeDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns cloze tuples into masked-LM training examples.

    ``data`` is a sequence of ``(masked_sentence, sentence, masked_word)``
    tuples. ``masked_sentence`` must contain the tokenizer's mask token,
    otherwise ``get_masked_index`` cannot locate the position to supervise.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example.

        ``input_ids`` keep the mask token, so the model actually has to predict
        the hidden word. ``labels`` are -100 (the value the loss ignores)
        everywhere except at the mask position, which holds the id of the
        hidden word.
        """
        # The unmasked sentence is carried in the tuple but not needed here.
        masked_sentence, _sentence, masked_word = self.data[index]

        inputs = tokenize_sentence(masked_sentence)
        input_ids = inputs['input_ids'][0]

        # Locate the mask while it is still present in the input.
        masked_index = get_masked_index(inputs)

        # Supervise only the masked position; padding and visible tokens are ignored.
        labels = torch.full_like(input_ids, -100)
        labels[masked_index] = get_masked_word_tensor(masked_word)

        return {
            'input_ids': input_ids,
            'attention_mask': inputs['attention_mask'][0],
            'labels': labels
        }

## Training

In [None]:
# Collator handed to the Trainer to assemble batches; configured here for
# masked-language-modelling with a masking probability of 0.15.
# NOTE(review): presumably this collator applies its own random masking and
# builds its own `labels` at batch time, which would supersede the `labels`
# returned by ClozeDataset. Confirm against the DataCollatorForLanguageModeling
# documentation whether that is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
import evaluate
# NOTE(review): `metric` is not referenced by any other cell shown here, and the
# Trainer is built without `compute_metrics`. Confirm that "cross-entropy" is an
# identifier `evaluate.load` can resolve (TODO verify), or drop this cell.
metric = evaluate.load("cross-entropy")

In [None]:
# Datasets consumed by the Trainer.
# `train_dataset` has to be defined before the Trainer is built, and because
# evaluation runs every `eval_steps` steps the Trainer also needs an
# `eval_dataset`. Both are carved out of the cloze tuples in `dataset`, with a
# fixed seed so the split is reproducible across kernel restarts.
train_examples, eval_examples = train_test_split(dataset, test_size=0.2, random_state=42)
train_dataset = ClozeDataset(train_examples)
eval_dataset = ClozeDataset(eval_examples)

# training arguments and trainer
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy = "steps",   # evaluate on `eval_dataset` ...
    eval_steps = 5,                  # ... every 5 optimisation steps
    save_total_limit = 2,            # keep at most 2 checkpoints on disk
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

In [None]:
# Fine-tune the model on the training dataset.
# Runs the loop configured in `training_args` (3 epochs, evaluation every 5 steps);
# outputs go under `output_dir` ('./results').
trainer.train()