In [1]:
from datasets import load_dataset
import sklearn
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import random
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments

In [2]:
train_data = load_dataset("open_subtitles", split="train[:10%]", lang1="en", lang2="fr")

Found cached dataset open_subtitles (/home/onyxia/.cache/huggingface/datasets/open_subtitles/en-fr-lang1=en,lang2=fr/0.0.0/c1ec973ca4b6e588740d8f167cc0e24ea3f626e70bc7ffe467e944730500e198)


In [3]:
test_data = train_data.select(range(500, 600))
train_data = train_data.select(range(100000))


In [4]:
train_data

Dataset({
    features: ['id', 'meta', 'translation'],
    num_rows: 100000
})

## Vérifier paramètre entrainable et modifier la boucle

In [5]:
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

# Load the tokenizer and model
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base')

In [6]:
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of trainable parameters before freezing layers:", num_trainable_params)

Number of trainable parameters before freezing layers: 278295186


In [7]:


# Freeze all layers except the last 3 layers
for name, param in model.named_parameters():
    if 'encoder' in name:
        # Check if the last character of the name is a digit
        if name[-1].isdigit():
            layer_num = int(name[-1])
            # Fine-tune the last 3 layers of the encoder
            if layer_num >= 9:
                param.requires_grad = True
        else:
            # Skip this parameter if it doesn't have a valid layer number
            continue
    else:
        # Freeze all other layers
        param.requires_grad = False

In [8]:
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of trainable parameters:", num_trainable_params)

Number of trainable parameters: 85054464


In [None]:
model.ro

In [7]:
model.named_parameters

<bound method Module.named_parameters of XLMRobertaForMaskedLM(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0): XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768

In [None]:
# Load the OpenSubtitles dataset

# Preprocess the sentences to create a cloze dataset
def preprocess_sentence(sentence):
    tokenized = tokenizer.encode(sentence, add_special_tokens=False)
    for i, token_id in enumerate(tokenized):
        masked_tokenized = list(tokenized)
        masked_tokenized[i] = tokenizer.mask_token_id
        yield masked_tokenized, [token_id] + [-100] * (len(tokenized) - 1)

train_cloze_data = []

text = train_data['translation']

list_of_spoken_english_sentences = []

# Recuperer les uniquement les phrases. 
for elem in text:
    
    sentence = elem["en"]
    train_cloze_data += list(preprocess_sentence(sentence))

In [9]:
text = train_data['translation']

list_of_spoken_english_sentences = []

# Recuperer les uniquement les phrases. 
for d in text:
    
    tex = d["en"]
    list_of_spoken_english_sentences.append(tex)

# Print sentencessss
#print(list_of_spoken_english_sentences)

In [9]:
from typing import List, Tuple

def get_cloze_dataset(sentences: List[str]) -> List[Tuple[str, str, str]]:
    cloze_dataset = []
    for sentence in sentences:
        # Split the sentence into tokens
        tokens = sentence.split() # utiliser un autre tokenizer ! 

        # Loop over the tokens and create a cloze sentence for each
        for i, token in enumerate(tokens):
            # Create a copy of the tokens with the current token masked out
            masked_tokens = tokens[:]
            masked_tokens[i] = "[MASK]"

            # Create the input sentence, masked sentence, and label
            input_sentence = " ".join(tokens)
            masked_sentence = " ".join(masked_tokens)
            label = token

            # Add the tuple to the cloze dataset
            cloze_dataset.append(( masked_sentence,input_sentence, label))
    return cloze_dataset

In [10]:
dataset = get_cloze_dataset(list_of_spoken_english_sentences)

In [12]:
def tokenize_sentence(sentence):
    inputs = tokenizer.encode_plus(
        sentence,
        return_tensors='pt',
        add_special_tokens=True,
        padding='max_length',
        max_length=128,
        truncation=True,
    )
    return inputs

In [13]:
def get_masked_index(inputs):
    masked_index = torch.where(inputs['input_ids'][0] == tokenizer.mask_token_id)[0][0]
    return masked_index

In [14]:
def get_masked_word_tensor(masked_word):
    masked_word_tensor = torch.tensor(tokenizer.convert_tokens_to_ids(masked_word))
    return masked_word_tensor

In [15]:
def replace_masked_token(inputs, masked_word_tensor):
    masked_index = get_masked_index(inputs)
    inputs['input_ids'][0][masked_index] = masked_word_tensor
    return inputs

In [16]:
def create_labels(inputs):
    labels = inputs['input_ids'].clone()
    masked_index = get_masked_index(inputs)
    labels[0][masked_index] = -100
    return labels[0]

In [20]:
from torch.utils.data import Dataset

class ClozeDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        masked_sentence, sentence, masked_word = self.data[index]
        
        inputs = tokenize_sentence(masked_sentence)
        masked_word_tensor = get_masked_word_tensor(masked_word)
        inputs = replace_masked_token(inputs, masked_word_tensor)
        labels = create_labels(inputs)
        
        return {
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0],
            'labels': labels
        }

## Training

### https://towardsdatascience.com/transformers-retraining-roberta-base-using-the-roberta-mlm-procedure-7422160d5764

In [10]:
text = test_data['translation']

list_of_spoken_english_sentences_test = []

# Recuperer les uniquement les phrases. 
for d in text:
    
    tex = d["en"]
    list_of_spoken_english_sentences_test.append(tex)

# Print sentencessss
#print(list_of_spoken_english_sentences_test)


In [11]:
# train
with open('my_file_train.txt', 'w') as file:
    for item in list_of_spoken_english_sentences:
        file.write(f"{item}\n")
    print('Done')

Done


In [10]:
# test
with open('my_file_test.txt', 'w') as file:
    for item in list_of_spoken_english_sentences_test:
        file.write(f"{item}\n")
    print('Done')

Done


In [12]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="my_file_train.txt",
    block_size=512,
)



In [12]:
dataset_test = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="my_file_test.txt",
    block_size=512,
)

In [13]:
dataset[0]

{'input_ids': tensor([    0,    87,  8306, 48869,   297,  8108,     2])}

In [13]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [14]:
import torch.nn as nn
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Convert arrays to tensors
    predictions = torch.from_numpy(predictions)
    labels = torch.from_numpy(labels)
    # Calculate the test loss
    loss = nn.CrossEntropyLoss()(predictions, labels).item()
    return {"test_loss": loss}

In [15]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./roberta-trained",
    overwrite_output_dir=True,
    #evaluation_strategy = "steps",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=3,
    seed=1,
    #eval_steps= 50,
    #evaluation_strategy = "steps",
    #eval_accumulation_steps = 16,
    learning_rate=0.000001
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    #eval_dataset = dataset_test,
    #compute_metrics=compute_metrics
    
)

In [16]:
trainer.train()

***** Running training *****
  Num examples = 100000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 37500
  Number of trainable parameters = 85054464


Step,Training Loss
500,2.477
1000,2.3603
1500,2.3762
2000,2.1719
2500,2.1889
3000,2.2228
3500,2.201
4000,2.1855
4500,2.2047
5000,2.132


Saving model checkpoint to ./roberta-retrained/checkpoint-1000
Configuration saved in ./roberta-retrained/checkpoint-1000/config.json
Model weights saved in ./roberta-retrained/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./roberta-retrained/checkpoint-2000
Configuration saved in ./roberta-retrained/checkpoint-2000/config.json
Model weights saved in ./roberta-retrained/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-200] due to args.save_total_limit
Saving model checkpoint to ./roberta-retrained/checkpoint-3000
Configuration saved in ./roberta-retrained/checkpoint-3000/config.json
Model weights saved in ./roberta-retrained/checkpoint-3000/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-250] due to args.save_total_limit
Saving model checkpoint to ./roberta-retrained/checkpoint-4000
Configuration saved in ./roberta-retrained/checkpoint-4000/config.json
Model weights saved in ./roberta-retrained/checkpoint

TrainOutput(global_step=37500, training_loss=2.093136355794271, metrics={'train_runtime': 4984.4995, 'train_samples_per_second': 60.187, 'train_steps_per_second': 7.523, 'total_flos': 3747969054813600.0, 'train_loss': 2.093136355794271, 'epoch': 3.0})

In [65]:
from transformers import AutoConfig, RobertaForMaskedLM 

In [66]:
config = AutoConfig.from_pretrained("./roberta-retrained/checkpoint-12000")
model = RobertaForMaskedLM(config=config)

loading configuration file ./roberta-retrained/checkpoint-12000/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "./roberta-retrained/checkpoint-12000",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



In [69]:
model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base')

loading configuration file config.json from cache at /home/onyxia/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/42f548f32366559214515ec137cdd16002968bf6/config.json
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file pytorch_model.bin from cache at /home/onyxia/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/42f548f32366559214515ec137cd

In [70]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model= model,
    tokenizer=tokenizer
)
fill_mask("Send these <mask> back!")

[{'score': 0.2370229959487915,
  'token': 22317,
  'token_str': 'links',
  'sequence': 'Send these links back!'},
 {'score': 0.06669365614652634,
  'token': 55769,
  'token_str': 'items',
  'sequence': 'Send these items back!'},
 {'score': 0.06505978852510452,
  'token': 6305,
  'token_str': 'tips',
  'sequence': 'Send these tips back!'},
 {'score': 0.06319175660610199,
  'token': 69141,
  'token_str': 'tags',
  'sequence': 'Send these tags back!'},
 {'score': 0.04327935352921486,
  'token': 4136,
  'token_str': 'cookies',
  'sequence': 'Send these cookies back!'}]

In [22]:
from transformers import XLMRobertaForMaskedLM

model_path = "./XLM-roberta-retrained-EN/checkpoint-37000"
model = XLMRobertaForMaskedLM.from_pretrained(model_path)


loading configuration file ./XLM-roberta-retrained-EN/checkpoint-37000/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file ./XLM-roberta-retrained-EN/checkpoint-37000/pytorch_model.bin
All model checkpoint weights were used when initializing XLMRobertaForMaskedLM.

All the weights

In [23]:
model.roberta._modules.keys()

odict_keys(['embeddings', 'encoder'])

In [19]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of trainable parameters:", num_trainable_params)


roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.layer.0.attention.output.LayerNorm.bias
roberta.encoder.layer.0.intermediate.dense.weight
roberta.encoder.layer.0.intermediate.dense.bias
roberta.encoder.layer.0.output.dense.weight
roberta.encoder.layer.0.output.dense.bias
roberta.encoder.layer.0.output.LayerNorm.weight
roberta.encoder.layer.0.output.LayerNorm

In [20]:
model.roberta.pooler

In [21]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("./XLM-roberta-retrained-EN/checkpoint-37000")



loading configuration file ./XLM-roberta-retrained-EN/checkpoint-37000/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "./XLM-roberta-retrained-EN/checkpoint-37000",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file ./XLM-roberta-retrained-EN/checkpoint-37000/pytorch_model.bin
Some weights of the model checkpoint at ./XLM-roberta-retrained-EN/ch

In [None]:
num_trainable_params = [sum(p.numel() for p in layer.parameters() if p.requires_grad) for layer in model.encoder.layer[-3:]]
print("Number of trainable parameters in last 3 layers:", num_trainable_params)