Reference: https://github.com/sleepingcat4/bert-textgeneration

## Install the libraries

In [1]:
!pip install datasets
!pip install accelerate
!pip install sentencepiece
!pip install --upgrade huggingface_hub
!pip -q install git+https://github.com/huggingface/transformers

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-

In [2]:
# Authenticate against the Hugging Face Hub. login() renders an interactive
# widget in the notebook (see the VBox output below) instead of taking a
# hard-coded token.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load CSV File

In [3]:
# Mount Google Drive under /content/drive so the CSV stored there can be read
# by path. This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from datasets import load_dataset

# Exercises exported as CSV on the mounted Drive. The rows are exposed under the
# 'train' split of the returned object, which is what the next cells index.
data_path = '/content/drive/MyDrive/MALIN/exos_to_csv.csv'
dataset = load_dataset('csv', data_files=data_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Split the dataset into 3 sets: 80% train, then the held-out 20% is halved
# into validation and test (10% each).
# A fixed seed keeps the membership of each split identical across kernel
# restarts; without it every run shuffles the rows differently.
SPLIT_SEED = 42
train_testsplit = dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
test_validsplit = train_testsplit['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

In [6]:
# Give the three splits their final names. The second split call reuses the
# 'train'/'test' keys, so its 'train' half serves as the validation set.
train_ds = train_testsplit['train']
valid_ds, test_ds = test_validsplit['train'], test_validsplit['test']

In [7]:
# Show the training split's columns and row count.
train_ds

Dataset({
    features: ['filename', 'consigne', 'enonce'],
    num_rows: 1022
})

In [8]:
# Show the validation split's columns and row count.
valid_ds

Dataset({
    features: ['filename', 'consigne', 'enonce'],
    num_rows: 128
})

In [9]:
# Show the test split's columns and row count.
test_ds

Dataset({
    features: ['filename', 'consigne', 'enonce'],
    num_rows: 128
})

## Load Model

In [10]:
import torch
import torch.nn.functional as F

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [11]:
from transformers import BertTokenizer, BertForMaskedLM

In [12]:
# Load the masked-language-model head and its matching tokenizer from the same
# checkpoint, and move the model to the device selected earlier.
MODEL_NAME = "bert-base-multilingual-cased"
model = BertForMaskedLM.from_pretrained(MODEL_NAME)
model = model.to(device)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

## Random word selection with [MASK]

In [13]:
import random

In [14]:
def mask_random_batch(batch, field_names, mask_proba=0.15, rng=None):
    """Replace a random share of the whitespace-separated words with ``[MASK]``.

    For each requested field, every text in the batch is split on whitespace
    and ``int(len(words) * mask_proba)`` distinct words (at least one, at most
    all of them) are replaced by the literal ``'[MASK]'``. The words are then
    re-joined with single spaces.

    Args:
        batch: Mapping from column name to a list of texts, as passed by
            ``Dataset.map(..., batched=True)``.
        field_names: Names of the columns to mask.
        mask_proba: Fraction of the words of each text to mask.
        rng: Optional ``random.Random`` instance used to pick the positions.
            When omitted, the module-level ``random`` generator is used, so
            ``random.seed(...)`` controls the result.

    Returns:
        A dict with one ``'masked_<field>'`` key per requested field, each
        holding a list aligned with the input rows. Values that are not strings
        or contain no words (``None``, ``''``, whitespace only) are passed
        through unchanged, since there is nothing to mask in them.
    """
    sampler = random if rng is None else rng
    masked_fields = {f'masked_{field}': [] for field in field_names}

    for field in field_names:
        masked_column = masked_fields[f'masked_{field}']
        for sentence in batch[field]:
            # Missing CSV cells arrive as None; treat them like empty text.
            tokens = sentence.split() if isinstance(sentence, str) else []
            if not tokens:
                # random.sample() would raise on an empty population.
                masked_column.append(sentence)
                continue

            # At least one word is masked; never ask for more words than exist.
            num_tokens_to_mask = min(len(tokens), max(1, int(len(tokens) * mask_proba)))
            for idx in sampler.sample(range(len(tokens)), num_tokens_to_mask):
                tokens[idx] = '[MASK]'
            masked_column.append(' '.join(tokens))

    # Return the new fields
    return masked_fields

In [15]:
fields_to_mask = ['consigne', 'enonce']

# mask_random_batch picks positions with the global `random` generator; seed it
# here so re-running the notebook masks the same words.
MASK_SEED = 42
random.seed(MASK_SEED)

# Map the dataset: adds one 'masked_<field>' column per entry of fields_to_mask.
masked_train = train_ds.map(lambda example: mask_random_batch(example, fields_to_mask),
                              batched=True,
                              batch_size=8)

Map:   0%|          | 0/1022 [00:00<?, ? examples/s]

In [16]:
# Peek at the masked instruction ('masked_consigne') of the first row.
masked_train[0]["masked_consigne"]

'Complète les expressions avec le mot qui [MASK]'

In [17]:
# Peek at the masked statement ('masked_enonce') of the first row.
masked_train[0]["masked_enonce"]

'main ◆ pied [MASK] tête ◆ [MASK] a. hocher la … b. [MASK] des … c. tendre la … d. donner un coup de …'

## Model Generation

In [18]:
def generate_predictions(batch, field_name, device):
    """Replace every ``[MASK]`` in ``batch[field_name]`` with a sampled token.

    Each text is split into tokens by the global ``tokenizer``, wrapped in the
    ``[CLS]``/``[SEP]`` markers BERT expects around its input, and passed
    through the global ``model``. For every masked position one token is drawn
    at random from the softmax distribution over the vocabulary (sampling, not
    argmax), so repeated calls can return different sentences. All masks of a
    window are predicted from the same forward pass, independently of each
    other.

    Texts longer than the model's position limit are processed in consecutive
    windows rather than fed whole, which would overflow the position
    embeddings.

    Args:
        batch: Mapping from column name to a list of values, as passed by
            ``Dataset.map(..., batched=True)``.
        field_name: Column holding the masked texts.
        device: Device the input tensors are created on; it has to be the
            device the global ``model`` lives on.

    Returns:
        ``{'generated_<field_name>': [...]}`` with one entry per input row, in
        order. Rows that are not strings or yield no tokens are passed through
        unchanged.
    """
    mask_token = tokenizer.mask_token
    # Two positions of every window are taken by [CLS] and [SEP].
    window_size = model.config.max_position_embeddings - 2
    generated_sentences = []

    for masked_text in batch[field_name]:
        tokens = tokenizer.tokenize(masked_text) if isinstance(masked_text, str) else []
        if not tokens:
            # Nothing to feed the model (None, empty or whitespace-only text).
            generated_sentences.append(masked_text)
            continue

        for start in range(0, len(tokens), window_size):
            window = tokens[start:start + window_size]
            if mask_token not in window:
                continue  # nothing to predict here, skip the forward pass

            input_ids = tokenizer.convert_tokens_to_ids(
                [tokenizer.cls_token] + window + [tokenizer.sep_token]
            )
            tokens_tensor = torch.tensor([input_ids], device=device)

            # Generate predictions
            with torch.no_grad():
                logits = model(tokens_tensor).logits[0]

            for offset, token in enumerate(window):
                if token != mask_token:
                    continue
                # +1 skips the leading [CLS] position in the model output.
                probs = F.softmax(logits[offset + 1], dim=0)
                # Sample from the softmax to get the prediction
                predicted_id = torch.multinomial(probs, 1).item()
                tokens[start + offset] = tokenizer.convert_ids_to_tokens([predicted_id])[0]

        # Convert the tokens back to a string
        generated_sentences.append(tokenizer.convert_tokens_to_string(tokens))

    return {f'generated_{field_name}': generated_sentences}

In [19]:
# Fill the masks of 'masked_consigne'; the result lands in 'generated_masked_consigne'.
# Pass the `device` chosen when the model was loaded instead of a hard-coded
# "cuda", so the inputs end up where the model is and CPU-only runtimes work too.
generated_dataset = masked_train.map(lambda batch: generate_predictions(batch, 'masked_consigne', device),
                                     batched=True,
                                     batch_size=2
)

Map:   0%|          | 0/1022 [00:00<?, ? examples/s]

In [20]:
# Side-by-side look at the original, masked and generated instruction for the first three rows.
for row in generated_dataset.select(range(3)):
    original, masked, generated = row['consigne'], row['masked_consigne'], row['generated_masked_consigne']
    print(f"Original Consigne: {original}\n")
    print(f"Masked Consigne: {masked}\n")
    print(f"Generated Consigne: {generated}\n")

Original Consigne: Complète les expressions avec le mot qui manque.

Masked Consigne: Complète les expressions avec le mot qui [MASK]

Generated Consigne: Complète les expressions avec le mot qui n

Original Consigne: Recopie les phrases dont les verbes sont conjugués à l’imparfait.

Masked Consigne: Recopie les phrases dont les [MASK] sont conjugués à l’imparfait.

Generated Consigne: Recopie les phrases dont les temps sont conjugués à l [UNK] imparfait .

Original Consigne: Remets les lettres dans le bon ordre pour écrire un mot avec un m devant m, b ou p.

Masked Consigne: Remets les lettres dans le bon ordre pour écrire [MASK] mot avec un m devant m, b ou [MASK]

Generated Consigne: Remets les lettres dans le bon ordre pour écrire le mot avec un m devant m , b ou c



In [21]:
# Update: also fill the masks of 'masked_enonce' (adds 'generated_masked_enonce').
# Use the `device` chosen when the model was loaded rather than a hard-coded
# "cuda", which fails on a runtime without a GPU.
generated_dataset = generated_dataset.map(lambda batch: generate_predictions(batch, 'masked_enonce', device),
                                     batched=True,
                                     batch_size=2
)

Map:   0%|          | 0/1022 [00:00<?, ? examples/s]

In [22]:
# Side-by-side look at the original, masked and generated statement for the first three rows.
for row in generated_dataset.select(range(3)):
    original, masked, generated = row['enonce'], row['masked_enonce'], row['generated_masked_enonce']
    print(f"Original Enonce: {original}\n")
    print(f"Masked Enonce: {masked}\n")
    print(f"Generated Enonce: {generated}\n")

Original Enonce: main ◆ pied ◆ tête ◆ yeux
a. hocher la …
b. cligner des …
c. tendre la …
d. donner un coup de …

Masked Enonce: main ◆ pied [MASK] tête ◆ [MASK] a. hocher la … b. [MASK] des … c. tendre la … d. donner un coup de …

Generated Enonce: main ◆ pied la tête ◆бір a . hocher la [UNK] b . faire des [UNK] c . tendre la [UNK] d . donner un coup de [UNK]

Original Enonce: a. Tu viendras seul au rendez-vous.
b. Nous venions là tous les jours.
c. Où veux-tu aller ?

Masked Enonce: [MASK] Tu viendras seul au rendez-vous. [MASK] Nous venions là tous les jours. c. Où veux-tu aller ?

Generated Enonce: . Tu viendras seul au rendez - vous . " Nous venions là tous les jours . c . Où veux - tu aller ?

Original Enonce: a. o – p – o – m – n – p
b. p – i – o – p – m – r – e
c. m – c – a – a – p – n – e – g
d. p – e – l – e – m – e – x
e. l – e – m – e – b – e – s – n

Masked Enonce: a. o – p – o – m – n – p b. p – i – o – p – m – r – e c. m – c – a – [MASK] – p – n – e – [MASK] [MASK] [MASK

In [23]:
import pandas as pd

# Export every column (original, masked and generated) to a CSV file in the
# current working directory, without the DataFrame index.
OUTPUT_CSV = 'generated_dataset.csv'
df = generated_dataset.to_pandas()
df.to_csv(OUTPUT_CSV, index=False)