Reference :  
https://github.com/sleepingcat4/bert-textgeneration  

https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb

## Import the libraries

In [None]:
!pip install datasets
!pip install accelerate
!pip install sentencepiece
!pip install --upgrade huggingface_hub
!pip -q install git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from transformers import AutoModelForSequenceClassification, CamembertForMaskedLM, AutoTokenizer, AutoConfig

In [10]:
camembert = CamembertForMaskedLM.from_pretrained('camembert-base')

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
tokenizer = AutoTokenizer.from_pretrained('camembert-base')

## Load CSV File

View [Source](https://stackoverflow.com/questions/76001128/splitting-dataset-into-train-test-and-validation-using-huggingface-datasets-fun)

In [3]:
from datasets import *

data_path = 'exos_to_csv.csv'
dataset = load_dataset('csv', data_files=data_path)

In [4]:
# Split the dataset into 3 sets for train, test and validation
train_testsplit = dataset['train'].train_test_split(test_size=0.2)
test_validsplit = train_testsplit['test'].train_test_split(test_size=0.5)

In [6]:
ds = DatasetDict({
    'train': train_testsplit['train'],
    'test': test_validsplit['test'],
    'valid': test_validsplit['train']
})

In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['filename', 'consigne', 'enonce'],
        num_rows: 1022
    })
    test: Dataset({
        features: ['filename', 'consigne', 'enonce'],
        num_rows: 128
    })
    valid: Dataset({
        features: ['filename', 'consigne', 'enonce'],
        num_rows: 128
    })
})

Create a random sample

In [8]:
sample = ds["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Enonce: {row['enonce']}'")


'>>> Enonce: a. On forme toujours le féminin des adjectifs qualificatifs en ajoutant en -e.
b. court, long et fidèle sont des adjectifs masculins.
c. autoritaire, splendide et sage sont des adjectifs féminins ou masculins.
d. Au féminin, l’adjectif naïf devient naïve.
e. Au masculin, gauchère devient gaucheur.'

'>>> Enonce: a. … pouvions • … allais • … disiez
b. … disais • … allaient • … pouvait
c. … prenais • … faisais • … voyions
d. … venais • … allions • … vouliez'

'>>> Enonce: a. rome est la capitale de l’italie.
b. laetitia et natacha sont amies.
c. Les châteaux de la loire sont magnifiques.
d. La seine, le rhône, la loire, la garonne sont les quatre grands fleuves français.'


For both auto-regressive and masked language modeling, a common preprocessing step is to concatenate all the examples and then split the whole corpus into chunks of equal size.

In [14]:
def tokenize_function(examples):
    result = tokenizer(examples["enonce"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

In [15]:
# Use batched=True to activate fast multithreading!
tokenized_datasets = ds.map(
    tokenize_function, batched=True, remove_columns=["filename", "consigne", "enonce"]
)
tokenized_datasets

Map:   0%|          | 0/1022 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 1022
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 128
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 128
    })
})

In [33]:
tokenized_datasets['train'][0]['input_ids']

[5,
 877,
 271,
 4191,
 38,
 177,
 53,
 221,
 7,
 150,
 38,
 443,
 53,
 28,
 3438,
 15119,
 5345,
 35,
 9,
 659,
 38,
 177,
 53,
 18961,
 11512,
 5077,
 9,
 659,
 38,
 443,
 53,
 28,
 22431,
 9,
 659,
 38,
 177,
 53,
 104,
 26,
 177,
 93,
 668,
 83,
 6]

In [34]:
tokenized_datasets['train'][0]['word_ids']

[None,
 0,
 1,
 2,
 3,
 3,
 3,
 4,
 4,
 5,
 6,
 6,
 6,
 7,
 8,
 8,
 9,
 9,
 9,
 10,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 13,
 14,
 14,
 14,
 15,
 16,
 16,
 17,
 18,
 18,
 18,
 19,
 19,
 19,
 20,
 21,
 22,
 None]

Inspecting the `model_max_length` attribute of the tokenizer:

In [16]:
tokenizer.model_max_length

512

In [28]:
tokenizer.is_fast

True

In [29]:
example = "Qu'est-ce que je peux faire pour vous?"
encoding = tokenizer(example)
print(type(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [30]:
encoding.is_fast

True

In [31]:
encoding.tokens()

['<s>',
 '▁Qu',
 "'",
 'est',
 '-',
 'ce',
 '▁que',
 '▁je',
 '▁peux',
 '▁faire',
 '▁pour',
 '▁vous',
 '?',
 '</s>']

In [32]:
encoding.word_ids()

[None, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 6, None]

In [39]:
chunk_size = 32

In [19]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Enonce {idx} length: {len(sample)}'")

'>>> Enonce 0 length: 45'
'>>> Enonce 1 length: 31'
'>>> Enonce 2 length: 80'


In [21]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated length: {total_length}'")

'>>> Concatenated length: 156'


In [40]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 32'
'>>> Chunk length: 32'
'>>> Chunk length: 32'
'>>> Chunk length: 32'
'>>> Chunk length: 28'


In [41]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [42]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1695
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 214
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 221
    })
})

In [43]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'fusée. Tu (être) peut-être même président!</s><s> a. un chronomètre b. un cadran solaire c. une montre à gousset d'

In [44]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

'fusée. Tu (être) peut-être même président!</s><s> a. un chronomètre b. un cadran solaire c. une montre à gousset d'

In [45]:
def group_texts(examples, chunk_size=chunk_size, mask_token_id=tokenizer.mask_token_id):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Mask the last token of each chunk
    for i in range(len(result["input_ids"])):
        result["input_ids"][i][-1] = mask_token_id
    
    # Create labels for MLM
    result["labels"] = [x[:-1] + [x[-1]] for x in result["input_ids"]]  # Copy all ids
    for chunk in result["labels"]:
        # Set all tokens to -100 except for the last one which is masked
        chunk[:-1] = [-100] * (len(chunk) - 1)

    return result

In [46]:
test_part = tokenized_datasets.map(group_texts, batched=True)
test_part

Map:   0%|          | 0/1022 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1695
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 214
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 221
    })
})

In [47]:
tokenizer.decode(test_part["train"][1]["input_ids"])

'fusée. Tu (être) peut-être même président!</s><s> a. un chronomètre b. un cadran solaire c. une montre à gousset<mask>'

In [48]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

'fusée. Tu (être) peut-être même président!</s><s> a. un chronomètre b. un cadran solaire c. une montre à gousset d'