In [3]:
# Install the libraries this notebook relies on: transformers (with its torch
# extra), an upgraded accelerate (-U), and datasets.
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [1]:
from transformers import TextDataset, DataCollatorForLanguageModeling

def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over a text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Value forwarded as ``block_size`` (defaults to 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

def load_data_collator(tokenizer):
    """Create the masked-language-modeling collator used for training.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.

    Returns:
        A ``DataCollatorForLanguageModeling`` configured with ``mlm=True`` and
        ``mlm_probability=0.15``.
    """
    collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    return collator



In [2]:
from transformers import Trainer, TrainingArguments, CamembertTokenizer, CamembertForMaskedLM



# Load the pretrained French tokenizer and masked-LM model.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertForMaskedLM.from_pretrained('camembert-base')

# Build the training set from the article corpus and the masking collator.
dataset = load_dataset('Articles_EcoMatin.txt', tokenizer)
data_collator = load_data_collator(tokenizer)

training_args = TrainingArguments(
    output_dir="./camembert",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

train_output = trainer.train()

# save_steps=10_000 is far larger than the number of optimisation steps of a
# short run like this one, so the periodic checkpointing cannot be relied on
# to leave the fine-tuned weights in output_dir. Persist the model and the
# tokenizer explicitly so the result of the training is not lost.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

# Keep the training summary as the cell's displayed value.
train_output


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Step,Training Loss


TrainOutput(global_step=17, training_loss=2.9339148577521827, metrics={'train_runtime': 514.9068, 'train_samples_per_second': 0.52, 'train_steps_per_second': 0.033, 'total_flos': 17635027694592.0, 'train_loss': 2.9339148577521827, 'epoch': 1.0})

In [3]:
import torch
import string
from transformers import CamembertTokenizer, CamembertForMaskedLM

def load_french_model(model_name):
    """Load the pretrained French tokenizer and masked-LM model.

    Args:
        model_name: Name of the model to load; only ``"camembert"``
            (case-insensitive) is supported.

    Returns:
        A ``(tokenizer, model)`` tuple, with the model switched to eval mode.

    Raises:
        ValueError: If ``model_name`` is not a supported model name.
    """
    # Fail loudly instead of printing and returning None: callers unpack the
    # result into (tokenizer, model), so a None return only resurfaced later
    # as an unrelated "cannot unpack" TypeError.
    if model_name.lower() != "camembert":
        raise ValueError(
            f"Unsupported model name: {model_name!r} (expected 'camembert')"
        )
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    model = CamembertForMaskedLM.from_pretrained('camembert-base').eval()
    return tokenizer, model

def decode(tokenizer, pred_idx, top_clean):
    """Turn predicted token ids into a newline-separated list of words.

    Each id is decoded with ``tokenizer.decode`` and stripped of all
    whitespace. Empty strings, special tokens and tokens made only of
    punctuation are skipped; any ``'##'`` marker is removed from the rest.

    Args:
        tokenizer: Object exposing ``decode(token_id)``. If it also exposes
            ``all_special_tokens``, those tokens are skipped as well.
        pred_idx: Iterable of candidate token ids, best first.
        top_clean: Maximum number of cleaned tokens to return.

    Returns:
        The first ``top_clean`` kept tokens joined with ``'\\n'``.
    """
    punctuation_chars = set(string.punctuation)
    # Exact-match set. The previous substring test against one long string
    # also discarded legitimate tokens such as "A", "P", "D" or "PAD".
    special_tokens = {'[PAD]'}
    special_tokens.update(getattr(tokenizer, 'all_special_tokens', None) or ())

    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if not token or token in special_tokens:
            continue
        if all(ch in punctuation_chars for ch in token):
            continue
        tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])

def encode(tokenizer, text_sentence, add_special_tokens=True):
    """Tokenize a sentence containing a mask and locate the mask position.

    ``'<mask>'`` in the text is replaced by ``tokenizer.mask_token``. When the
    mask is the last whitespace-separated word, ``' .'`` is appended before
    encoding.

    Args:
        tokenizer: Object exposing ``mask_token``, ``mask_token_id`` and
            ``encode(text, add_special_tokens=...)``.
        text_sentence: Input text containing a ``'<mask>'`` placeholder.
        add_special_tokens: Forwarded to ``tokenizer.encode``.

    Returns:
        ``(input_ids, mask_idx)``: a tensor holding one row of token ids, and
        the column index of the first mask token in that row.
    """
    sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    words = sentence.split()
    if words[-1] == tokenizer.mask_token:
        sentence = sentence + ' .'

    token_ids = tokenizer.encode(sentence, add_special_tokens=add_special_tokens)
    input_ids = torch.tensor([token_ids])
    # torch.where yields (row_indices, column_indices); only columns matter here.
    mask_columns = torch.where(input_ids == tokenizer.mask_token_id)[1]
    first_mask = mask_columns.tolist()[0]
    return input_ids, first_mask

def get_all_predictions(tokenizer, model, text_sentence, top_clean=5, top_k=None):
    """Predict the most likely words for the mask in ``text_sentence``.

    Args:
        tokenizer: Tokenizer passed on to ``encode`` and ``decode``.
        model: Callable returning a sequence whose first item is the score
            tensor indexed as ``[batch, position, vocabulary]``.
        text_sentence: Text containing the mask placeholder.
        top_clean: Maximum number of cleaned words to return.
        top_k: Number of raw candidates to take before cleaning. Defaults to
            a heuristic ``2 * top_clean + 10`` so that candidates dropped by
            ``decode`` do not shorten the result.

    Returns:
        Newline-separated predicted words, as produced by ``decode``.
    """
    input_ids, mask_idx = encode(tokenizer, text_sentence)
    with torch.no_grad():
        predict = model(input_ids)[0]
    mask_scores = predict[0, mask_idx, :]

    # topk and list slicing both need a real int (UI widgets may hand over
    # an integral float).
    top_clean = int(top_clean)
    # The original referenced an undefined ``top_k`` (NameError on every
    # call); it is now an optional parameter with a sensible default.
    if top_k is None:
        top_k = 2 * top_clean + 10
    # Never ask for fewer than top_clean nor for more than the vocabulary has.
    top_k = min(max(int(top_k), top_clean), mask_scores.shape[-1])

    candidate_ids = mask_scores.topk(top_k).indices.tolist()
    return decode(tokenizer, candidate_ids, top_clean)



In [4]:
# Install Gradio, which the demo cell below imports as `gr`.
!pip install gradio

Collecting gradio
  Downloading gradio-4.37.2-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.0.2 (from gradio)
  Downloading gradio_client-1.0.2-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.2/318.2 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [5]:
import torch
import string
import gradio as gr
from transformers import CamembertTokenizer, CamembertForMaskedLM

# Mapping from the model names shown in the UI to the real model names.
# NOTE(review): the only displayed name, "LSTM", resolves to "camembert" —
# confirm that this label is intentional.
model_names = dict(LSTM="camembert")

# Loaded (tokenizer, model) pairs keyed by real model name, so that repeated
# calls do not reload the pretrained weights every time.
_FRENCH_MODEL_CACHE = {}


def load_french_model(model_name):
    """Return the tokenizer and masked-LM model behind a displayed model name.

    The displayed name is resolved through ``model_names``; names missing
    from that mapping fall back to ``"camembert"``.

    Args:
        model_name: Model name as displayed to the user.

    Returns:
        A ``(tokenizer, model)`` tuple, with the model switched to eval mode.
        The same pair is returned on later calls for the same real model.

    Raises:
        ValueError: If the resolved model name is not supported.
    """
    # The CamemBERT model is used in the background whatever name is displayed.
    real_model_name = model_names.get(model_name, "camembert")
    # Fail loudly instead of printing and returning None: callers unpack the
    # result into (tokenizer, model), so a None return only resurfaced later
    # as an unrelated "cannot unpack" TypeError.
    if real_model_name != "camembert":
        raise ValueError(f"Unsupported model: {real_model_name!r}")

    if real_model_name not in _FRENCH_MODEL_CACHE:
        tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
        model = CamembertForMaskedLM.from_pretrained('camembert-base').eval()
        _FRENCH_MODEL_CACHE[real_model_name] = (tokenizer, model)
    return _FRENCH_MODEL_CACHE[real_model_name]

def decode(tokenizer, pred_idx, top_clean):
    """Turn predicted token ids into a newline-separated list of words.

    Each id is decoded with ``tokenizer.decode`` and stripped of all
    whitespace. Empty strings, special tokens and tokens made only of
    punctuation are skipped; any ``'##'`` marker is removed from the rest.

    Args:
        tokenizer: Object exposing ``decode(token_id)``. If it also exposes
            ``all_special_tokens``, those tokens are skipped as well.
        pred_idx: Iterable of candidate token ids, best first.
        top_clean: Maximum number of cleaned tokens to return.

    Returns:
        The first ``top_clean`` kept tokens joined with ``'\\n'``.
    """
    punctuation_chars = set(string.punctuation)
    # Exact-match set. The previous substring test against one long string
    # also discarded legitimate tokens such as "A", "P", "D" or "PAD".
    special_tokens = {'[PAD]'}
    special_tokens.update(getattr(tokenizer, 'all_special_tokens', None) or ())

    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if not token or token in special_tokens:
            continue
        if all(ch in punctuation_chars for ch in token):
            continue
        tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])

def encode(tokenizer, text_sentence, add_special_tokens=True):
    """Tokenize a sentence containing a mask and locate the mask position.

    ``'<mask>'`` in the text is replaced by ``tokenizer.mask_token``. When the
    mask is the last whitespace-separated word, ``' .'`` is appended before
    encoding.

    Args:
        tokenizer: Object exposing ``mask_token``, ``mask_token_id`` and
            ``encode(text, add_special_tokens=...)``.
        text_sentence: Input text containing a ``'<mask>'`` placeholder.
        add_special_tokens: Forwarded to ``tokenizer.encode``.

    Returns:
        ``(input_ids, mask_idx)``: a tensor holding one row of token ids, and
        the column index of the first mask token in that row.
    """
    sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    words = sentence.split()
    if words[-1] == tokenizer.mask_token:
        sentence = sentence + ' .'

    token_ids = tokenizer.encode(sentence, add_special_tokens=add_special_tokens)
    input_ids = torch.tensor([token_ids])
    # torch.where yields (row_indices, column_indices); only columns matter here.
    mask_columns = torch.where(input_ids == tokenizer.mask_token_id)[1]
    first_mask = mask_columns.tolist()[0]
    return input_ids, first_mask

def get_all_predictions(tokenizer, model, text_sentence, top_clean=5, top_k=None):
    """Predict the most likely words for the mask in ``text_sentence``.

    Args:
        tokenizer: Tokenizer passed on to ``encode`` and ``decode``.
        model: Callable returning a sequence whose first item is the score
            tensor indexed as ``[batch, position, vocabulary]``.
        text_sentence: Text containing the mask placeholder.
        top_clean: Maximum number of cleaned words to return.
        top_k: Number of raw candidates to take before cleaning. Defaults to
            a heuristic ``2 * top_clean + 10``.

    Returns:
        Newline-separated predicted words, as produced by ``decode``.
    """
    input_ids, mask_idx = encode(tokenizer, text_sentence)
    with torch.no_grad():
        predict = model(input_ids)[0]
    mask_scores = predict[0, mask_idx, :]

    # topk and list slicing both need a real int (UI widgets may hand over
    # an integral float).
    top_clean = int(top_clean)
    # decode() discards some candidates (e.g. punctuation), so requesting
    # exactly top_clean raw candidates could return fewer words than asked
    # for, or none at all. Over-fetch to leave headroom for that filtering.
    if top_k is None:
        top_k = 2 * top_clean + 10
    # Never ask for fewer than top_clean nor for more than the vocabulary has.
    top_k = min(max(int(top_k), top_clean), mask_scores.shape[-1])

    candidate_ids = mask_scores.topk(top_k).indices.tolist()
    predictions = decode(tokenizer, candidate_ids, top_clean)
    return predictions

# Gradio Interface
def predict_next_word(input_text, top_k, model_name):
    """Suggest next words for the given text.

    Loads the tokenizer and model for ``model_name``, appends ``' <mask>'``
    to the text and returns the predictions for that mask.

    Args:
        input_text: Text typed by the user.
        top_k: Number of words requested; forwarded as ``top_clean``.
        model_name: Model name passed to ``load_french_model``.

    Returns:
        Whatever ``get_all_predictions`` returns for the masked text.
    """
    tokenizer, model = load_french_model(model_name)
    masked_text = input_text + ' <mask>'
    return get_all_predictions(tokenizer, model, masked_text, top_clean=top_k)

# Input widgets, in the order predict_next_word expects its arguments:
# the text, the number of words wanted, and the model to use.
text_input = gr.Textbox(lines=2, placeholder="Enter your text here", label="Enter your text here")
word_count_input = gr.Slider(minimum=1, maximum=25, value=1, step=1, label="How many words do you need")
model_input = gr.Dropdown(choices=['LSTM'], value='LSTM', label="Select Model to Apply")

# Wire the prediction function to the widgets; the result is shown as text.
interface = gr.Interface(
    fn=predict_next_word,
    inputs=[text_input, word_count_input, model_input],
    outputs="text",
    title="Next Word Prediction",
)

interface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://16e023113a261ae923.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


