In [None]:
!pip install transformers datasets
!sudo apt-get install git-lfs
!pip install apache_beam mwparserfromhell
!pip install creolenltk
!pip install accelerate -U

In [None]:
import os
import re

import torch
from bs4 import BeautifulSoup
from creolenltk.contraction_expansion import ContractionToExpansion
from datasets import concatenate_datasets, load_dataset, Dataset
from google.colab import drive
from huggingface_hub import HfApi
from huggingface_hub import notebook_login
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline

In [None]:
# Start the Hugging Face Hub login flow; in this notebook the call renders the
# login widget captured in the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## 1. Prepare the dataset

### Load the datasets

In [None]:
# Load the "ht" configuration of allenai/c4 (train split) and drop every
# column except "text".
# NOTE(review): the concatenation cell further down combines only `wiki` and
# `custom_dataset`, so `c4` is loaded here but never reaches training —
# confirm whether that is intentional.
c4 = load_dataset("allenai/c4", "ht", split="train")
c4 = c4.remove_columns(
    list(filter(lambda column: column != "text", c4.column_names))
)
c4

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Dataset({
    features: ['text'],
    num_rows: 269174
})

In [None]:
# Load the "20231101.ht" configuration of wikimedia/wikipedia (train split)
# and drop every column except "text".
wiki = load_dataset("wikimedia/wikipedia", "20231101.ht", split="train")
wiki = wiki.remove_columns(
    list(filter(lambda column: column != "text", wiki.column_names))
)
wiki

Dataset({
    features: ['text'],
    num_rows: 70159
})

In [None]:
# Mount Google Drive and read the custom corpus, one entry per line.
drive.mount('/content/gdrive/', force_remount=True)

# Decode explicitly as UTF-8 so accented characters do not depend on the
# runtime's default locale (assumes data.txt is UTF-8 encoded — confirm).
# Iterating the file handle streams the lines instead of first materialising
# them with readlines().
with open("gdrive/MyDrive/Correction Créole Haïtien/data.txt", "r", encoding="utf-8") as f:
    data = [line.rstrip('\n') for line in f]

Mounted at /content/gdrive/


In [None]:
# Wrap the lines read from data.txt in a Dataset with a single "text" column,
# matching the column kept from the other corpora.
custom_dataset = Dataset.from_dict({"text": data})
custom_dataset

Dataset({
    features: ['text'],
    num_rows: 8180
})

In [None]:
# Concatenate the Wikipedia dump and the custom corpus into one dataset.
# NOTE(review): `c4` is loaded in an earlier cell but is not part of this
# list — confirm whether it was left out on purpose.
raw_datasets = concatenate_datasets([wiki, custom_dataset])
raw_datasets

Dataset({
    features: ['text'],
    num_rows: 78339
})

### Preprocessing

In [None]:
# Shared ContractionToExpansion instance, created on first use so that mapping
# preprocess_text over a dataset does not rebuild it for every example.
_contraction_expander = None


def _get_contraction_expander():
    """Return the shared ContractionToExpansion instance, creating it on first use."""
    global _contraction_expander
    if _contraction_expander is None:
        _contraction_expander = ContractionToExpansion()
    return _contraction_expander


def preprocess_text(example, expander=None):
    """Clean the ``'text'`` field of one example and expand its contractions.

    Steps, in order: strip HTML markup, replace every run of characters that
    is not a word character, whitespace or one of ``, . ! ?`` with a single
    space, collapse runs of spaces, trim the ends, then expand contractions.

    Args:
        example: Mapping with a ``'text'`` entry. A ``None`` text is treated
            as an empty string.
        expander: Optional object exposing ``expand_contractions(str) -> str``.
            Defaults to a shared ``ContractionToExpansion`` instance
            (assumes that instance can be reused across calls — confirm).

    Returns:
        The same ``example`` object, with ``example['text']`` replaced by the
        cleaned text.
    """
    text = example['text'] or ''

    # Remove HTML tags: let BeautifulSoup extract the text, then drop anything
    # that still looks like a tag.
    soup = BeautifulSoup(text, 'html.parser')
    text = re.sub(r'<.*?>', '', soup.get_text())

    # Replace everything except word characters (letters, digits, underscore),
    # whitespace and , . ! ? with a space.
    # NOTE(review): apostrophes are replaced here, before contraction
    # expansion runs — confirm the expander recognises the resulting
    # space-separated forms.
    cleaned_text = re.sub(r'[^\w\s,.!? ]+', ' ', text)

    # Collapse runs of spaces only; newlines and tabs are left as they are.
    cleaned_text = re.sub(' +', ' ', cleaned_text)

    # Expand contractions using CreoleNLTK (one shared expander by default).
    if expander is None:
        expander = _get_contraction_expander()
    expanded_sentence = expander.expand_contractions(cleaned_text.strip())

    example['text'] = expanded_sentence

    return example

In [None]:
# Apply preprocess_text to raw_datasets through Dataset.map and keep the
# result under a new name, leaving raw_datasets itself unchanged.
preprocessed_datasets = raw_datasets.map(preprocess_text)
preprocessed_datasets

Map:   0%|          | 0/78339 [00:00<?, ? examples/s]

  soup = BeautifulSoup(text, 'html.parser')


Dataset({
    features: ['text'],
    num_rows: 78339
})

## 2. Train a Tokenizer

In [None]:
# Initialize an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()
data_path = "gdrive/MyDrive/Correction Créole Haïtien/preprocessed_data.txt"

# Tokenizer training and the line-by-line dataset both read the corpus from
# data_path, but no earlier cell writes that file. Export the preprocessed
# texts when it is missing so the notebook runs from a fresh kernel; an
# existing file is left untouched.
# NOTE(review): each example is written followed by a newline; confirm this
# matches how the original preprocessed_data.txt was produced.
if not os.path.exists(data_path):
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    with open(data_path, "w", encoding="utf-8") as corpus_file:
        for text in preprocessed_datasets["text"]:
            corpus_file.write(text + "\n")

paths = [data_path]

In [None]:
# Train the tokenizer on the corpus file(s) listed in `paths`.
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    # Special tokens handed to the trainer alongside the learned vocabulary.
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
# Directory that receives the tokenizer files (vocab.json and merges.txt).
token_dir = 'gdrive/MyDrive/Correction Créole Haïtien/HaiRoBERT'

# exist_ok=True creates the directory when needed and is a no-op when it is
# already there, replacing the separate existence check.
os.makedirs(token_dir, exist_ok=True)

tokenizer.save_model(directory=token_dir)

['gdrive/MyDrive/Correction Créole Haïtien/HaiRoBERT/vocab.json',
 'gdrive/MyDrive/Correction Créole Haïtien/HaiRoBERT/merges.txt']

## 3. Configure the model and build the training dataset

In [None]:
# Model configuration. Only the fields below are set explicitly; every other
# RobertaConfig field keeps its library default.
config = RobertaConfig(
    vocab_size=52_000,  # same value as the vocab_size used to train the tokenizer
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1,
)

In [None]:
# Load the trained tokenizer from the files saved in token_dir.
# `model_max_length` is the keyword that records the maximum sequence length
# on the tokenizer; a plain `max_length` keyword is not recognised at load
# time, so the 512-token limit would not be applied.
tokenizer = RobertaTokenizer.from_pretrained(token_dir, model_max_length=512)

In [None]:
# Build a RoBERTa masked-language model from `config` (no pretrained weights
# are loaded) and place it on the device selected earlier, so the cell also
# runs when no GPU is available instead of failing in .cuda().
model = RobertaForMaskedLM(config=config).to(device)

In [None]:
# Training dataset: the corpus file at data_path is read by
# LineByLineTextDataset and tokenized with the trained tokenizer, using a
# block size of 128.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=data_path,
    block_size=128,
)



In [None]:
# Collator for masked-language-model batches, with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

## 4. Pre-train the model

In [None]:
# Training hyper-parameters; token_dir doubles as the Trainer's output directory.
training_args = TrainingArguments(
    output_dir=token_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

# Wire the model, the MLM collator and the line-by-line dataset into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Run pre-training.
trainer.train()

In [None]:
# Save the trained model into token_dir, the same directory that already
# holds the tokenizer files, so the model and tokenizer can be loaded from one path.
trainer.save_model(token_dir)

## 5. Test the model

In [None]:
# Build a fill-mask pipeline from the model and tokenizer saved in token_dir.
# `pipeline` is imported in the notebook's import cell with the other
# transformers names rather than here.
fill_mask = pipeline(
    'fill-mask',
    model=token_dir,
    tokenizer=token_dir,
)
# Ask the model for candidates to fill the masked word.
fill_mask('Bonjou koman ou <mask>?')

[{'score': 0.028453530743718147,
  'token': 744,
  'token_str': ' di',
  'sequence': 'Bonjou koman ou di?'},
 {'score': 0.021115528419613838,
  'token': 709,
  'token_str': ' ou',
  'sequence': 'Bonjou koman ou ou?'},
 {'score': 0.0176890566945076,
  'token': 323,
  'token_str': ' sa',
  'sequence': 'Bonjou koman ou sa?'},
 {'score': 0.016276126727461815,
  'token': 1817,
  'token_str': ' ye',
  'sequence': 'Bonjou koman ou ye?'},
 {'score': 0.013095790520310402,
  'token': 993,
  'token_str': ' kapab',
  'sequence': 'Bonjou koman ou kapab?'}]

In [None]:
# Second probe: reuse the fill_mask pipeline on another masked sentence.
fill_mask('Li <mask> anle a.')

[{'score': 0.07001323252916336,
  'token': 408,
  'token_str': ' gen',
  'sequence': 'Li gen anle a.'},
 {'score': 0.052880171686410904,
  'token': 1399,
  'token_str': ' ale',
  'sequence': 'Li ale anle a.'},
 {'score': 0.04952032119035721,
  'token': 337,
  'token_str': ' te',
  'sequence': 'Li te anle a.'},
 {'score': 0.02636015973985195,
  'token': 516,
  'token_str': ' genyen',
  'sequence': 'Li genyen anle a.'},
 {'score': 0.0230342298746109,
  'token': 316,
  'token_str': ' se',
  'sequence': 'Li se anle a.'}]