In [1]:
!pip3 install evaluate datasets tokenizers accelerate --q

In [23]:
!nvidia-smi

Tue Dec 12 03:08:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P0    31W /  70W |   4895MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
token = 'hf_AAAAAAAAAAAAAAAAAAAAAAAAA'

In [3]:
!huggingface-cli login --token "$token"

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from datasets import load_dataset

dataset = load_dataset("Ransaka/sinhala-450M-sample")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})

In [6]:
from transformers import AutoModel, AutoTokenizer

In [7]:
sample_text = dataset["train"]['text'][39]
sample_text

'මට මතකයි ඇල්. ඩී. මැන්දිස්ගෙ “පනංගල අයියා“ පොතේ තිබ්බා, අතුරු මිතුරු කියන හැටි අපි අහල තියෙන විදිහට වෙනස් විදිහකට. සමහර විට සිරි කියන්නෙ ඒකද දන්නෑ.'

In [8]:
tokenizer_base = AutoTokenizer.from_pretrained("bert-base-cased")

In [9]:
tokens = tokenizer_base(sample_text).input_ids
[tokenizer_base.decode(id) for id in tokens]

['[CLS]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '.',
 '[UNK]',
 '.',
 '[UNK]',
 '“',
 '[UNK]',
 '[UNK]',
 '“',
 '[UNK]',
 '[UNK]',
 ',',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '.',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '.',
 '[SEP]']

In [10]:
training_corpus = (
    dataset['train'][i : i + 1000]["text"]
    for i in range(0, len(dataset), 1000)
)

In [11]:
# train tokenizer with train_new_from_iterator method
tokenizer = tokenizer_base.train_new_from_iterator(training_corpus, 2_000)

In [12]:
tokens = tokenizer(sample_text).input_ids
[tokenizer.decode(id) for id in tokens]

['[CLS]',
 'මට',
 'මතක',
 '##යි',
 'ඇ',
 '##ල්',
 '.',
 'ඩ',
 '##ී',
 '.',
 'මැ',
 '##න්ද',
 '##ිස්',
 '##ගෙ',
 '“',
 'පන',
 '##ංගල',
 'අයි',
 '##යා',
 '“',
 'පොත',
 '##ේ',
 'තිබ්බ',
 '##ා',
 ',',
 'අතුරු',
 'මිතුර',
 '##ු',
 'කියන',
 'හැටි',
 'අපි',
 'අහ',
 '##ල',
 'තියෙන',
 'විදිහට',
 'වෙනස්',
 'විදිහ',
 '##කට',
 '.',
 'සමහර',
 'විට',
 'සිර',
 '##ි',
 'කියන්නෙ',
 'ඒක',
 '##ද',
 'දන්න',
 '##ෑ',
 '.',
 '[SEP]']

In [13]:
tokenizer.save_pretrained("tokenizer/sinhala-wordpiece-sample")

('tokenizer/sinhala-wordpiece-sample/tokenizer_config.json',
 'tokenizer/sinhala-wordpiece-sample/special_tokens_map.json',
 'tokenizer/sinhala-wordpiece-sample/vocab.txt',
 'tokenizer/sinhala-wordpiece-sample/added_tokens.json',
 'tokenizer/sinhala-wordpiece-sample/tokenizer.json')

In [14]:
from transformers import BertConfig, BertForMaskedLM

config = BertConfig(
    hidden_size = 384,
    vocab_size= tokenizer.vocab_size,
    num_hidden_layers = 6,
    num_attention_heads = 6,
    intermediate_size = 1024,
    max_position_embeddings = 256
)

model = BertForMaskedLM(config=config)
print(model.num_parameters()) #10457864

9302864


In [15]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [16]:
import torch
from torch.utils.data import Dataset
from accelerate import Accelerator, DistributedType

class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer, raw_datasets, max_length: int):
        self.padding = "max_length"
        self.text_column_name = 'text'
        self.max_length = max_length
        self.accelerator = Accelerator(gradient_accumulation_steps=1)
        self.tokenizer = tokenizer

        with self.accelerator.main_process_first():
            self.tokenized_datasets = raw_datasets.map(
                self.tokenize_function,
                batched=True,
                num_proc=4,
                remove_columns=[self.text_column_name],
                desc="Running tokenizer on dataset line_by_line",
            )
            self.tokenized_datasets.set_format('torch',columns=['input_ids'],dtype=torch.long)

    def tokenize_function(self,examples):
        examples[self.text_column_name] = [
            line for line in examples[self.text_column_name] if len(line[0]) > 0 and not line[0].isspace()
        ]
        return self.tokenizer(
            examples[self.text_column_name],
            padding=self.padding,
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=True,
        )
    def __len__(self):
        return len(self.tokenized_datasets)

    def __getitem__(self, i):
        return self.tokenized_datasets[i]

In [17]:
tokenized_dataset_train = LineByLineTextDataset(
    tokenizer= tokenizer,
    raw_datasets = dataset,
    max_length=256, # adjust this based on your requrements
)

Running tokenizer on dataset line_by_line (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./model",
    overwrite_output_dir=True,
    push_to_hub=True,
    hub_model_id="Ransaka/sinhala-bert-yt",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    max_steps=500,
    eval_steps=100,
    logging_steps=100,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    report_to='none',
    hub_private_repo = True,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train['train'],
    eval_dataset= tokenized_dataset_train['train'], # change to your actual evaluation dataset
    )

In [19]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,7.4172,7.251174
200,7.1845,7.121541
300,7.0801,7.04647
400,7.0408,7.016527
500,7.0206,7.002804


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


TrainOutput(global_step=500, training_loss=7.14864013671875, metrics={'train_runtime': 312.1458, 'train_samples_per_second': 102.516, 'train_steps_per_second': 1.602, 'total_flos': 412770186166272.0, 'train_loss': 7.14864013671875, 'epoch': 3.18})

In [20]:
results = trainer.evaluate()

In [22]:
import math

print(f">>> Perplexity: {math.exp(results['eval_loss']):.2f}")

>>> Perplexity: 1102.21
