In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
!nvidia-smi

Tue Jan 16 19:16:00 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A6000               On  | 00000000:01:00.0 Off |                  Off |
| 36%   66C    P2             205W / 300W |  17509MiB / 49140MiB |     55%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000               On  | 00000000:41:00.0 Off |  

In [3]:
from datasets import load_dataset

# wiki dataset: https://huggingface.co/datasets/wikipedia
dataset = load_dataset("wikipedia", "20220301.simple")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 205328
    })
})

In [5]:
from transformers import AutoModel, AutoTokenizer

In [6]:
sample_text = dataset["train"]['text'][39]
sample_text

'A boot is a type of footwear that protects the foot and ankle. Boots are higher and larger than shoes and sandals. Some boots are high enough to protect the calves (lower part of the leg) as well.  Some boots are held on with bootstraps or bootlaces.  Some also have spats or gaiters to keep water out.  Most have a very strong boot sole, the bottom part of a boot.\n\nTypes of boots \n Rain boots (or rubber boots) are made from rubber or plastic. Rain boots protect a person\'s feet from water and rain. People who work on fishing boats and farmers wear rubber boots to keep their feet dry. People who work in chemical factories wear rubber boots to protect their feet from dangerous chemicals.\n\n Winter boots are boots that keep a person\'s feet warm in cold weather. People in cold countries such as Canada and Sweden wear winter boots during the cold season. Winter boots can be made from many different materials, such as leather, fabric, or plastic. Winter boots are insulated with wool or 

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
tokens = tokenizer(sample_text).input_ids
[tokenizer.decode(id) for id in tokens]

['[CLS]',
 'A',
 'boot',
 'is',
 'a',
 'type',
 'of',
 'foot',
 '##wear',
 'that',
 'protects',
 'the',
 'foot',
 'and',
 'ankle',
 '.',
 'Boots',
 'are',
 'higher',
 'and',
 'larger',
 'than',
 'shoes',
 'and',
 'sand',
 '##als',
 '.',
 'Some',
 'boots',
 'are',
 'high',
 'enough',
 'to',
 'protect',
 'the',
 'ca',
 '##lves',
 '(',
 'lower',
 'part',
 'of',
 'the',
 'leg',
 ')',
 'as',
 'well',
 '.',
 'Some',
 'boots',
 'are',
 'held',
 'on',
 'with',
 'boots',
 '##tra',
 '##ps',
 'or',
 'boot',
 '##lace',
 '##s',
 '.',
 'Some',
 'also',
 'have',
 'spat',
 '##s',
 'or',
 'g',
 '##ait',
 '##ers',
 'to',
 'keep',
 'water',
 'out',
 '.',
 'Most',
 'have',
 'a',
 'very',
 'strong',
 'boot',
 'sole',
 ',',
 'the',
 'bottom',
 'part',
 'of',
 'a',
 'boot',
 '.',
 'Type',
 '##s',
 'of',
 'boots',
 'Rain',
 'boots',
 '(',
 'or',
 'rubber',
 'boots',
 ')',
 'are',
 'made',
 'from',
 'rubber',
 'or',
 'plastic',
 '.',
 'Rain',
 'boots',
 'protect',
 'a',
 'person',
 "'",
 's',
 'feet',
 'from',

In [10]:
from transformers import BertConfig, BertForMaskedLM

config = BertConfig(
    hidden_size = 384,
    vocab_size= tokenizer.vocab_size,
    num_hidden_layers = 6,
    num_attention_heads = 6,
    intermediate_size = 1024,
    max_position_embeddings = 256
)

model = BertForMaskedLM(config=config)
print(model.num_parameters()) #10457864

19696324


In [11]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
import torch
from torch.utils.data import Dataset
from accelerate import Accelerator, DistributedType

class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer, raw_datasets, max_length: int):
        self.padding = "max_length"
        self.text_column_name = 'text'
        self.max_length = max_length
        self.accelerator = Accelerator(gradient_accumulation_steps=1)
        self.tokenizer = tokenizer

        with self.accelerator.main_process_first():
            self.tokenized_datasets = raw_datasets.map(
                self.tokenize_function,
                batched=True,
                num_proc=4,
                remove_columns=[self.text_column_name],
                desc="Running tokenizer on dataset line_by_line",
            )
            self.tokenized_datasets.set_format('torch',columns=['input_ids'],dtype=torch.long)

    def tokenize_function(self,examples):
        examples[self.text_column_name] = [
            line for line in examples[self.text_column_name] if len(line[0]) > 0 and not line[0].isspace()
        ]
        return self.tokenizer(
            examples[self.text_column_name],
            padding=self.padding,
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=True,
        )
    def __len__(self):
        return len(self.tokenized_datasets)

    def __getitem__(self, i):
        return self.tokenized_datasets[i]

In [14]:
tokenized_dataset_train = LineByLineTextDataset(
    tokenizer= tokenizer,
    raw_datasets = dataset,
    max_length=256, # adjust this based on your requrements
)

Running tokenizer on dataset line_by_line (num_proc=4):   3%|▎         | 6000/205328 [00:01<00:54, 3661.31 examples/s]


ArrowInvalid: Column 3 named input_ids expected length 1000 but got length 998

In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./model",
    overwrite_output_dir=True,
    push_to_hub=True,
    hub_model_id="Ransaka/sinhala-bert-yt",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    max_steps=500,
    eval_steps=100,
    logging_steps=100,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    report_to='none',
    hub_private_repo = True,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train['train'],
    eval_dataset= tokenized_dataset_train['train'], # change to your actual evaluation dataset
    )

In [19]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,7.4172,7.251174
200,7.1845,7.121541
300,7.0801,7.04647
400,7.0408,7.016527
500,7.0206,7.002804


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


TrainOutput(global_step=500, training_loss=7.14864013671875, metrics={'train_runtime': 312.1458, 'train_samples_per_second': 102.516, 'train_steps_per_second': 1.602, 'total_flos': 412770186166272.0, 'train_loss': 7.14864013671875, 'epoch': 3.18})

In [20]:
results = trainer.evaluate()

In [22]:
import math

print(f">>> Perplexity: {math.exp(results['eval_loss']):.2f}")

>>> Perplexity: 1102.21
