In [1]:
#!pip3 install datasets transformers sentencepiece

In [2]:
from datasets import *
from transformers import *
from tokenizers import *
import os
import json



In [3]:
import pandas as pd
# Adjust the pandas column-width display option before previewing the corpus.
pd.set_option("colwidth", 0)

# The corpus file has no header row, so the single text column is named explicitly.
corpus_path = "LM_corpus.csv"
df = pd.read_csv(corpus_path, names=["sent"])
df.head()

Unnamed: 0,sent
0,ကံ က ကိုယ်တိုင် ဖန်တီး ရ တာ ပဲ
1,ကံ က ခွဲ ရင် ဘာ မ ဟုတ် တာ လေး နဲ့ လည်း လွဲ ရ တာ ပဲ
2,ကံကြမ္မာ ယုံ လား
3,ကံကြမ္မာ ဆိုတာ အောင်မြင် ဖို့ အခွင့်အရေး တွေ ပေး လေ့ ရှိ ပါ တယ်
4,ကံကြမ္မာ နေ မ ထွက် သေး ခင် မှာ နေမဝင် ပါ ရ စေ နဲ့


In [4]:
# Rename the sentence column to "text".
column_renames = {"sent": "text"}
df = df.rename(columns=column_renames)

In [5]:
# Preview the first five rows after the rename.
df.head(5)

Unnamed: 0,text
0,ကံ က ကိုယ်တိုင် ဖန်တီး ရ တာ ပဲ
1,ကံ က ခွဲ ရင် ဘာ မ ဟုတ် တာ လေး နဲ့ လည်း လွဲ ရ တာ ပဲ
2,ကံကြမ္မာ ယုံ လား
3,ကံကြမ္မာ ဆိုတာ အောင်မြင် ဖို့ အခွင့်အရေး တွေ ပေး လေ့ ရှိ ပါ တယ်
4,ကံကြမ္မာ နေ မ ထွက် သေး ခင် မှာ နေမဝင် ပါ ရ စေ နဲ့


In [6]:
# Number of rows in the corpus.
df.shape[0]

80596

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)

In [8]:
def _write_text_lines(frame, path):
    """Write the "text" column of `frame` to `path`, one sentence per line.

    The files are later read back as raw text lines, so they are written as
    plain UTF-8 text rather than CSV: CSV output wraps any sentence containing
    a comma or a double quote in quote characters, which would then end up in
    the training data.

    Missing values are skipped, and line breaks inside a sentence are replaced
    by a single space so that every sentence occupies exactly one line.
    """
    with open(path, "w", encoding="utf-8") as handle:
        for sentence in frame["text"].dropna():
            handle.write(" ".join(str(sentence).splitlines()) + "\n")


_write_text_lines(train_df, "train.txt")
_write_text_lines(test_df, "test.txt")

In [9]:
# Row counts of the training and test splits.
train_df.shape[0], test_df.shape[0]

(72536, 8060)

# Training the tokenizer

In [10]:
from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast

# Shared settings: vocab_size and max_length are reused when the model config is
# built, and max_length / truncate_longer_samples drive the tokenization cells.
vocab_size = 30522
max_length = 512
truncate_longer_samples = True

# Read the training sentences, one per line. The encoding is passed explicitly
# because the corpus is Burmese text and the platform default encoding is not
# guaranteed to be UTF-8.
with open("train.txt", encoding="utf-8") as f:
    data = [line.rstrip() for line in f]
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]

# Use "[UNK]" as the model's unknown token so that it matches an entry of
# special_tokens; the library default ("<unk>") is not registered in this
# vocabulary, so characters unseen during training could not be mapped to it.
tokenizer = SentencePieceBPETokenizer(unk_token="[UNK]")
tokenizer.train_from_iterator(
    data,
    vocab_size=vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=special_tokens,
)






In [11]:
# Directory that receives the tokenizer files and the training output.
model_path = "pretrained-bert"
# Create it on first use; an already existing directory is left untouched.
os.makedirs(model_path, exist_ok=True)

In [12]:
# Save the trained tokenizer's model files into model_path.
# The list of written file paths returned by save_model is the cell output.
tokenizer.save_model(model_path)

['pretrained-bert/vocab.json', 'pretrained-bert/merges.txt']

In [13]:
# Wrap the trained tokenizer in the transformers fast-tokenizer interface.
fast_tokenizer = PreTrainedTokenizerFast(
    model_max_length=max_length,
    truncation=True,
    tokenizer_object=tokenizer._tokenizer,
)

# Assign the special-token roles; the value returned by add_special_tokens
# is displayed as the cell output.
special_token_roles = {
    'pad_token': '[PAD]',
    'unk_token': '[UNK]',
    'cls_token': '[CLS]',
    'sep_token': '[SEP]',
    'mask_token': '[MASK]',
}
fast_tokenizer.add_special_tokens(special_token_roles)

0

In [14]:
# Encode one sample sentence as a quick check of the trained tokenizer.
sample_sentence = "နေကောင်းလား တကယ် လား"
output = fast_tokenizer.encode(sample_sentence)

In [15]:
# Display the token ids produced for the sample sentence encoded above.
output

[1489, 677, 417, 135]

# Tokenizing the dataset

In [16]:
from datasets import load_dataset

# Load each split file with the line-based "text" loader.
train_files = {'train': 'train.txt'}
test_files = {'test': 'test.txt'}
training_dataset = load_dataset('text', data_files=train_files)
testing_dataset = load_dataset('text', data_files=test_files)

def encode_with_truncation(examples):
    """Tokenize a batch of sentences, truncating and padding to `max_length`.

    `examples` is a batch mapping whose "text" entry holds the sentences.
    The tokenizer is also asked to return the special-tokens mask.
    """
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_special_tokens_mask=True,
    )
    return fast_tokenizer(examples["text"], **tokenizer_options)

def encode_without_truncation(examples):
    """Tokenize a batch of sentences without truncating or padding them.

    `examples` is a batch mapping whose "text" entry holds the sentences.
    The tokenizer is also asked to return the special-tokens mask.
    """
    sentences = examples["text"]
    encoded = fast_tokenizer(sentences, return_special_tokens_mask=True)
    return encoded

# Choose the mapping function according to truncate_longer_samples.
if truncate_longer_samples:
    encode = encode_with_truncation
else:
    encode = encode_without_truncation

# Tokenize both splits in batches.
train_dataset = training_dataset["train"].map(encode, batched=True)
test_dataset = testing_dataset["test"].map(encode, batched=True)

if truncate_longer_samples:
    # Expose only input_ids and attention_mask, returned as PyTorch tensors.
    for split in (train_dataset, test_dataset):
        split.set_format(type="torch", columns=["input_ids", "attention_mask"])
else:
    # Expose the three columns below and keep them as Python lists.
    for split in (train_dataset, test_dataset):
        split.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/72536 [00:00<?, ? examples/s]

Map:   0%|          | 0/8060 [00:00<?, ? examples/s]

In [17]:
# Inspect the first tokenized training example.
train_dataset[0]

{'input_ids': tensor([811, 101, 773, 105,  99, 135,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0, 

In [18]:
# Inspect the first tokenized test example.
test_dataset[0]

{'input_ids': tensor([3495,  824,  105,  203,  436,  158, 3347,  335, 2946,  270,  513,  170,
          136,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0, 

In [19]:
from itertools import chain

def group_texts(examples):
    """Concatenate every column of a batch and cut it into `max_length` chunks.

    Each value of `examples` is a list of sequences. The sequences of a column
    are joined end to end and split into consecutive pieces of `max_length`
    items. When the joined length reaches `max_length`, the trailing remainder
    that does not fill a whole chunk is dropped; a shorter batch is returned as
    a single, shorter chunk. The length is measured on the first column.
    """
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain(*examples[column]))

    first_column = list(examples.keys())[0]
    usable_length = len(flattened[first_column])
    if usable_length >= max_length:
        # Round down to a whole number of chunks.
        usable_length -= usable_length % max_length

    chunk_starts = range(0, usable_length, max_length)
    grouped = {}
    for column, tokens in flattened.items():
        grouped[column] = [tokens[start : start + max_length] for start in chunk_starts]
    return grouped

# Grouping only applies when the samples were tokenized without truncation.
if not truncate_longer_samples:
    grouping_desc = f"Grouping texts in chunks of {max_length}"
    train_dataset = train_dataset.map(group_texts, batched=True, desc=grouping_desc)
    test_dataset = test_dataset.map(group_texts, batched=True, desc=grouping_desc)
    # Switch both splits from Python lists to torch tensors.
    train_dataset.set_format("torch")
    test_dataset.set_format("torch")

In [20]:
# Number of examples in each tokenized split.
train_dataset.num_rows, test_dataset.num_rows

(72536, 8060)

# Loading the Model

In [21]:
# Build a BERT configuration sized for this tokenizer and sequence length,
# then create a masked-language model from that configuration.
model_config = BertConfig(
    max_position_embeddings=max_length,
    vocab_size=vocab_size,
)
model = BertForMaskedLM(model_config)

Generate config GenerationConfig {
  "pad_token_id": 0
}



In [22]:
# Data collator for the Masked Language Modeling (MLM) task.
# It is configured with a masking probability of 0.2 (20% of the tokens).
mask_probability = 0.2
data_collator = DataCollatorForLanguageModeling(
    tokenizer=fast_tokenizer,
    mlm=True,
    mlm_probability=mask_probability,
)

In [23]:
# Collect the training options first, then build TrainingArguments from them.
training_options = dict(
    output_dir=model_path,            # directory that receives model checkpoints
    evaluation_strategy="steps",      # evaluate on a step schedule
    overwrite_output_dir=True,
    num_train_epochs=1,               # number of training epochs, feel free to tweak
    per_device_train_batch_size=10,   # training batch size per device
    gradient_accumulation_steps=8,    # batches accumulated before each weight update
    per_device_eval_batch_size=64,    # evaluation batch size per device
    logging_steps=1000,               # log every 1000 steps
    save_steps=1000,                  # save a checkpoint every 1000 steps
    # load_best_model_at_end=True,    # optionally reload the best model (by loss) after training
    # save_total_limit=10,            # optionally cap how many checkpoints stay on disk
)
training_args = TrainingArguments(**training_options)

using `logging_steps` to initialize `eval_steps` to 1000
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
# Assemble the Trainer from the model, arguments, collator and both splits.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [25]:
# train the model
train_result = trainer.train()

# Persist the final weights. With save_steps=1000 and the 906 optimization
# steps this configuration runs, no intermediate checkpoint is ever written,
# so without this call the trained model would exist only in kernel memory.
trainer.save_model(model_path)

# Keep the training summary as the cell output.
train_result

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: text, special_tokens_mask. If text, special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 72,536
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 8
  Total optimization steps = 906
  Number of trainable parameters = 109,514,298
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=906, training_loss=5.896653602717991, metrics={'train_runtime': 4844.5186, 'train_samples_per_second': 14.973, 'train_steps_per_second': 0.187, 'total_flos': 1.9077084831744e+16, 'train_loss': 5.896653602717991, 'epoch': 1.0})

# Using the Model

In [26]:
from datasets import *
from transformers import *
from tokenizers import *
import os
import json

In [27]:
from tokenizers import SentencePieceBPETokenizer
# Maximum sequence length given to the tokenizer wrapper at inference time.
# NOTE(review): the training section uses max_length = 512, while 256 matches
# the max_position_embeddings reported for the checkpoint loaded further
# down — confirm which training run that checkpoint belongs to.
max_length = 256

model_path = 'pretrained-bert'
# Replace these with the actual paths to your model files
vocab_file = os.path.join(model_path, "vocab.json")
merges_file = os.path.join(model_path, "merges.txt")

# Load the tokenizer. "[UNK]" is passed as the unknown token because that is
# the unknown-token entry this vocabulary was trained with; the library
# default ("<unk>") is not part of it.
tokenizer = SentencePieceBPETokenizer(vocab_file, merges_file, unk_token="[UNK]")

In [28]:
# Wrap the reloaded tokenizer in the transformers fast-tokenizer interface.
fast_tokenizer = PreTrainedTokenizerFast(
    model_max_length=max_length,
    truncation=True,
    tokenizer_object=tokenizer._tokenizer,
)

# Assign the special-token roles; the value returned by add_special_tokens
# is displayed as the cell output.
special_token_roles = {
    'pad_token': '[PAD]',
    'unk_token': '[UNK]',
    'cls_token': '[CLS]',
    'sep_token': '[SEP]',
    'mask_token': '[MASK]',
}
fast_tokenizer.add_special_tokens(special_token_roles)

5

In [29]:
# Load the masked-language model weights from a saved checkpoint directory.
# NOTE(review): the training run shown in this notebook has 906 optimization
# steps with save_steps=1000, so it does not create "checkpoint-45000" —
# confirm this directory exists before running.
checkpoint_dir = os.path.join(model_path, "checkpoint-45000")
model = BertForMaskedLM.from_pretrained(checkpoint_dir)

# Build a fill-mask pipeline from the loaded model and the tokenizer wrapper.
fill_mask = pipeline("fill-mask", model=model, tokenizer=fast_tokenizer)

loading configuration file pretrained-bert/checkpoint-45000/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 256,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.35.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pretrained-bert/checkpoint-45000/model.safetensors
Generate config GenerationConfig {
  "pad_token_id": 0
}

All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at pretrained-bert/checkpoint-4

In [30]:
# Fill the masked position of one sentence and print every candidate.
example = "ထမင်း [MASK] ပြီး ပြီ လား"
predictions = fill_mask(example)
for prediction in predictions:
    print(prediction)

{'score': 0.6793836355209351, 'token': 225, 'token_str': 'စား', 'sequence': 'ထမင်း  စား ပြီး ပြီ လား'}
{'score': 0.06020977720618248, 'token': 494, 'token_str': 'ချက်', 'sequence': 'ထမင်း  ချက် ပြီး ပြီ လား'}
{'score': 0.04290827736258507, 'token': 447, 'token_str': 'သောက်', 'sequence': 'ထမင်း  သောက် ပြီး ပြီ လား'}
{'score': 0.017751095816493034, 'token': 513, 'token_str': 'ထည့်', 'sequence': 'ထမင်း  ထည့် ပြီး ပြီ လား'}
{'score': 0.015311525203287601, 'token': 742, 'token_str': 'ကျွေး', 'sequence': 'ထမင်း  ကျွေး ပြီး ပြီ လား'}


In [31]:
# Fill the masked position of several sentences, printing each candidate
# sequence with its score and a separator line after every sentence.
examples = [
    "ထမင်း [MASK] ပြီး ပြီ လား",
    "ဈေးဝယ် [MASK] ခဲ့ လား",
    "ကျေးဇူး အများကြီး [MASK] ပါ တယ်",
    "ကြိုးကြိုးစားစား အလုပ်လုပ် [MASK] လား",
    "[MASK] ခွက် ထဲ မှာ ပဲ ကော်ဖီ ရှိ နေ လို့",
]
separator = "=" * 50
for example in examples:
    for prediction in fill_mask(example):
        sequence = prediction['sequence']
        score = prediction['score']
        print(f"{sequence}, confidence: {score}")
    print(separator)

ထမင်း  စား ပြီး ပြီ လား, confidence: 0.6793836355209351
ထမင်း  ချက် ပြီး ပြီ လား, confidence: 0.06020977720618248
ထမင်း  သောက် ပြီး ပြီ လား, confidence: 0.04290827736258507
ထမင်း  ထည့် ပြီး ပြီ လား, confidence: 0.017751095816493034
ထမင်း  ကျွေး ပြီး ပြီ လား, confidence: 0.015311525203287601
ဈေးဝယ်  လာ ခဲ့ လား, confidence: 0.6065731048583984
ဈေးဝယ်  ဝယ် ခဲ့ လား, confidence: 0.07704158127307892
ဈေးဝယ်  သွား ခဲ့ လား, confidence: 0.02894228883087635
ဈေးဝယ်  ထွက် ခဲ့ လား, confidence: 0.01704169623553753
ဈေးဝယ်  ပြန် ခဲ့ လား, confidence: 0.0162653811275959
ကျေးဇူး အများကြီး  တင် ပါ တယ်, confidence: 0.9796635508537292
ကျေးဇူး အများကြီး  ရှိ ပါ တယ်, confidence: 0.006047429051250219
ကျေးဇူး အများကြီး  ပါ ပါ တယ်, confidence: 0.0007229391485452652
ကျေးဇူး အများကြီး  အထူး ပါ တယ်, confidence: 0.00059018365573138
ကျေးဇူး အများကြီး  ကျန် ပါ တယ်, confidence: 0.00043052074033766985
ကြိုးကြိုးစားစား အလုပ်လုပ်  တာ လား, confidence: 0.5289212465286255
ကြိုးကြိုးစားစား အလုပ်လုပ်  ရ လား, confidence: 0.197596