<a href="https://colab.research.google.com/github/marzinouri/AzeriPipeline/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Acknowledgment: This portion of the code is based on the work available at [The Python Code](https://www.thepythoncode.com/article/pretraining-bert-huggingface-transformers-in-python).


In [None]:
# Mount Google Drive so the corpus and model paths under /content/drive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install and Import Requirements

In [None]:
%%capture
!pip install datasets
!pip install tokenizers
!pip install transformers
!pip install sentencepiece

In [None]:
from datasets import *
from transformers import *
from tokenizers import *
import os
import json
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
import math



# Load Data

In [None]:
def load_data_to_df(path):
    """
    Load a line-oriented text file into a DataFrame.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped.

    Args:
        path: Path of the text file to read. The file is decoded as UTF-8.

    Returns:
        pandas.DataFrame with a single ``text`` column holding one stripped,
        non-empty line per row, in file order.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding, and stream the file instead of
    # materialising every raw line first.
    with open(path, "r", encoding="utf-8") as f:
        sents = [stripped for stripped in (line.strip() for line in f) if stripped]

    # Build the frame in one step rather than assigning into an empty one.
    return pd.DataFrame({"text": sents})

In [None]:
# Monolingual corpus on the mounted Drive, read line by line into a DataFrame with a `text` column.
src_path = "/content/drive/MyDrive/Azari/Preprocessed_Datasets/Monolingual/ALL_v4.txt"
df = load_data_to_df(src_path)

In [None]:
from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset; the bare last expression displays its summary.
azeri_dataset = Dataset.from_pandas(df)
azeri_dataset

Dataset({
    features: ['text'],
    num_rows: 1323130
})

In [None]:
# Hold out 10% of the sentences for evaluation. A fixed seed keeps the split
# identical across kernel restarts, so a checkpoint reloaded in a later
# session is evaluated on the same held-out sentences rather than on a fresh
# random split that may overlap with what it was trained on.
SPLIT_SEED = 42
data_train_test = azeri_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
data_train, data_test = data_train_test["train"], data_train_test["test"]

data_train, data_test

(Dataset({
     features: ['text'],
     num_rows: 1190817
 }), Dataset({
     features: ['text'],
     num_rows: 132313
 }))

In [None]:
def dataset_to_text(dataset, output_filename="data.txt"):
  """Write every entry of the dataset's "text" column to a file, one per line.

  Useful for training the tokenizer, which accepts text files as input.

  Args:
    dataset: Object whose ``dataset["text"]`` yields the strings to write.
    output_filename: Destination path; an existing file is overwritten.
  """
  # Write UTF-8 explicitly so the output does not depend on the platform's
  # default locale encoding.
  with open(output_filename, "w", encoding="utf-8") as f:
    for t in dataset["text"]:
      print(t, file=f)

# Dump both splits to plain-text files in the working directory; train.txt is later listed in `files` for tokenizer training.
dataset_to_text(data_train, "train.txt")
dataset_to_text(data_test, "test.txt")

# Tokenizer

In [None]:
from pathlib import Path
from tokenizers import BertWordPieceTokenizer

# Special tokens passed to tokenizer training.
# NOTE(review): "<S>" and "<T>" are not referenced again in the visible cells — confirm what they are for.
special_tokens = [
  "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"
]

# Text files the tokenizer is trained on (the training split only).
files = ["train.txt"]
# Target vocabulary size; also passed to BertConfig when the model is built.
vocab_size = 10000
# Sequence length used for tokenizer truncation/padding and as the model's max_position_embeddings.
max_length = 64
# True: pad/truncate every sentence to max_length; False: tokenize untruncated and regroup into max_length chunks.
truncate_longer_samples = True

In [None]:
# initialize the WordPiece tokenizer (constructor defaults, no arguments passed)
tokenizer = BertWordPieceTokenizer()
# train the tokenizer on the training-split text file(s) with the configured vocabulary size and special tokens
tokenizer.train(files=files, vocab_size=vocab_size, special_tokens=special_tokens)
# enable truncation up to max_length tokens (64 as configured above)
tokenizer.enable_truncation(max_length=max_length)

In [None]:
# Drive directory that receives the tokenizer files and is also passed as the Trainer's output_dir.
model_path = "/content/drive/MyDrive/Azari/Models/AzerBert_v2"

In [None]:
# make the directory (and any missing parent folders) if not already there;
# exist_ok=True replaces the separate isdir() check, which was racy and
# failed outright when the parent folder did not exist yet
os.makedirs(model_path, exist_ok=True)
# save the tokenizer
tokenizer.save_model(model_path)
# tokenizer config
# NOTE(review): these tokenizer settings are written to config.json, while the
# load log further down lists tokenizer_config.json among the files the loader
# looks for — confirm config.json is the intended file name.
tokenizer_cfg = {
    "do_lower_case": True,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    "max_len": max_length,
}
with open(os.path.join(model_path, "config.json"), "w", encoding="utf-8") as f:
  json.dump(tokenizer_cfg, f)

In [None]:
# Reload the saved files as a transformers fast tokenizer; this rebinds `tokenizer`, replacing the training-time WordPiece object.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file /content/drive/MyDrive/Azari/Models/AzerBert_v2/config.json
loading configuration file /content/drive/MyDrive/Azari/Models/AzerBert_v2/config.json


In [None]:
def encode_with_truncation(examples):
  """Tokenize a batch of sentences, padding and truncating each one to max_length."""
  texts = examples["text"]
  encoded = tokenizer(
      texts,
      max_length=max_length,
      padding="max_length",
      truncation=True,
      return_special_tokens_mask=True,
  )
  return encoded

def encode_without_truncation(examples):
  """Tokenize a batch of sentences with no truncation or padding arguments."""
  texts = examples["text"]
  encoded = tokenizer(texts, return_special_tokens_mask=True)
  return encoded

# Pick the mapping function that matches the truncate_longer_samples switch.
encode = encode_with_truncation if truncate_longer_samples else encode_without_truncation

# tokenizing the train dataset (batched: the mapping function receives many rows per call)
train_dataset = data_train.map(encode, batched=True)
# tokenizing the test dataset
test_dataset = data_test.map(encode, batched=True)

  0%|          | 0/1191 [00:00<?, ?ba/s]

  0%|          | 0/133 [00:00<?, ?ba/s]

In [None]:
# Restrict which columns each split exposes. Only the truncated path asks for
# torch tensors here; the untruncated path keeps special_tokens_mask as well.
if truncate_longer_samples:
  for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=["input_ids", "attention_mask"])
else:
  for split in (test_dataset, train_dataset):
    split.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

In [None]:
from itertools import chain

def group_texts(examples, block_size=None):
    """Concatenate a batch of sequences and re-cut it into fixed-size chunks.

    Args:
        examples: Mapping of column name to a list of per-example sequences.
        block_size: Chunk length. When omitted, the notebook-level
            ``max_length`` is used, so existing
            ``.map(group_texts, batched=True)`` calls behave as before.

    Returns:
        dict with the same keys as ``examples``; each value is a list of
        chunks. If the concatenated batch holds at least ``block_size`` items,
        the tail that does not fill a whole chunk is dropped; a shorter batch
        is returned as one short chunk.
    """
    if block_size is None:
        block_size = max_length
    # Flatten every column into one long sequence.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    # Length is taken from the first column and applied to every column.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        # Round down to a whole number of chunks.
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

# Only runs on the untruncated path: regroup the variable-length encodings into max_length-sized chunks.
if not truncate_longer_samples:
  train_dataset = train_dataset.map(group_texts, batched=True,
                                    desc=f"Grouping texts in chunks of {max_length}")
  test_dataset = test_dataset.map(group_texts, batched=True,
                                  desc=f"Grouping texts in chunks of {max_length}")
  # convert them from lists to torch tensors
  train_dataset.set_format("torch")
  test_dataset.set_format("torch")

In [None]:
# Sanity check: number of examples in each tokenized split.
len(train_dataset), len(test_dataset)

(1190817, 132313)

# Model

In [None]:
# initialize the model with the config
# Only vocab_size and max_position_embeddings are set explicitly (from the tokenizer constants);
# every other BertConfig field is left at its default. The model starts from freshly initialised weights.
model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

In [None]:
# initialize the data collator for masked language modeling (mlm=True),
# masking tokens with probability 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# Training configuration: evaluate, log and save once per epoch; checkpoints go under model_path.
training_args = TrainingArguments(
    output_dir=model_path,
    evaluation_strategy="epoch",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=128,
    # 128 sequences x 8 accumulation steps = 1024 sequences per optimizer update per device
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=128,
    # logging_steps=10,
    logging_strategy="epoch",
    save_strategy="epoch",
    # save_steps=10,
    # NOTE(review): the TrainingArguments docs reportedly describe this field as not used by
    # Trainer directly, and trainer.train() below is called without a checkpoint argument —
    # confirm whether resuming from a checkpoint is actually intended here.
    resume_from_checkpoint=model_path,
    load_best_model_at_end=True,
    save_total_limit=1,
    )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Wire the model, training arguments, MLM collator and both tokenized splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run masked-language-model pre-training with the configuration above (no checkpoint argument is passed).
trainer.train()

# Load The Best Checkpoint

In [None]:
# load the model checkpoint
# NOTE(review): the step number is hard-coded. 11630 matches the last step of the 10-epoch run
# (ceil(ceil(1190817 / 128) / 8) * 10) assuming a single device — confirm this is the checkpoint
# you intend to load and that it still exists given save_total_limit=1.
model = BertForMaskedLM.from_pretrained(os.path.join(model_path, "checkpoint-11630"))
# load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_path)

loading configuration file /content/drive/MyDrive/Azari/Models/AzerBert_v2/checkpoint-11630/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 10000
}

loading weights file /content/drive/MyDrive/Azari/Models/AzerBert_v2/checkpoint-11630/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at /content/drive/MyDrive/

In [None]:
# Evaluate on the held-out split and report perplexity as exp(eval_loss).
# NOTE(review): `trainer` was constructed before `model` was re-bound in the checkpoint-loading
# cell, so this presumably evaluates the model object held by the trainer rather than the
# freshly loaded one — confirm which model this number describes.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: text, special_tokens_mask. If text, special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 132313
  Batch size = 128


Perplexity: 48.05


In [None]:
# Build a fill-mask pipeline around the reloaded checkpoint and tokenizer.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [None]:
# perform predictions
import random

# Seeded local generator so the same word is masked in each sentence on every run.
rng = random.Random(42)

# Slice the dataset before selecting the column so only the first 100 rows are read,
# instead of loading the whole text column and then slicing it.
for example in data_test[:100]["text"]:
    print("Input: ", example)
    # split() without an argument collapses repeated whitespace, so an empty
    # string can never be picked as the word to mask.
    arr_example = example.split()
    if not arr_example:
        # Nothing to mask in a whitespace-only sentence.
        continue
    index = rng.randrange(len(arr_example))
    print("Masked Word: ", arr_example[index])
    arr_example[index] = "[MASK]"
    masked_example = " ".join(arr_example)

    try:
        # Only the masked sentence is passed; the extra positional argument
        # the call used to receive has been dropped.
        predictions = fill_mask(masked_example)
    except Exception as err:
        # Best effort: skip sentences the pipeline cannot handle, but report
        # why instead of hiding the failure (and let KeyboardInterrupt through).
        print(f"Skipped ({type(err).__name__}): {err}")
        print("="*50)
        continue
    for prediction in predictions:
        print("Predicted Word: ", prediction["token_str"])
        print(f"{prediction['sequence']}, confidence: {prediction['score']}")
    print("="*50)

Input:  بو افاده ، بو شێلتاقلێق بؤیوک مسولیت یارادا بیلر .
Masked Word:  بو
Predicted Word:  بو
بو افاده ، بو شێلتاقلێق بویوک مسولیت یارادا بیلر., confidence: 0.04148092120885849
Predicted Word:  سیاسی
بو افاده ، سیاسی شێلتاقلێق بویوک مسولیت یارادا بیلر., confidence: 0.023639466613531113
Predicted Word:  یينی
بو افاده ، یينی شێلتاقلێق بویوک مسولیت یارادا بیلر., confidence: 0.020893214270472527
Predicted Word:  ینی
بو افاده ، ینی شێلتاقلێق بویوک مسولیت یارادا بیلر., confidence: 0.018106702715158463
Predicted Word:  بۆتۆن
بو افاده ، بۆتۆن شێلتاقلێق بویوک مسولیت یارادا بیلر., confidence: 0.017479099333286285
Input:  أللرین ایله بالینجیوین توْخونوشوندان ، ده‌ریسی ایسلانمیش ناغارا سسی قالخیر .
Masked Word:  أللرین
Predicted Word:  الی
الی ایله بالینجیوین توخونوشوندان ، دهریسی ایسلانمیش ناغارا سسی قالخیر., confidence: 0.05082245543599129
Predicted Word:  جی
جی ایله بالینجیوین توخونوشوندان ، دهریسی ایسلانمیش ناغارا سسی قالخیر., confidence: 0.02185274474322796
Predicted Word:  اللری
اللری ایله