In [1]:
# !pip install transformers==4.28.0

In [2]:
# !pip install peft

In [3]:
# !pip install transformers[torch] datasets

# Labelling verb root





In [4]:
import pandas as pd

# Load the TSV files into DataFrames
df_branch = pd.read_csv('../data/branch2.tsv', sep='\t')
print(df_branch.head())

df_news = pd.read_csv('../data/news2.tsv', sep='\t')
print(df_news.head())

df_products = pd.read_csv('../data/products2.tsv', sep='\t')
print(df_products.head())

df_problem = pd.read_csv('../data/problem2.tsv', sep='\t')
print(df_problem.head())

                                            prompt    tag
0                      Төв салбар хаана байдаг вэ?  <bra>
1                    Багануур салбарын хаяг юу вэ?  <bra>
2            Баянгол салбар ажлын хэдэн цагтэй вэ?  <bra>
3            Сонгинохайрхан салбарын утасны дугаар  <bra>
4  Сүхбаатар салбарын ажлын өдрүүд ямар байдаг вэ?  <bra>
                                              prompt    tag
0  Хасбанкны хамгийн сүүлийн үеийн мэдээ хаана ни...  <npa>
1              Банкны жилийн тайланг хэрхэн үзэх вэ   <npa>
2           Сайн уу, хасбанкны олон улсын үнэлгээ     <npa>
3  Хас банкны түүхэн амжилтуудын талаар дэлгэрэнг...  <npa>
4  Ажлын цагийн өөрчлөлттэй холбоотой мэдээллийг ...  <npa>
                                              prompt    tag
0  Цалингийн зээл авахад тавигдах шаардлагуудыг т...  <pro>
1  Хэрэглээний худалдан авалтын зээлд ямар төрлий...  <pro>
2  Өрхийн зээлийн нөхцлийг бусад зээлтэй харьцуул...  <pro>
3          Зээлийн хүүгийн тооцооллыг хэрхэн хийх вэ

In [5]:
merged_df = pd.concat([df_branch, df_news, df_products, df_problem]) 
# Shuffle the DataFrame 
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

print(merged_df.head())
print(len(merged_df))

                                              prompt    tag
0            Нарантуул тооцооны төв хэзээ хаадаг вэ?  <bra>
1  БГБ банкны хадгаламжийн хураамжийн төлбөрийн т...  <noa>
2  Капитал банкны харилцагчдын үйлчилгээний талаа...  <noa>
3  Хас банк ирээдүйн зорилтуудаа хэрхэн тодорхойл...  <npa>
4         ХасБанк ямар хөтөлбөр хэрэгжүүлж байна вэ?  <pro>
1745


In [6]:
import datasets
from datasets import Dataset, DatasetDict

df = pd.DataFrame(columns=['prompt', 'tag'])
df['prompt'] = '<s> bb: '+ merged_df['prompt']
df['tag'] = merged_df['tag']+'</s>'

infl_dataset = Dataset.from_pandas(df)
print(df.head())

                                              prompt        tag
0    <s> bb: Нарантуул тооцооны төв хэзээ хаадаг вэ?  <bra></s>
1  <s> bb: БГБ банкны хадгаламжийн хураамжийн төл...  <noa></s>
2  <s> bb: Капитал банкны харилцагчдын үйлчилгээн...  <noa></s>
3  <s> bb: Хас банк ирээдүйн зорилтуудаа хэрхэн т...  <npa></s>
4  <s> bb: ХасБанк ямар хөтөлбөр хэрэгжүүлж байна...  <pro></s>


Split the dataset's `train_asks` split into a train and test set with the [train_test_split](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.train_test_split) method:

In [7]:
ds_train_devtest = infl_dataset.train_test_split(test_size=0.025, seed = 10)
ds_devtest = ds_train_devtest['test'].train_test_split(test_size=0.5, seed = 10)

ds_splits = DatasetDict({
    'train': ds_train_devtest['train'],
    'valid': ds_devtest['train'],
    'test': ds_devtest['test']
})
print(ds_splits)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'tag'],
        num_rows: 1701
    })
    valid: Dataset({
        features: ['prompt', 'tag'],
        num_rows: 22
    })
    test: Dataset({
        features: ['prompt', 'tag'],
        num_rows: 22
    })
})


Then take a look at an example:

In [8]:
def concatenate_columns(example):
    example["prompt"] = example["prompt"] + " " + example["tag"]
    example["tag"] = example["tag"]
    return example

ds_splits["train"] = ds_splits["train"].map(concatenate_columns)
ds_splits["valid"] = ds_splits["valid"].map(concatenate_columns)
ds_splits = ds_splits.flatten()

Map:   0%|          | 0/1701 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

In [9]:
ds_splits["train"][0:5]

{'prompt': ['<s> bb: Яагаад хүмүүс цаг хугацааг мэдэрдэг юм бол? <noa></s>',
  '<s> bb: Төв салбарын хэдэн цагт хаадаг вэ? <bra></s>',
  '<s> bb: Миний дансанд орлого орсон эсэхийг яаж мэдэх вэ? <pro></s>',
  '<s> bb: Хадгаламж барьцаалсан зээлд ямар төрлийн хадгаламж барьцаалж болох вэ <pro></s>',
  '<s> bb: Мэдээллийн ёс зүй гэж юуг хэлэх вэ  <noa></s>'],
 'tag': ['<noa></s>', '<bra></s>', '<pro></s>', '<pro></s>', '<noa></s>']}

## Preprocess

The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bayartsogt/mongolian-gpt2")



You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to
extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method:

In [11]:
def preprocess_function(examples):
    return tokenizer(examples["prompt"])

In [12]:
try:
    tokenized_ds_ = ds_splits.map(
        preprocess_function,
        batched=True,
        num_proc=4,
        remove_columns=ds_splits["train"].column_names
    )
except Exception as e:
    print(f"Error encountered with num_proc=4: {e}")
    print("Retrying with num_proc=1...")
    try:
        tokenized_ds_ = ds_splits.map(
            preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=ds_splits["train"].column_names
        )
    except Exception as e2:
        print(f"Error encountered with num_proc=1: {e2}")
        raise

Map (num_proc=4):   0%|          | 0/1701 [00:00<?, ? examples/s]

Error encountered with num_proc=4: name 'tokenizer' is not defined
Retrying with num_proc=1...


Map:   0%|          | 0/1701 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

In [13]:
print(tokenized_ds_["train"][0])

{'input_ids': [0, 2118, 70, 30, 3398, 679, 577, 4661, 11316, 394, 320, 35, 11210, 38297, 69, 34, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [14]:
block_size = 64

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
try:
    lm_dataset = tokenized_ds_.map(group_texts, batched=True, num_proc=4)
except Exception as e:
    print(f"Error encountered with num_proc=4: {e}")
    print("Retrying with num_proc=1...")
    try:
        lm_dataset = tokenized_ds_.map(group_texts, batched=True, num_proc=1)
    except Exception as e2:
        print(f"Error encountered with num_proc=1: {e2}")
        raise

Map (num_proc=4):   0%|          | 0/1701 [00:00<?, ? examples/s]

Error encountered with num_proc=4: name 'block_size' is not defined
Retrying with num_proc=1...


Map:   0%|          | 0/1701 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

In [16]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [17]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("bayartsogt/mongolian-gpt2")



In [18]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
print(f"Using device: {device}")

Using device: cuda


In [21]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Set up training arguments with push_to_hub enabled
training_args = TrainingArguments(
    output_dir='./results3',
    evaluation_strategy="steps", 
    learning_rate=5e-5,
    per_device_train_batch_size=8,   
    per_device_eval_batch_size=8,    
    num_train_epochs=20,              
    weight_decay=0.01,              
    save_strategy="steps", 
    save_steps=500,
    logging_dir='./logs',            
    logging_steps=100,
    push_to_hub=True,                # Enable pushing to hub
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    warmup_steps=300,
    hub_model_id="Ikhee10/khasbank_three_classifier_v13"
)

model.to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["valid"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.05)]
)

# Train the model
trainer.train()

# Evaluate the model
evaluation_results = trainer.evaluate()
print(evaluation_results)

# Push the model to Hugging Face Hub
trainer.push_to_hub()


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Ikhee10/khasbank_three_classifier_v13 into local empty directory.


  0%|          | 0/1180 [00:00<?, ?it/s]

{'loss': 7.4999, 'learning_rate': 1.6666666666666667e-05, 'epoch': 1.69}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.9377739429473877, 'eval_runtime': 0.0209, 'eval_samples_per_second': 286.911, 'eval_steps_per_second': 47.818, 'epoch': 1.69}
{'loss': 3.0483, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.39}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.114480972290039, 'eval_runtime': 0.028, 'eval_samples_per_second': 214.414, 'eval_steps_per_second': 35.736, 'epoch': 3.39}
{'loss': 1.5643, 'learning_rate': 5e-05, 'epoch': 5.08}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7666038274765015, 'eval_runtime': 0.0234, 'eval_samples_per_second': 256.713, 'eval_steps_per_second': 42.785, 'epoch': 5.08}
{'loss': 0.9731, 'learning_rate': 4.431818181818182e-05, 'epoch': 6.78}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9449290037155151, 'eval_runtime': 0.0297, 'eval_samples_per_second': 201.911, 'eval_steps_per_second': 33.652, 'epoch': 6.78}
{'loss': 0.5582, 'learning_rate': 3.8636363636363636e-05, 'epoch': 8.47}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.591663122177124, 'eval_runtime': 0.0224, 'eval_samples_per_second': 268.424, 'eval_steps_per_second': 44.737, 'epoch': 8.47}
{'loss': 0.3162, 'learning_rate': 3.295454545454545e-05, 'epoch': 10.17}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.8739731311798096, 'eval_runtime': 0.0201, 'eval_samples_per_second': 298.58, 'eval_steps_per_second': 49.763, 'epoch': 10.17}
{'loss': 0.1668, 'learning_rate': 2.7272727272727273e-05, 'epoch': 11.86}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.904003381729126, 'eval_runtime': 0.0178, 'eval_samples_per_second': 336.689, 'eval_steps_per_second': 56.115, 'epoch': 11.86}
{'loss': 0.0922, 'learning_rate': 2.1590909090909093e-05, 'epoch': 13.56}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.0118744373321533, 'eval_runtime': 0.0108, 'eval_samples_per_second': 554.106, 'eval_steps_per_second': 92.351, 'epoch': 13.56}
{'loss': 0.0659, 'learning_rate': 1.590909090909091e-05, 'epoch': 15.25}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.0164401531219482, 'eval_runtime': 0.0106, 'eval_samples_per_second': 566.606, 'eval_steps_per_second': 94.434, 'epoch': 15.25}
{'loss': 0.0529, 'learning_rate': 1.0227272727272729e-05, 'epoch': 16.95}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.009570837020874, 'eval_runtime': 0.0201, 'eval_samples_per_second': 298.202, 'eval_steps_per_second': 49.7, 'epoch': 16.95}


  state_dict = torch.load(best_model_path, map_location="cpu")


{'train_runtime': 116.4566, 'train_samples_per_second': 80.717, 'train_steps_per_second': 10.133, 'train_loss': 1.4337734851837158, 'epoch': 16.95}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.591663122177124, 'eval_runtime': 0.0177, 'eval_samples_per_second': 339.799, 'eval_steps_per_second': 56.633, 'epoch': 16.95}


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/487M [00:00<?, ?B/s]

remote: error: cannot lock ref 'refs/heads/main': is at 8336bed6500fba256a51c20bd6a5530881bcd368 but expected e6973ebc4dbfd57314d8c8f8a5f446ce1f76042c        
To https://huggingface.co/Ikhee10/khasbank_three_classifier_v13
 ! [remote rejected] main -> main (failed to update ref)
error: failed to push some refs to 'https://huggingface.co/Ikhee10/khasbank_three_classifier_v13'



OSError: remote: error: cannot lock ref 'refs/heads/main': is at 8336bed6500fba256a51c20bd6a5530881bcd368 but expected e6973ebc4dbfd57314d8c8f8a5f446ce1f76042c        
To https://huggingface.co/Ikhee10/khasbank_three_classifier_v13
 ! [remote rejected] main -> main (failed to update ref)
error: failed to push some refs to 'https://huggingface.co/Ikhee10/khasbank_three_classifier_v13'


In [22]:
trainer.push_to_hub()

To https://huggingface.co/Ikhee10/khasbank_three_classifier_v13
   8336bed..9f2fbcb  main -> main



In [20]:
!git config --global http.postBuffer 157286400

Once training is completed, use the [evaluate()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.evaluate) method to evaluate your model and get its perplexity:

In [23]:
import math

# eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(evaluation_results['eval_loss']):.2f}")

Perplexity: 13.35


<Tip>

For a more in-depth example of how to finetune a model for causal language modeling, take a look at the corresponding
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).

</Tip>

## Inference

Great, now that you've finetuned a model, you can use it for inference!

Come up with a prompt you'd like to generate text from:

The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for text generation with your model, and pass your text to it:

In [24]:
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer = tokenizer, device=0 if device.type == 'cuda' else -1,num_beams=5)

In [25]:
import re

def process_generated_text(text):
    # Example regex to extract data in the format you've specified
    matches = re.findall(r'<(.*?)>', text)
    ans = matches[1] if len(matches) > 0 else ''
    if ' ' in ans:
        ans = ans.split(' ')[0].strip()
    return ans

def get_new_prompt(prompt, df=merged_df):
    matched_row = df[df['prompt'] == prompt]
    
    if not matched_row.empty:
        return matched_row['newPrompt'].values[0]
    else:
        return None

In [26]:
def generate_and_process(prompt):
    root_prompt = get_new_prompt(prompt)
    result = generator(prompt, max_length=32, pad_token_id=tokenizer.eos_token_id)
    generated_text = result[0]['generated_text']

    return process_generated_text(generated_text)

In [27]:
# Testing
prompt = '<s> bb: Өрхийн зээлд бүх гэр бүлийн гишүүдийн орлогыг тооцоолох уу'
ans = generate_and_process(prompt)
print(ans)



pro


In [28]:
def evaluate_accuracy(dataset):
    correct = 0
    total = len(dataset['prompt'])

    for i in range(total):
        prompt = dataset['prompt'][i]
        
        actual_tag = dataset['tag'][i].split('<')[1].split('>')[0]
        
        ans = generate_and_process(prompt)
        
        if ans == actual_tag:
            correct += 1

    verb_root_accuracy = correct / total

    return verb_root_accuracy

import warnings
warnings.filterwarnings("ignore")

test_dataset = ds_splits["test"]
accuracy = evaluate_accuracy(test_dataset)

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 90.91%
