# Fine-tuning a LLM as a Cantonese Jyutping to Text Translator

In [None]:
import os

# Uncomment the following lines if AudoDL is used
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model
import torch

model_name = "hon9kon9ize/CantoneseLLMChat-v1.0-7B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    use_fast=False  
)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16  
).eval()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
base_model.enable_input_require_grads()

In [None]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv("train_jyutping_no_tone.csv")
# For example, the CSV might look like:
#    yue               jyutping
# 0  泥水佬開門口過得人過得自己  nai seoi lou hoi mun hau gwo dak jan gwo dak zi gei
# 1  杞人嘅朋友嘆咗一口氣       ...
# ... etc.

dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['yue', 'zh', 'jyutping'],
    num_rows: 22503
})


In [None]:
def format_example(ex):
    # Define the system message to inform the model of its role and task
    system_message = "你是一個粵語翻譯助手。你的任務是將粵拼（Jyutping）翻譯成對應的廣東話文字。"
    
    # User prompt with Jyutping input
    user_prompt = f"請將以下粵拼翻譯成廣東話：\n{ex['jyutping']}"
    
    # Target output is the Cantonese text corresponding to the Jyutping input
    target_output = ex['yue']
    
    # Construct the full prompt with start and end tags
    full_prompt = (
        f"<|im_start|>system\n{system_message}<|im_end|>\n"
        f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    
    # Target Output + Ending Tag
    full_target = f"{target_output}<|im_end|>"
    
    return {
        "prompt": full_prompt,
        "target": full_target
    }

formatted_dataset = dataset.map(format_example)

Map:   0%|          | 0/22503 [00:00<?, ? examples/s]

In [6]:
print(formatted_dataset['prompt'][:1])  # Check the first two examples

['<|im_start|>system\n你是一個粵語翻譯助手。你的任務是將粵拼（Jyutping）翻譯成對應的廣東話文字。<|im_end|>\n<|im_start|>user\n請將以下粵拼翻譯成廣東話：\ngei jan ge pang jau taan zo jat hau hei<|im_end|>\n<|im_start|>assistant\n']


In [7]:
print(formatted_dataset['target'][:1])  # Check the first two examples

['杞人嘅朋友嘆咗一口氣<|im_end|>']


In [None]:
def tokenize_function(ex, max_length=512):
    # Tokenize prompt
    prompt_tokens = tokenizer(
        ex["prompt"],
        truncation=True,
        max_length=max_length,
        add_special_tokens=False
    )
    # Tokenize target
    target_tokens = tokenizer(
        ex["target"],
        truncation=True,
        max_length=max_length,
        add_special_tokens=False
    )

    # Combining the prompt and the target
    input_ids = prompt_tokens["input_ids"] + target_tokens["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = (
        prompt_tokens["attention_mask"] + target_tokens["attention_mask"] + [1]
    )

    # Build labels so that only the target part (plus EOS) is trainable (the prompt tokens can be -100 to ignore them)
    labels = [-100] * len(prompt_tokens["input_ids"]) + target_tokens["input_ids"] + [tokenizer.eos_token_id]

    # Optional truncation step (in case combined length is too long)
    input_ids = input_ids[:max_length]
    attention_mask = attention_mask[:max_length]
    labels = labels[:max_length]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=False, remove_columns=formatted_dataset.column_names)
tokenized_dataset


Map:   0%|          | 0/22503 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 22503
})

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], 
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    inference_mode=False
)

peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()  


trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, padding=True)
training_args = TrainingArguments(
    output_dir="./cantonese-llm-lora-output",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,  
    num_train_epochs=3,            
    learning_rate=1e-4,
    logging_steps=10,
    save_steps=100,
    bf16=True,  
    optim="adamw_torch",
    report_to="none",  
    gradient_checkpointing=False     
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,2.6539
20,1.9228
30,1.5347
40,1.3242
50,1.1648
60,1.0109
70,0.9193
80,0.8235
90,0.8454
100,0.777


TrainOutput(global_step=1053, training_loss=0.40258165526027806, metrics={'train_runtime': 2661.6802, 'train_samples_per_second': 25.363, 'train_steps_per_second': 0.396, 'total_flos': 4.141878294265528e+17, 'train_loss': 0.40258165526027806, 'epoch': 2.9925346605047993})

In [14]:
trainer.save_model("cantonese-lora-checkpoint")

# A Simple Example on using the Fine-Tuned Model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

base_model_name = "hon9kon9ize/CantoneseLLMChat-v1.0-7B"
lora_model_path = "cantonese-lora-checkpoint"

tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=False)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name, 
    device_map="auto", 
    torch_dtype=torch.bfloat16
).eval()

lora_model = PeftModel.from_pretrained(base_model, lora_model_path)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
def generate_cantonese(jyutping_input):
    
    system_message = "你是一個粵語翻譯助手。你的任務是將粵拼（Jyutping）翻譯成對應的廣東話文字。"
    user_prompt = f"請將以下粵拼翻譯成廣東話：\n{jyutping_input}"
    
    prompt_text = (
        f"<|im_start|>system\n{system_message}<|im_end|>\n"
        f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"  
    )
    
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")  
    
    with torch.no_grad():
        outputs = lora_model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.1,
            do_sample=True
        )
    
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if decoded_output.startswith(prompt_text):
        generated_text = decoded_output[len(prompt_text):].strip()
    else:
        generated_text = decoded_output.strip()
    return generated_text

test_input = "m hai gong siu zan hai gei sai lei"
prediction = generate_cantonese(test_input)
print("Prediction:", prediction)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Prediction: system
你是一個粵語翻譯助手。你的任務是將粵拼（Jyutping）翻譯成對應的廣東話文字。
user
請將以下粵拼翻譯成廣東話：
m hai gong siu zan hai gei sai lei
assistant
唔係講笑真係幾犀利
