In [45]:
import os

import pandas as pd
import numpy as np

import torch
import nltk.translate.bleu_score as bleu


from modelscope import snapshot_download
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, TaskType



In [22]:
REPO_DIRECTORY = r'/root/'
ABC_DICT_PATH = r'autodl-tmp/AIST4010-Cantonese-Translator-Data/ABC-Dict/abc_dict.csv'

def load_abc_dataset():
    abc_dict = pd.read_csv(REPO_DIRECTORY + ABC_DICT_PATH)
    abc_dataset = Dataset.from_pandas(abc_dict)
    return abc_dataset

abc_set = load_abc_dataset()
abc_shuffled_set = abc_set.shuffle(seed=42).train_test_split(test_size=0.1)
abc_train_set = abc_shuffled_set['train']
abc_test_set = abc_shuffled_set['test']
for (i, example) in enumerate(abc_train_set):
    print(example)
    if i == 5:
        break

{'en': "It doesn't matter what other people say, as the first thing you should do is to ask yourself if what you did is right or not.", 'yue': '姑勿論人哋點，諗吓你自己做得啱唔啱先。'}
{'en': 'Why does he lack confidence in doing things? Does he now just want to muddle through?', 'yue': '佢點解冇信心做嘢？而家就想打因住波？'}
{'en': 'At the beginning of the year I will start a new job.', 'yue': '年頭我會開始新嘅工作。'}
{'en': 'You need at least two people to move this dining table to the dining room.', 'yue': '呢張餐枱起碼要兩個人先夠力搬去飯廳度。'}
{'en': 'He is really an effeminate young man.', 'yue': '佢真係個女人形㗎。'}
{'en': "When I was in the sixth grade, I was bullied by a kid of mixed race, so in a neighboring class there was a big kid who suddenly ran out and helped me thoroughly bash and bloody the mongrel's head. After that he never bullied me.", 'yue': '讀小六時我俾條雜種仔㗇，噉隔籬班有個好高大男仔突然走埋嚟幫我打到個雜種仔爆晒缸，以後佢都唔㗇我。'}


In [23]:
def count_dataset_tokens(dataset):
    en_count = 0
    yue_count = 0
    for example in dataset:
        en_count += len(example['en'])
        yue_count += len(example['yue'])
    return en_count, yue_count


counts = np.array(count_dataset_tokens(abc_train_set))
print(counts)
print(counts/len(abc_train_set))


[918672 191407]
[70.52602487 14.69422693]


In [24]:
model_path=r'/root/autodl-tmp/01ai/Yi-6B-Chat'

# model = Model.from_pretrained('01ai/Yi-6B')

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     torch_dtype='auto'
# ).eval()


# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [25]:
model_dir = snapshot_download('01ai/Yi-6B-Chat', cache_dir='/root/autodl-tmp', revision='master')

In [26]:
base_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)



# Since transformers 4.35.0, the GPT-Q/AWQ model can be loaded using AutoModelForCausalLM.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype='auto',
)


# # Prompt content: "hi"
# messages = [
#     {"role": "user", "content": "hi"}
# ]


# input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
# output_ids = model.generate(input_ids.to('cuda'))
# response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

# # Model response: "Hello! How can I assist you today?"
# print(response)

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  3.36it/s]


In [27]:
# messages = [
#     {"role": "user", "content": "你識唔識講廣東話?"},
# ]

# input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
# output_ids = base_model.generate(input_ids.to('cuda'))
# response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

# # Model response: "Hello! How can I assist you today?"
# print(response)

作為一個人工智能，我沒有語言能力，因為我沒有感受和思維，只是一個由程式和數據組成的系統。但我可以幫助你解答問題或提供信息，如果你使用英文問話。


In [28]:
print(input_ids)
print(output_ids)
print(base_tokenizer.decode(input_ids[0]))
print(base_tokenizer.decode(input_ids[0]))

#get text of list of tokens in output_ids stored in array
print([base_tokenizer.decode([token]) for token in output_ids[0]])

tensor([[    6,  2942,   144, 59725, 62028,   534,   453,   453, 62028, 62364,
         62098, 61518, 61845,   100,     7, 59568,   144,     6, 14135,   144]])
tensor([[    6,  2942,   144, 59725, 62028,   534,   453,   453, 62028, 62364,
         62098, 61518, 61845,   100,     7, 59568,   144,     6, 14135,   144,
         26747, 12666, 13992,   101, 59646, 10870, 53202,  2604,   101, 12354,
         59646, 10870,  6905, 59652, 60084, 62108,   101,  3331, 12666, 59903,
         56241, 59652, 37228, 61728, 11984, 25786,   102, 16097,  1229, 36293,
         59725, 21226, 14375, 59876,  2479,  2530,   101,  7953,  2253, 14877,
         61420, 61845,   102,     7]], device='cuda:0')
<|im_start|> user
你識唔識講廣東話?<|im_end|> 
<|im_start|> assistant

<|im_start|> user
你識唔識講廣東話?<|im_end|> 
<|im_start|> assistant

['<|im_start|>', 'user', '\n', '你', '識', '�', '�', '�', '識', '講', '廣', '東', '話', '?', '<|im_end|>', '', '\n', '<|im_start|>', 'assistant', '\n', '作為', '一個', '人工智能', '，', '我', '沒有', '語言

In [29]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['en'])):
        text1 = f"""
        <|im_start|> user
        Translate the following words into Cantonese: 
        {example['en'][i]}
        <|im_start|>assistant
        {example['yue'][i]}
        """
        text2 = f"""
        <|im_start|> user
        Translate the following words into English:
        {example['yue'][i]}
        <|im_start|>assistant
        {example['en'][i]}
        """
        output_texts.append(text1)
        output_texts.append(text2)
    return output_texts

In [30]:
prompts = formatting_prompts_func(abc_set[:10])
for prompt in prompts:
    print(prompt)


        <|im_start|> user
        Translate the following words into Cantonese: 
        Scoop up water
        <|im_start|>assistant
        㧾水
        

        <|im_start|> user
        Translate the following words into English:
        㧾水
        <|im_start|>assistant
        Scoop up water
        

        <|im_start|> user
        Translate the following words into Cantonese: 
        Ladle out soup
        <|im_start|>assistant
        㧾湯
        

        <|im_start|> user
        Translate the following words into English:
        㧾湯
        <|im_start|>assistant
        Ladle out soup
        

        <|im_start|> user
        Translate the following words into Cantonese: 
        Third son of a rich family
        <|im_start|>assistant
        三少
        

        <|im_start|> user
        Translate the following words into English:
        三少
        <|im_start|>assistant
        Third son of a rich family
        

        <|im_start|> user
        Translate the follow

In [31]:
# for name, param in base_model.named_parameters():
#     print(f"Parameter name: {name}")
#     print(param)
#     print("-" * 50)

In [32]:
print(base_model.config)

LlamaConfig {
  "_name_or_path": "/root/autodl-tmp/01ai/Yi-6B-Chat",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 5000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 64000
}



In [33]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules = ["k_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
peft_model = get_peft_model(base_model, 
                            lora_config)

peft_model.print_trainable_parameters()

trainable params: 17,825,792 || all params: 6,078,861,312 || trainable%: 0.293242288071467


**Train Tokenizer**

In [34]:
def get_training_corpus(dataset):
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        sample_en = samples["en"]
        sample_yue = samples["yue"]
        for i in range(len(sample_en)):
            yield sample_en[i]
            yield sample_yue[i]

training_corpus = get_training_corpus(abc_train_set)

tokenizer = base_tokenizer.train_new_from_iterator(training_corpus, vocab_size=40000)
tokenizer.save_pretrained("tokenizer")






('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/tokenizer.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [35]:
print(tokenizer("嗌呃畀啲嘢噃"))
print(base_tokenizer("嗌呃畀啲嘢噃"))
print(tokenizer("Good morning"))
print(base_tokenizer("Good morning"))

{'input_ids': [89, 633, 504, 1928, 3992, 668], 'attention_mask': [1, 1, 1, 1, 1, 1]}
{'input_ids': [59568, 534, 456, 445, 534, 450, 436, 536, 454, 433, 534, 454, 483, 534, 457, 467, 534, 458, 436], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [25778, 5554], 'attention_mask': [1, 1]}
{'input_ids': [6076, 4040], 'attention_mask': [1, 1]}


In [40]:
# bleu = evaluate.load('bleu')

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    print(predictions.shape, labels.shape)
    return {"bleu": bleu(predictions, labels)}

In [46]:
training_args = TrainingArguments(
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=100,
    output_dir="root/peft_model"
)

trainer = SFTTrainer(
    peft_model,
    args=training_args,
    train_dataset= abc_train_set,
    eval_dataset= abc_test_set,
    tokenizer=tokenizer,
    formatting_func=formatting_prompts_func,
    compute_metrics=compute_metrics,
)
trainer.train()

TypeError: TrainingArguments.__init__() missing 1 required positional argument: 'output_dir'

In [None]:
trainer.model.save_pretrained("root/peft_model")