In [1]:
import os

import pandas as pd
import numpy as np

import torch
import nltk.translate.bleu_score as bleu


from modelscope import snapshot_download
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, TaskType



2024-03-17 20:30:50,961 - modelscope - INFO - PyTorch version 2.2.1 Found.
2024-03-17 20:30:50,964 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2024-03-17 20:30:51,074 - modelscope - INFO - Loading done! Current index file version is 1.13.1, with md5 b5a2c5fe01f7460b3e700a8ce7e6fc94 and a total number of 972 components indexed
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
REPO_DIRECTORY = r'/root/'
ABC_DICT_PATH = r'autodl-tmp/AIST4010-Cantonese-Translator-Data/ABC-Dict/abc_dict.csv'

def load_abc_dataset():
    abc_dict = pd.read_csv(REPO_DIRECTORY + ABC_DICT_PATH)
    abc_dataset = Dataset.from_pandas(abc_dict)
    return abc_dataset

abc_set = load_abc_dataset()
abc_shuffled_set = abc_set.shuffle(seed=42).train_test_split(test_size=0.1)
abc_train_set = abc_shuffled_set['train']
abc_test_set = abc_shuffled_set['test']
for (i, example) in enumerate(abc_train_set):
    print(example)
    if i == 5:
        break

{'en': 'All the knives have become rusty.', 'yue': '啲刀都生咗銹。'}
{'en': 'Today our boss got arrested, so now what can we do?', 'yue': '今日大佬衰咗喇，噉而家我哋點算好啊？'}
{'en': 'Your Mom misses you so much.', 'yue': '你媽咪好掛住你。'}
{'en': 'As soon as it rains, then my back feels sore.', 'yue': '一落雨，我就腰酸背痛嘅喇。'}
{'en': "Yesterday you forgot to punch your time card, so will the boss mistakenly think you didn't come to work?", 'yue': '琴日你唔記得打咭，會唔會俾波士誤會你冇返工呢？'}
{'en': "Some Hong Kong movie stars go to mainland China to try to make money, in fact in the mainland's world of show business supply of actors exceeds demand, so if the Hongkongers can't become famous, then there's no certainty they'll make easy money there.", 'yue': '有啲香港明星返內哋搵錢，其實內地演藝圈都係粥少僧多，所以如果佢哋紅唔起就唔一定有金執。'}


In [3]:
def count_dataset_tokens(dataset):
    en_count = 0
    yue_count = 0
    for example in dataset:
        en_count += len(example['en'])
        yue_count += len(example['yue'])
    return en_count, yue_count


counts = np.array(count_dataset_tokens(abc_train_set))
print(counts)
print(counts/len(abc_train_set))


[919203 191544]
[70.5667895  14.70474436]


In [4]:
model_path=r'/root/autodl-tmp/01ai/Yi-6B-Chat'

# model = Model.from_pretrained('01ai/Yi-6B')

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     torch_dtype='auto'
# ).eval()


# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
model_dir = snapshot_download('01ai/Yi-6B-Chat', cache_dir='/root/autodl-tmp', revision='master')

In [6]:
base_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, padding_side='right', max_length=512, return_tensors='pt')
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, padding_side='right', max_length=512, return_tensors='pt')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


# Since transformers 4.35.0, the GPT-Q/AWQ model can be loaded using AutoModelForCausalLM.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    torch_dtype='auto',
)


# # Prompt content: "hi"
# messages = [
#     {"role": "user", "content": "hi"}
# ]


# input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
# output_ids = model.generate(input_ids.to('cuda'))
# response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

# # Model response: "Hello! How can I assist you today?"
# print(response)

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]


In [7]:
messages = [
    {"role": "user", "content": "hi"},
]

input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
output_ids = base_model.generate(input_ids.to('cuda'))
response = base_tokenizer.decode(output_ids[0], skip_special_tokens=False, max_length=100)

# Model response: "Hello! How can I assist you today?"
print(response)

Hello! It's great to see you using the ChatGPT service. How can I assist you today? If you have any questions or need help with something, feel free to ask!


In [8]:
# print(input_ids)
# print(output_ids)
# print(base_tokenizer.decode(input_ids[0]))
# print(base_tokenizer.decode(input_ids[0]))

# #get text of list of tokens in output_ids stored in array
# print([base_tokenizer.decode([token]) for token in output_ids[0]])

In [9]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['en'])):
        text1 = f"<|im_start|> user
        Translate the following words into Cantonese: {example['en'][i]} <|im_end|> 
        <|im_start|> assistant
        {example['yue'][i]} <|im_end|>"
        text2 = f"<|im_start|> user
        Translate the following words into English:
        {example['yue'][i]} <|im_end|>
        <|im_start|> assistant
        {example['en'][i]} <|im_end|>"
        output_texts.append(text1)
        output_texts.append(text2)
    return output_texts

In [10]:
prompts = formatting_prompts_func(abc_set[:10])
for prompt in prompts:
    print(prompt)

<|im_start|> user
        Translate the following words into Cantonese: Scoop up water <|im_end|> 
        <|im_start|> assistant
        㧾水 <|im_end|>
<|im_start|> user
        Translate the following words into English:
        㧾水 <|im_end|>
        <|im_start|> assistant
        Scoop up water <|im_end|>
<|im_start|> user
        Translate the following words into Cantonese: Ladle out soup <|im_end|> 
        <|im_start|> assistant
        㧾湯 <|im_end|>
<|im_start|> user
        Translate the following words into English:
        㧾湯 <|im_end|>
        <|im_start|> assistant
        Ladle out soup <|im_end|>
<|im_start|> user
        Translate the following words into Cantonese: Third son of a rich family <|im_end|> 
        <|im_start|> assistant
        三少 <|im_end|>
<|im_start|> user
        Translate the following words into English:
        三少 <|im_end|>
        <|im_start|> assistant
        Third son of a rich family <|im_end|>
<|im_start|> user
        Translate the following

In [11]:
# for name, param in base_model.named_parameters():
#     print(f"Parameter name: {name}")
#     print(param)
#     print("-" * 50)

In [12]:
print(base_model.config)

LlamaConfig {
  "_name_or_path": "/root/autodl-tmp/01ai/Yi-6B-Chat",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 5000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 64000
}



In [13]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules = ["k_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
peft_model = get_peft_model(base_model, 
                            lora_config)

peft_model.print_trainable_parameters()

trainable params: 17,825,792 || all params: 6,078,861,312 || trainable%: 0.293242288071467


**Train Tokenizer**

In [14]:
def get_training_corpus(dataset):
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        sample_en = samples["en"]
        sample_yue = samples["yue"]
        for i in range(len(sample_en)):
            yield sample_en[i]
            yield sample_yue[i]

def get_yue_training_corpus(dataset):
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        sample_yue = samples["yue"]
        for i in range(len(sample_yue)):
            yield sample_yue[i]

training_corpus = get_yue_training_corpus(abc_train_set)


curr_vocab = set(tokenizer.vocab)
# print(curr_vocab)
new_vocab = set()
#iterate through all training_corpus lines
for i, line in enumerate(training_corpus):
    unique_chars = set(list(line))
    print(unique_chars)
    new_tokens = unique_chars - curr_vocab - new_vocab
    new_vocab |= new_tokens


# for i in range(training_corpus):
#     next(training_corpus)
#     line = next(training_corpus)
#     for char in line:
#         if char not in tokenizer.vocab and char not in new_vocab:
#             new_vocab.add(char)
print(new_vocab)
print(len(new_vocab))
tokenizer.add_tokens(list(new_vocab))
tokenizer.save_pretrained("/root/tokenizer")

{'咗', '。', '啲', '刀', '銹', '都', '生'}
{'而', '噉', '咗', '啊', '哋', '？', '點', '日', '好', '佬', '大', '，', '衰', '喇', '我', '家', '算', '今'}
{'住', '你', '。', '咪', '掛', '好', '媽'}
{'嘅', '。', '一', '腰', '就', '，', '背', '酸', '落', '痛', '我', '喇', '雨'}
{'唔', '返', '波', '冇', '你', '得', '日', '咭', '打', '呢', '？', '俾', '琴', '，', '記', '工', '會', '誤', '士'}
{'金', '多', '執', '唔', '返', '。', '啲', '內', '就', '定', '僧', '星', '地', '香', '藝', '紅', '起', '少', '明', '哋', '錢', '粥', '以', '搵', '，', '都', '演', '圈', '係', '如', '一', '實', '佢', '果', '所', '有', '港', '其'}
{'𠾍', '嘅', '？', '啲', '點', '公', '務', '員', '解', '咁', '有', '會'}
{'書', '唔', '你', '仲', '？', '緊', '讀', '嗰', '係', '咩', '陣'}
{'食', '飯', '至', '。', '完', '就', '我'}
{'唔', '。', '啲', '波', '鑊', '日', '同', '打', '我', '輸', '高', '教', '，', '練', '學', '今', '甘', '興', '好'}
{'。', '仔', '度', '嘅', '收', '呢', '和', '哋', '班', '𡃁', '，', '都', '嗰', '係', '古', '勝', '惑', '佢', '嚟'}
{'政', '兩', '。', '香', '推', '策', '語', '廣', '三', '文', '港', '府'}
{'房', '你', '？', '乜', '刀', '喱', '㗎', '單', '間', '攪', '到', '烏'}
{'多', '！', '你', '

('/root/tokenizer/tokenizer_config.json',
 '/root/tokenizer/special_tokens_map.json',
 '/root/tokenizer/tokenizer.model',
 '/root/tokenizer/added_tokens.json',
 '/root/tokenizer/tokenizer.json')

In [15]:
print(tokenizer("嗌呃畀啲嘢噃")['input_ids'])
print(base_tokenizer("嗌呃畀啲嘢噃")['input_ids'])
print(tokenizer.tokenize("嗌呃畀啲嘢噃"))
print(base_tokenizer.tokenize("嗌呃畀啲嘢噃"))
print(tokenizer("Good morning")['input_ids'])
print(base_tokenizer("Good morning")['input_ids'])

[65022, 534, 450, 436, 536, 454, 433, 534, 454, 483, 534, 457, 467, 534, 458, 436]
[59568, 534, 456, 445, 534, 450, 436, 536, 454, 433, 534, 454, 483, 534, 457, 467, 534, 458, 436]
['▁嗌', '<0xE5>', '<0x91>', '<0x83>', '<0xE7>', '<0x95>', '<0x80>', '<0xE5>', '<0x95>', '<0xB2>', '<0xE5>', '<0x98>', '<0xA2>', '<0xE5>', '<0x99>', '<0x83>']
['▁', '<0xE5>', '<0x97>', '<0x8C>', '<0xE5>', '<0x91>', '<0x83>', '<0xE7>', '<0x95>', '<0x80>', '<0xE5>', '<0x95>', '<0xB2>', '<0xE5>', '<0x98>', '<0xA2>', '<0xE5>', '<0x99>', '<0x83>']
[6076, 4040]
[6076, 4040]


In [16]:
# bleu = evaluate.load('bleu')

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    print(predictions.shape, labels.shape)
    return {"bleu": bleu(predictions, labels)}

In [17]:
peft_model.resize_token_embeddings(len(tokenizer))

Embedding(65167, 4096)

In [18]:
training_args = TrainingArguments(
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=3,
    max_steps=300,
    logging_steps=100,
    output_dir="/root/peft_model"
)

trainer = SFTTrainer(
    peft_model,
    args=training_args,
    train_dataset= abc_train_set,
    eval_dataset= abc_test_set,
    tokenizer=tokenizer,
    formatting_func=formatting_prompts_func,
    compute_metrics=compute_metrics,
)
trainer.train()

Map: 100%|██████████| 13026/13026 [00:01<00:00, 8402.78 examples/s]
Map: 100%|██████████| 1448/1448 [00:00<00:00, 8497.09 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


TrainOutput(global_step=300, training_loss=1.1928130594889323, metrics={'train_runtime': 66.8764, 'train_samples_per_second': 35.887, 'train_steps_per_second': 4.486, 'total_flos': 8621311562612736.0, 'train_loss': 1.1928130594889323, 'epoch': 0.09})

In [19]:
trainer.model.save_pretrained("/root/peft_model")



In [20]:
# #get random data from test dataset
# for i in range(5):
#     example = abc_test_set[i]
#     print(example)
#     text1 = f"""Translate the following words into Cantonese: 
#         {example['en']}
#         """
#     text2 = f"""Translate the following words into English:
#         {example['yue']}
#         """
#     texts = [text1, text2]
#     for text in texts:
#         messages = [
#             {"role": "user", "content": text}
#         ]
#         print(messages)
#         #print model outputs for base_model and peft_model
#         base_input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
#         peft_input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
#         print("Base ID:", base_input_ids)
#         print("Base Input:", base_tokenizer.decode(base_input_ids[base_input_ids.shape[1]:], skip_special_tokens=True))
#         print("PEFT ID:", peft_input_ids)
#         print("PEFT Input:", tokenizer.decode(peft_input_ids[peft_input_ids.shape[1]:], skip_special_tokens=True))
#         print(peft_input_ids)
#         base_output_ids = base_model.generate(base_input_ids.to('cuda'), max_new_tokens=100)
#         peft_output_ids = peft_model.generate(peft_input_ids.to('cuda'), max_new_tokens=100)
#         print(base_output_ids.shape, peft_output_ids.shape)
#         print("Base model: ", base_tokenizer.decode(base_output_ids[0][base_input_ids.shape[1]:], skip_special_tokens=True))
#         print("Fine-tuned: ", tokenizer.decode(peft_output_ids[0][peft_input_ids.shape[1]:], skip_special_tokens=True))


In [21]:
print(pd.DataFrame(trainer.state.log_history))

     loss  grad_norm  learning_rate  epoch  step  train_runtime  \
0  1.4801   0.750000       0.000667   0.03   100            NaN   
1  1.0638   0.558594       0.000333   0.06   200            NaN   
2  1.0345   0.507812       0.000000   0.09   300            NaN   
3     NaN        NaN            NaN   0.09   300        66.8764   

   train_samples_per_second  train_steps_per_second    total_flos  train_loss  
0                       NaN                     NaN           NaN         NaN  
1                       NaN                     NaN           NaN         NaN  
2                       NaN                     NaN           NaN         NaN  
3                    35.887                   4.486  8.621312e+15    1.192813  
