In [1]:
import os

import pandas as pd
import numpy as np

import torch
import nltk.translate.bleu_score as bleu


from modelscope import snapshot_download
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, TaskType



2024-03-14 11:07:50,871 - modelscope - INFO - PyTorch version 2.2.1 Found.
2024-03-14 11:07:50,874 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2024-03-14 11:07:50,967 - modelscope - INFO - Loading done! Current index file version is 1.13.1, with md5 b5a2c5fe01f7460b3e700a8ce7e6fc94 and a total number of 972 components indexed
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
REPO_DIRECTORY = r'/root/'
ABC_DICT_PATH = r'autodl-tmp/AIST4010-Cantonese-Translator-Data/ABC-Dict/abc_dict.csv'

def load_abc_dataset():
    abc_dict = pd.read_csv(REPO_DIRECTORY + ABC_DICT_PATH)
    abc_dataset = Dataset.from_pandas(abc_dict)
    return abc_dataset

abc_set = load_abc_dataset()
abc_shuffled_set = abc_set.shuffle(seed=42).train_test_split(test_size=0.1)
abc_train_set = abc_shuffled_set['train']
abc_test_set = abc_shuffled_set['test']
for (i, example) in enumerate(abc_train_set):
    print(example)
    if i == 5:
        break

{'en': 'He weighed eight pounds when he was born.', 'yue': '佢出世嗰陣有八磅重。'}
{'en': 'To be an insurance agent you have to rush around here and there.', 'yue': '做保險呢行，係咁周街走趯㗎喇。'}
{'en': "As for temporary work, it's available from time to time, so people doing it are living from hand to mouth.", 'yue': '臨時工，工作時有時冇，手搵口食。'}
{'en': 'He parts his hair on the left.', 'yue': '佢喺左邊挑頭路。'}
{'en': "It's quite hard to recover fully from this sort of chronic disease.", 'yue': '呢啲慢性病好難斷尾嘅。'}
{'en': 'I had drunk quite a lot of beer, and afterwards I started to hiccup.', 'yue': '我飲咗好多啤酒，然後開始打飽噎。'}


In [3]:
def count_dataset_tokens(dataset):
    en_count = 0
    yue_count = 0
    for example in dataset:
        en_count += len(example['en'])
        yue_count += len(example['yue'])
    return en_count, yue_count


counts = np.array(count_dataset_tokens(abc_train_set))
print(counts)
print(counts/len(abc_train_set))


[921005 191992]
[70.70512821 14.73913711]


In [4]:
model_path=r'/root/autodl-tmp/01ai/Yi-6B-Chat'

# model = Model.from_pretrained('01ai/Yi-6B')

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     torch_dtype='auto'
# ).eval()


# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
model_dir = snapshot_download('01ai/Yi-6B-Chat', cache_dir='/root/autodl-tmp', revision='master')

In [6]:
base_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, padding_side='left', max_length=512, return_tensors='pt')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


# Since transformers 4.35.0, the GPT-Q/AWQ model can be loaded using AutoModelForCausalLM.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    torch_dtype='auto',
)


# # Prompt content: "hi"
# messages = [
#     {"role": "user", "content": "hi"}
# ]


# input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
# output_ids = model.generate(input_ids.to('cuda'))
# response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

# # Model response: "Hello! How can I assist you today?"
# print(response)

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.17it/s]


In [28]:
messages = [
    {"role": "user", "content": "hi"},
]

input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
output_ids = base_model.generate(input_ids.to('cuda'))
response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)

# Model response: "Hello! How can I assist you today?"
print(response)

KeyboardInterrupt: 

In [8]:
# print(input_ids)
# print(output_ids)
# print(base_tokenizer.decode(input_ids[0]))
# print(base_tokenizer.decode(input_ids[0]))

# #get text of list of tokens in output_ids stored in array
# print([base_tokenizer.decode([token]) for token in output_ids[0]])

In [9]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['en'])):
        text1 = f"""
        <|im_start|> user
        Translate the following words into Cantonese: 
        {example['en'][i]}
        <|im_start|>assistant
        {example['yue'][i]}
        """
        text2 = f"""
        <|im_start|> user
        Translate the following words into English:
        {example['yue'][i]}
        <|im_start|>assistant
        {example['en'][i]}
        """
        output_texts.append(text1)
        output_texts.append(text2)
    return output_texts

In [10]:
prompts = formatting_prompts_func(abc_set[:10])
for prompt in prompts:
    print(prompt)


        <|im_start|> user
        Translate the following words into Cantonese: 
        Scoop up water
        <|im_start|>assistant
        㧾水
        

        <|im_start|> user
        Translate the following words into English:
        㧾水
        <|im_start|>assistant
        Scoop up water
        

        <|im_start|> user
        Translate the following words into Cantonese: 
        Ladle out soup
        <|im_start|>assistant
        㧾湯
        

        <|im_start|> user
        Translate the following words into English:
        㧾湯
        <|im_start|>assistant
        Ladle out soup
        

        <|im_start|> user
        Translate the following words into Cantonese: 
        Third son of a rich family
        <|im_start|>assistant
        三少
        

        <|im_start|> user
        Translate the following words into English:
        三少
        <|im_start|>assistant
        Third son of a rich family
        

        <|im_start|> user
        Translate the follow

In [11]:
# for name, param in base_model.named_parameters():
#     print(f"Parameter name: {name}")
#     print(param)
#     print("-" * 50)

In [12]:
print(base_model.config)

LlamaConfig {
  "_name_or_path": "/root/autodl-tmp/01ai/Yi-6B-Chat",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 5000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 64000
}



In [13]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules = ["k_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
peft_model = get_peft_model(base_model, 
                            lora_config)

peft_model.print_trainable_parameters()

trainable params: 17,825,792 || all params: 6,078,861,312 || trainable%: 0.293242288071467


**Train Tokenizer**

In [14]:
def get_training_corpus(dataset):
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        sample_en = samples["en"]
        sample_yue = samples["yue"]
        for i in range(len(sample_en)):
            yield sample_en[i]
            yield sample_yue[i]

training_corpus = get_training_corpus(abc_train_set)

tokenizer = base_tokenizer.train_new_from_iterator(training_corpus, vocab_size=40000)
tokenizer.save_pretrained("tokenizer")





You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers





('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/tokenizer.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [23]:
print(tokenizer("嗌呃畀啲嘢噃"))
print(base_tokenizer("嗌呃畀啲嘢噃"))
print(tokenizer("Good morning"))
print(base_tokenizer("Good morning"))

{'input_ids': [89, 633, 503, 1930, 4012, 668], 'attention_mask': [1, 1, 1, 1, 1, 1]}
{'input_ids': [59568, 534, 456, 445, 534, 450, 436, 536, 454, 433, 534, 454, 483, 534, 457, 467, 534, 458, 436], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [19948, 4830], 'attention_mask': [1, 1]}
{'input_ids': [6076, 4040], 'attention_mask': [1, 1]}


In [16]:
# bleu = evaluate.load('bleu')

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    print(predictions.shape, labels.shape)
    return {"bleu": bleu(predictions, labels)}

In [17]:
training_args = TrainingArguments(
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=100,
    output_dir="root/peft_model"
)

trainer = SFTTrainer(
    peft_model,
    args=training_args,
    train_dataset= abc_train_set,
    eval_dataset= abc_test_set,
    tokenizer=tokenizer,
    formatting_func=formatting_prompts_func,
    compute_metrics=compute_metrics,
)
trainer.train()

Map: 100%|██████████| 13026/13026 [00:01<00:00, 9211.62 examples/s]
Map: 100%|██████████| 1448/1448 [00:00<00:00, 9622.02 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
100,2.7536
200,2.4125
300,2.5338
400,2.4516
500,2.4502
600,2.4755
700,2.5253
800,2.5643
900,2.918
1000,2.4711


Checkpoint destination directory root/peft_model/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory root/peft_model/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory root/peft_model/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory root/peft_model/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory root/peft_model/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory root/peft_model/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=3257, training_loss=2.4508797795193527, metrics={'train_runtime': 725.1774, 'train_samples_per_second': 35.925, 'train_steps_per_second': 4.491, 'total_flos': 9.450499724161843e+16, 'train_loss': 2.4508797795193527, 'epoch': 1.0})

In [18]:
trainer.model.save_pretrained("/root/peft_model")



In [25]:
#get random data from test dataset
for i in range(5):
    example = abc_test_set[i]
    print(example)
    text1 = f"""Translate the following words into Cantonese: 
        {example['en']}
        """
    text2 = f"""Translate the following words into English:
        {example['yue']}
        """
    texts = [text1, text2]
    for text in texts:
        messages = [
            {"role": "user", "content": text}
        ]
        print(messages)
        #print model outputs for base_model and peft_model
        base_input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
        peft_input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
        print("Base ID:", base_input_ids)
        print("Base Input:", base_tokenizer.decode(base_input_ids[base_input_ids.shape[1]:], skip_special_tokens=True))
        print("PEFT ID:", peft_input_ids)
        print("PEFT Input:", tokenizer.decode(peft_input_ids[peft_input_ids.shape[1]:], skip_special_tokens=True))
        print(peft_input_ids)
        base_output_ids = base_model.generate(base_input_ids.to('cuda'), max_new_tokens=100)
        peft_output_ids = peft_model.generate(peft_input_ids.to('cuda'), max_new_tokens=100)
        print(base_output_ids.shape, peft_output_ids.shape)
        print("Base model: ", base_tokenizer.decode(base_output_ids[0][base_input_ids.shape[1]:], skip_special_tokens=True))
        print("Fine-tuned: ", tokenizer.decode(peft_output_ids[0][peft_input_ids.shape[1]:], skip_special_tokens=True))


{'en': 'Today they are going to the cemetery to dig up the dead body and collect the remaining bones so they can prepare the urn for storing the bones.', 'yue': '今日佢哋去墳場執金準備做金塔。'}
[{'role': 'user', 'content': 'Translate the following words into Cantonese: \n        Today they are going to the cemetery to dig up the dead body and collect the remaining bones so they can prepare the urn for storing the bones.\n        '}]
Base ID: tensor([[    6,  2942,   144,  7759, 14429,   567,  1926,  3151,  1029, 26212,
          2823, 59569, 59601, 59568,   144,   135, 25585,   838,   678,  1450,
           592,   567, 43944,   592,  3477,   828,   567,  4331,  2534,   597,
          3857,   567,  7444, 19794,   810,   838,   748,  9685,   567, 59568,
           985,   631, 26086,   567, 19794,    98,   144,   135,     7, 59568,
           144,     6, 14135,   144]])
Base Input: 
PEFT ID: tensor([[    3,  5103,  3328,     0,    50,  7989,    77,  3862,  3531,  7307,
         26346, 11545, 27187, 175

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


torch.Size([1, 154]) torch.Size([1, 167])
Base model:  B２２２２２２２２<|Human|>２ distribution distribution distributionuge２２２２２２ in base waysles P and
 Min в me２perature２２４２２２２２２２２２２２2２２ B E hon rec ways<filename>4 himself４ount rec honount２perature<commit_after>２２２２２２２２２２２２２２ount Min A２ ways B and<filename>４ himself A
 A２ still of
Fine-tuned:         The 好嘅嘍好.                       u                                                               
[{'role': 'user', 'content': 'Translate the following words into English:\n        今日佢哋去墳場執金準備做金塔。\n        '}]
Base ID: tensor([[    6,  2942,   144,  7759, 14429,   567,  1926,  3151,  1029,  4750,
         59601,   144,   135, 10721,   533,   494,   467,   534,   452,   444,
         59793,   534,   467,   484, 61082, 62445, 59850, 39976, 59842, 59850,
         60949,   102,   144,   135,     7, 59568,   144,     6, 14135,   144]])
Base Input: 
PEFT ID: tensor([[    3,  5103,  3328,     0,    50,  7989,    77,  3862,  3531,  7307,
         26346, 

KeyboardInterrupt: 