In [12]:
import os

import pandas as pd
import numpy as np
import torch

import bitsandbytes
import accelerate
from datasets import Dataset

from modelscope import snapshot_download
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments
from peft import PeftModel


In [13]:
model_path=r'/root/autodl-tmp/01ai/Yi-6B-Chat'
model_dir = snapshot_download('01ai/Yi-6B-Chat', cache_dir='/root/autodl-tmp', revision='master')

# base_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, padding_side='left', max_length=512, return_tensors='pt')

# base_model = AutoModelForCausalLM.from_pretrained(
# 	 model_path,
# 	 device_map='auto',
# 	 torch_dtype=torch.bfloat16,
# 	 trust_remote_code=True 
# ).eval()

In [14]:
messages = [
    {"role": "user", "content": "Translate the following words into English:\n你係邊個？"},
]

messages_plain = [
    """
    <|im_start|> user
    hi<|im_end|> 
    <|im_start|> assistant
    """
]



# input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
# output_ids = base_model.generate(input_ids.to('cuda'))
# # response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)
# response = base_tokenizer.decode(output_ids[0], skip_special_tokens=False, max_length=100)

# # Model response: "Hello! How can I assist you today?"
# print(response)

In [15]:
tokenizer = AutoTokenizer.from_pretrained('/root/tokenizer', use_fast=True, padding_side='left', max_length=512, return_tensors='pt')
base_tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/01ai/Yi-6B-Chat', use_fast=True, padding_side='left', max_length=512, return_tensors='pt')
print(len(tokenizer.vocab))
# tokenizer = AutoTokenizer.from_pretrained('/root/AIST4010-Cantonese-Translator/tokenizer', use_fast=True, padding_side='left', max_length=512, return_tensors='pt')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


65167


In [16]:
# model = PeftModel.from_pretrained(
#     model,
#    '/root/peft_model',
#     is_trainable=False
# )

In [17]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
base_model = AutoModelForCausalLM.from_pretrained(
	 '/root/autodl-tmp/01ai/Yi-6B-Chat',
	 device_map=device,
	 torch_dtype=torch.bfloat16,
     quantization_config=BitsAndBytesConfig(load_in_8bit=True),
	 trust_remote_code=True 
).eval()
model = AutoModelForCausalLM.from_pretrained(
	 model_path,
	 device_map=device,
	 torch_dtype=torch.bfloat16,
	 quantization_config=BitsAndBytesConfig(load_in_8bit=True),
	 trust_remote_code=True 
).eval()
model.resize_token_embeddings(len(tokenizer))
model.load_adapter('/root/peft_model')

Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:03,  1.68s/it]

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.43s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.48s/it]


In [18]:
REPO_DIRECTORY = r'/root/'
ABC_DICT_PATH = r'autodl-tmp/AIST4010-Cantonese-Translator-Data/ABC-Dict/abc_dict.csv'

def load_abc_dataset():
    abc_dict = pd.read_csv(REPO_DIRECTORY + ABC_DICT_PATH)
    abc_dataset = Dataset.from_pandas(abc_dict)
    return abc_dataset

abc_set = load_abc_dataset()
abc_shuffled_set = abc_set.shuffle(seed=42).train_test_split(test_size=0.1)
abc_train_set = abc_shuffled_set['train']
abc_test_set = abc_shuffled_set['test']

In [19]:
def get_translate_prompt(target_language, source_text):
    return f"Translate the following words into {target_language}:\n{source_text}"

def get_messages(sample):
    def get_translate_prompt(target_language, source_text):
        return f"Translate the following words into {target_language}:\n{source_text}"
    return [
        [{"role": "user", "content": get_translate_prompt('English', sample['yue'])}],
        [{"role": "user", "content": get_translate_prompt('Cantonese', sample['en'])}]
    ]

train_samples = abc_train_set.shuffle(seed=10).select(range(5))
test_samples = abc_test_set.shuffle(seed=10).select(range(5))

get_messages(train_samples[0])

[[{'role': 'user',
   'content': 'Translate the following words into English:\n佢雖然係經理，但個個都當佢冇到。'}],
 [{'role': 'user',
   'content': 'Translate the following words into Cantonese:\nEven though he is the manager, he is looked down upon and ignored by everyone.'}]]

In [20]:
def model_output(model, tokenizer, messages, name=None):
        for prompt in messages:
                input_ids = tokenizer.apply_chat_template(conversation=prompt, tokenize=True, add_generation_prompt=True, return_tensors='pt')
                output_ids = model.generate(input_ids.to('cuda'))
                response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)
                # response = tokenizer.decode(output_ids[0], skip_special_tokens=False, max_length=100)
                print(f"{name}: {response}")


print([model_output(base_model, base_tokenizer, get_messages(train_samples[0]), 'Base model'),])



Base model: Although he is the manager, everyone treats him as if he were not there.
Base model: Even though he is the manager, he is looked down upon and ignored by everyone.
[None]


In [21]:
def compare_outputs(samples):
    for sample in samples:
        print(sample)
        prompts = get_messages(sample)
        for prompt in prompts:
            print(prompt)
            model_output(base_model, base_tokenizer, [prompt], 'Base model')
            model_output(model, tokenizer, [prompt], 'SFT model')

# compare_outputs(train_samples)

In [22]:
compare_outputs(train_samples)


{'en': 'Even though he is the manager, he is looked down upon and ignored by everyone.', 'yue': '佢雖然係經理，但個個都當佢冇到。'}
[{'role': 'user', 'content': 'Translate the following words into English:\n佢雖然係經理，但個個都當佢冇到。'}]
Base model: He is the manager, but everyone treats him as if he isn't there.
SFT model: assistant
Although he is a manager, no one respects him. 
[{'role': 'user', 'content': 'Translate the following words into Cantonese:\nEven though he is the manager, he is looked down upon and ignored by everyone.'}]
Base model: 即使他是经理，他还是被所有人看不起并被忽视。
SFT model: assistant
Even though he is the manager, he is looked down upon and ignored by everyone. 
{'en': "I should give you all my money? I'll give you nothing!", 'yue': '畀晒啲錢你？畀你個頭！'}
[{'role': 'user', 'content': 'Translate the following words into English:\n畀晒啲錢你？畀你個頭！'}]
Base model: "畀晒啲錢你？畀你個頭！" 可以翻译为 "Give all the money to you? Give you the head!" 或者 "Give all the money to you? Give you the money!" 这两种翻译都表示"给你所有的钱"，但是第一种翻译更直接，而第二种翻译则更礼貌。

In [23]:
compare_outputs(test_samples)

{'en': 'As soon as he sees a pretty girl, he feels totally infatuated with and attracted to her.', 'yue': '佢一見到靚女就暈晒大浪。'}
[{'role': 'user', 'content': 'Translate the following words into English:\n佢一見到靚女就暈晒大浪。'}]
Base model: He faints at the sight of a beautiful woman.
SFT model: assistant
He getshead 靓女就到。 
[{'role': 'user', 'content': 'Translate the following words into Cantonese:\nAs soon as he sees a pretty girl, he feels totally infatuated with and attracted to her.'}]
Base model: 當他一看到一個漂亮的女孩時，他就對她完全著迷，被她吸引。
SFT model: assistant
assistant
As soon as he sees a pretty girl, he's infatuated with and attracted to her. 
{'en': "Nowadays parents really spend lots of money for the sake of celebrating their children's birthdays.", 'yue': '而家父母真係好抌本去為仔女慶祝生日。'}
[{'role': 'user', 'content': 'Translate the following words into English:\n而家父母真係好抌本去為仔女慶祝生日。'}]
Base model: Currently, my parents are really happy to go all out to celebrate their children's birthdays.
SFT model: assistant
assistan

In [24]:
messages = [
    {"role": "user", "content": "Translate the following words into English:\n乜嘢都係波士決定嘅，打工仔啲人淨係得個知字。\n"},
]

# get 5 random samples from train and test dataset
train_sample = abc_train_set.shuffle(seed=42).select(range(5))
test_sample = abc_test_set.shuffle(seed=42).select(range(5))

en_train_messages = {get_translate_prompt('Cantonese', sentence) for sentence in train_sample['en']}
en_test_messages = {get_translate_prompt('Cantonese', sentence) for sentence in test_sample['en']}
yue_train_messages = {get_translate_prompt('English', sentence) for sentence in train_sample['yue']}
yue_test_messages = {get_translate_prompt('English', sentence) for sentence in test_sample['yue']}

for messages in [en_train_messages, en_test_messages, yue_train_messages, yue_test_messages]:
    for message in messages:
        print(message)


Translate the following words into Cantonese:
The hooker has just finished, and her client has gone to pay the money.
Translate the following words into Cantonese:
He's become so fat that he's split the seams of his shirt.
Translate the following words into Cantonese:
Some Hongkongers have gone abroad and experienced living there.
Translate the following words into Cantonese:
This guy keeps wandering around at the entrance of the bank and it doesn't seem quite right.
Translate the following words into Cantonese:
Truly I'm not cheating you.
Translate the following words into Cantonese:
His wife got sick, and now his friends feel very sympathetic toward him.
Translate the following words into Cantonese:
What did you say just now?
Translate the following words into Cantonese:
Hey punk kid! Are you thinking you can kill me?
Translate the following words into Cantonese:
Your way of doing things is really thoroughly ridiculous!
Translate the following words into Cantonese:
Of course Mom's ho

In [25]:
input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
output_ids = model.generate(input_ids.to('cuda'))
# response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)
response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)

# Model response: "Hello! How can I assist you today?"
print("Tuned model:", response)

input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
output_ids = base_model.generate(input_ids.to('cuda'))
# response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)
response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)

print("Base model:", response)

UndefinedError: 'str object' has no attribute 'role'