In [1]:
import os

import pandas as pd
import numpy as np
import torch

import bitsandbytes
import accelerate
from datasets import Dataset

from modelscope import snapshot_download
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments
from peft import PeftModel

from custom_tokenizers import YueTokenizer


  from .autonotebook import tqdm as notebook_tqdm
2024-04-05 12:50:57,539 - modelscope - INFO - PyTorch version 2.2.1 Found.
2024-04-05 12:50:57,541 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2024-04-05 12:50:57,574 - modelscope - INFO - Loading done! Current index file version is 1.13.1, with md5 b5a2c5fe01f7460b3e700a8ce7e6fc94 and a total number of 972 components indexed


In [2]:
model_path=r'/root/autodl-tmp/01ai/Yi-6B-Chat'
model_dir = snapshot_download('01ai/Yi-6B-Chat', cache_dir='/root/autodl-tmp', revision='master')

# base_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, padding_side='left', max_length=512, return_tensors='pt')

# base_model = AutoModelForCausalLM.from_pretrained(
# 	 model_path,
# 	 device_map='auto',
# 	 torch_dtype=torch.bfloat16,
# 	 trust_remote_code=True 
# ).eval()

In [3]:
messages = [
    {"role": "user", "content": "Translate the following words into English:\n你係邊個？"},
]

messages_plain = [
    """
    <|im_start|> user
    hi<|im_end|> 
    <|im_start|> assistant
    """
]



# input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
# output_ids = base_model.generate(input_ids.to('cuda'))
# # response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)
# response = base_tokenizer.decode(output_ids[0], skip_special_tokens=False, max_length=100)

# # Model response: "Hello! How can I assist you today?"
# print(response)

In [4]:
tokenizer = YueTokenizer.from_pretrained('/root/autodl-tmp/01ai/Yi-6B-Chat', use_fast=True, padding_side='left', max_length=512, return_tensors='pt')
base_tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/01ai/Yi-6B-Chat', use_fast=True, padding_side='left', max_length=512, return_tensors='pt')
print(len(tokenizer.vocab))
# tokenizer = AutoTokenizer.from_pretrained('/root/AIST4010-Cantonese-Translator/tokenizer', use_fast=True, padding_side='left', max_length=512, return_tensors='pt')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LlamaTokenizer'. 
The class this function is called from is 'YueTokenizer'.


77969


In [5]:
# model = PeftModel.from_pretrained(
#     model,
#    '/root/peft_model',
#     is_trainable=False
# )

In [6]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
base_model = AutoModelForCausalLM.from_pretrained(
	 '/root/autodl-tmp/01ai/Yi-6B-Chat',
	 device_map=device,
	 torch_dtype=torch.bfloat16,
     quantization_config=BitsAndBytesConfig(load_in_8bit=True),
	 trust_remote_code=True 
).eval()
model = AutoModelForCausalLM.from_pretrained(
	 model_path,
	 device_map=device,
	 torch_dtype=torch.bfloat16,
	 quantization_config=BitsAndBytesConfig(load_in_8bit=True),
	 trust_remote_code=True 
).eval()
model.resize_token_embeddings(len(tokenizer))
model.load_adapter('/root/autodl-tmp/peft_model_sft')

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.56s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


In [7]:
REPO_DIRECTORY = r'/root/'
ABC_DICT_PATH = r'autodl-tmp/AIST4010-Cantonese-Translator-Data/ABC-Dict/abc_dict.csv'

def load_abc_dataset():
    abc_dict = pd.read_csv(REPO_DIRECTORY + ABC_DICT_PATH)
    abc_dataset = Dataset.from_pandas(abc_dict)
    return abc_dataset

abc_set = load_abc_dataset()
abc_shuffled_set = abc_set.train_test_split(seed=42, test_size=0.1)
abc_train_set = abc_shuffled_set['train']
abc_test_set = abc_shuffled_set['test']

In [8]:

def get_messages(sample):
    lang_map = {'English': 'en', 'Cantonese': 'yue'}
    def get_prompt(src, tgt, sample):
        system_prompt = f"Translate the given {src} words to {tgt}."
        user_prompt = sample[lang_map[src]]
        return system_prompt, user_prompt
    system1, user1 = get_prompt('English', 'Cantonese', sample)
    system2, user2 = get_prompt('Cantonese', 'English', sample)
    return [[
        {
            "role": "system",
            "content": system1
        },
        {
            "role": "user",
            "content": user1
        }],
        [
        {
            "role": "system",
            "content": system2
        },
        {
            "role": "user",
            "content": user2
        }]
    ]
    

train_samples = abc_train_set.shuffle(seed=10).select(range(20))
test_samples = abc_test_set.shuffle(seed=10).select(range(20))

get_messages(train_samples[0])

[[{'role': 'system',
   'content': 'Translate the given English words to Cantonese.'},
  {'role': 'user',
   'content': "Today my classmate wasn't careful, and kicked my left nut. It was very painful!"}],
 [{'role': 'system',
   'content': 'Translate the given Cantonese words to English.'},
  {'role': 'user', 'content': '今日我同學唔小心，踢中我左面粒蛋蛋，好痛啊！'}]]

In [9]:
def model_output(model, tokenizer, messages, name=None):
        for prompt in messages:
                input_ids = tokenizer.apply_chat_template(conversation=prompt, tokenize=True, add_generation_prompt=True, return_tensors='pt')
                with torch.cuda.amp.autocast():
                        output_ids = model.generate(input_ids.to('cuda'), max_new_tokens=100)
                response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)
                # response = tokenizer.decode(output_ids[0], skip_special_tokens=False, max_length=100)
                print(output_ids)
                print(f"{name}:\n{response}\n")


print([model_output(base_model, base_tokenizer, get_messages(train_samples[0]), 'Base model'),])



tensor([[    6,  1328,   144,  7759, 14429,   567,  1858,  4750,  3151,   592,
         26212,  2823, 59569,    98,     7, 59568,   144,     6,  2942,   144,
         25585,   826,  1225, 13580,  3613, 59610, 59570, 11369,    97,   597,
         19591,   826,  1999,  6230,    98,   983,   717,  1196, 16465,    99,
             7, 59568,   144,     6, 14135,   144, 10721, 59646, 60588, 54274,
           534,   453,   453, 13272,   101, 62328, 40994, 60588, 60488, 62295,
           536,   462,   495, 62278,   101, 59706, 60544, 61459,   103,     7]],
       device='cuda:0')
Base model:
今日我個同學唔小心，踢到我個左邊睾丸，好痛呀！

tensor([[    6,  1328,   144,  7759, 14429,   567,  1858, 26212,  2823, 59569,
          3151,   592,  4750,    98,     7, 59568,   144,     6,  2942,   144,
         10721, 59646, 54274,   534,   453,   453, 13272,   101, 62328, 59642,
         59646, 60488, 59724, 61329, 60751, 60751,   101, 59706, 60544, 60530,
           103,     7, 59568,   144,     6, 14135,   144, 25585,   8

In [10]:
def compare_outputs(samples):
    for sample in samples:
        print(sample)
        prompts = get_messages(sample)
        for prompt in prompts:
            print(f"Prompt:\n{prompt[0]['content']}")
            print()
            model_output(base_model, base_tokenizer, [prompt], 'Base model')
            model_output(model, tokenizer, [prompt], 'Fine-tuned model')

# compare_outputs(train_samples)

In [11]:
model_output(model, tokenizer, get_messages(train_samples[0]), 'Fine-tuned model')

tensor([[    6,  1328,   144,  7759, 14429,   567,  1858,  4750, 72032,  1994,
           592, 26212,  2823, 59569,    98,     7, 59568,   144,     6,  2942,
           144, 25585,   826,  1225, 13580,  3613, 59610, 59570, 11369,    97,
           597, 19591,   826,  1999,  6230,    98,   983,   717,  1196, 16465,
            99,     7, 59568,   144,     6, 14135,   144, 10721, 59646,   534,
           454,   483, 65228, 68360, 67373, 67644, 67373, 76988,     7]],
       device='cuda:0')
Fine-tuned model:
今日我啲

tensor([[    6,  1328,   144,  7759, 14429,   567,  1858, 26212,  2823, 59569,
         72032,  1994,   592,  4750,    98,     7, 59568,   144,     6,  2942,
           144, 10721, 59646, 54274,   534,   453,   453, 13272,   101, 62328,
         59642, 59646, 60488, 59724, 61329, 60751, 60751,   101, 59706, 60544,
         60530,   103,     7, 59568,   144,     6, 14135,   144, 25585,   826,
          1225, 13580, 24870, 19591,   795,   594,   567,  1999,  1682,  1935,
         

In [12]:
compare_outputs(train_samples)


{'en': "Today my classmate wasn't careful, and kicked my left nut. It was very painful!", 'yue': '今日我同學唔小心，踢中我左面粒蛋蛋，好痛啊！'}
Prompt:
Translate the given English words to Cantonese.

tensor([[    6,  1328,   144,  7759, 14429,   567,  1858,  4750,  3151,   592,
         26212,  2823, 59569,    98,     7, 59568,   144,     6,  2942,   144,
         25585,   826,  1225, 13580,  3613, 59610, 59570, 11369,    97,   597,
         19591,   826,  1999,  6230,    98,   983,   717,  1196, 16465,    99,
             7, 59568,   144,     6, 14135,   144, 10721, 54274, 30369,   101,
         62328, 59642, 60488, 62295, 59599, 60751,   101, 59706, 60544, 61459,
           103,     7]], device='cuda:0')
Base model:
今日同學不小心，踢中左邊的蛋，好痛呀！

tensor([[    6,  1328,   144,  7759, 14429,   567,  1858,  4750, 72032,  1994,
           592, 26212,  2823, 59569,    98,     7, 59568,   144,     6,  2942,
           144, 25585,   826,  1225, 13580,  3613, 59610, 59570, 11369,    97,
           597, 19591,   826,  199

In [None]:
compare_outputs(test_samples)

{'en': 'Miss Linda.', 'yue': 'Linda姐。'}
Prompt:
Translate the following words from Cantonese to English:
Linda姐。



Base model:
Linda姐.

SFT model:
Linda, or Miss Linda. 

Prompt:
Translate the following words from English to Cantonese:
Miss Linda.

Base model:
Miss Linda.

SFT model:
李生。 

{'en': 'What is he doing in the kitchen to make such banging and clanging noise?', 'yue': '佢喺廚房度打得咁𠽤叻𡃈嘞做乜嘢啊？'}
Prompt:
Translate the following words from Cantonese to English:
佢喺廚房度打得咁𠽤叻𡃈嘞做乜嘢啊？

Base model:
"佢喺廚房度打得咁𠽤叻𡃈嘞做乜嘢啊？" 可以翻译为 "他在厨房里打得这么好啊？" 或者 "他在厨房里打得这么好吗？" 其中，"打得咁𠽤叻𡃈嘞" 表示打得很厉害，"做乜嘢啊？" 表示在做些什么。

SFT model:
What is he doing in the kitchen looking so engrossed in his work? 

Prompt:
Translate the following words from English to Cantonese:
What is he doing in the kitchen to make such banging and clanging noise?

Base model:
他在廚房裡做什麼會發出這麼大的敲擊和碰撞噪音？

SFT model:
佢喺廚房咁多嘈? 

{'en': 'How high do you think the possibility of success for this matter will be?', 'yue': '計你話呢件事嘅成數會有幾高呢？'}
Prompt:
Translate the following words from Cantonese to English:
計你話呢件事嘅成數會有幾高呢？

Base model:
Translation from Cantonese to English:

In [None]:
messages = [
    {"role": "user", "content": "Translate the following words into English:\n乜嘢都係波士決定嘅，打工仔啲人淨係得個知字。\n"},
]

# get 5 random samples from train and test dataset
train_sample = abc_train_set.shuffle(seed=42).select(range(5))
test_sample = abc_test_set.shuffle(seed=42).select(range(5))

en_train_messages = {get_translate_prompt('Cantonese', sentence) for sentence in train_sample['en']}
en_test_messages = {get_translate_prompt('Cantonese', sentence) for sentence in test_sample['en']}
yue_train_messages = {get_translate_prompt('English', sentence) for sentence in train_sample['yue']}
yue_test_messages = {get_translate_prompt('English', sentence) for sentence in test_sample['yue']}

for messages in [en_train_messages, en_test_messages, yue_train_messages, yue_test_messages]:
    for message in messages:
        print(message)


TypeError: get_translate_prompt() missing 1 required positional argument: 'source_text'

In [None]:
input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
output_ids = model.generate(input_ids.to('cuda'))
# response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)
response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)

# Model response: "Hello! How can I assist you today?"
print("Tuned model:", response)

input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
output_ids = base_model.generate(input_ids.to('cuda'))
# response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)
response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)

print("Base model:", response)

UndefinedError: 'str object' has no attribute 'role'