In [1]:
import os

import pandas as pd
import numpy as np
import torch

import bitsandbytes
import accelerate
from datasets import Dataset

from modelscope import snapshot_download
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments
from peft import PeftModel


  from .autonotebook import tqdm as notebook_tqdm
2024-04-18 11:20:21,056 - modelscope - INFO - PyTorch version 2.2.1 Found.
2024-04-18 11:20:21,058 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2024-04-18 11:20:21,098 - modelscope - INFO - Loading done! Current index file version is 1.13.1, with md5 b5a2c5fe01f7460b3e700a8ce7e6fc94 and a total number of 972 components indexed


In [2]:
# Model identifier, defined once so the download below and every later
# `from_pretrained(model_path, ...)` call refer to the same checkpoint.
model_path = r'01ai/Yi-6B-Chat'
# The current working directory doubles as the download/cache root.
curr_dir = os.getcwd()
# Fetch the `master` revision through ModelScope; the call's result is kept in `model_dir`.
# NOTE(review): later cells load by `model_path` with `cache_dir=curr_dir` and
# `local_files_only=True` rather than by `model_dir` -- presumably this relies on
# the files landing under `<curr_dir>/01ai/Yi-6B-Chat`; confirm before changing
# the working directory or the cache location.
model_dir = snapshot_download(model_path, cache_dir=curr_dir, revision='master')

# base_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, padding_side='left', max_length=512, return_tensors='pt')

# base_model = AutoModelForCausalLM.from_pretrained(
# 	 model_path,
# 	 device_map='auto',
# 	 torch_dtype=torch.bfloat16,
# 	 trust_remote_code=True 
# ).eval()

In [3]:
# Single-turn chat request in role/content form.
messages = [
    dict(
        role="user",
        content="Translate the following words into English:\n你係邊個？",
    ),
]

# Hand-written prompt with the chat markers spelled out. The whitespace is part
# of the string (leading newline, four-space indents, the space after
# <|im_end|>, and the trailing indent), so it is written out explicitly here.
messages_plain = [
    "\n"
    "    <|im_start|> user\n"
    "    hi<|im_end|> \n"
    "    <|im_start|> assistant\n"
    "    "
]



# input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
# output_ids = base_model.generate(input_ids.to('cuda'))
# # response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True, max_length=100)
# response = base_tokenizer.decode(output_ids[0], skip_special_tokens=False, max_length=100)

# # Model response: "Hello! How can I assist you today?"
# print(response)

In [4]:
# Options shared by both tokenizer loads: read only from the local cache under
# `curr_dir`, use the fast implementation, and pad on the right.
_tokenizer_kwargs = dict(
    cache_dir=curr_dir,
    local_files_only=True,
    use_fast=True,
    padding_side='right',
    max_length=512,
    return_tensors='pt',
)

# Two separate instances are created on purpose: one is paired with the
# fine-tuned model, the other with the untouched base model.
tokenizer = AutoTokenizer.from_pretrained(model_path, **_tokenizer_kwargs)
base_tokenizer = AutoTokenizer.from_pretrained(model_path, **_tokenizer_kwargs)

KeyboardInterrupt: 

In [None]:
# model = PeftModel.from_pretrained(
#     model,
#    '/root/peft_model',
#     is_trainable=False
# )

In [None]:
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
# NOTE(review): the 8-bit bitsandbytes load below presumably needs CUDA, so the
# CPU fallback may not actually work -- confirm.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def _load_quantized_model(name_or_path):
    """Load a causal LM from `name_or_path` on `device` with 8-bit quantization.

    A fresh `BitsAndBytesConfig` is built per call so the two models loaded
    below do not share a config object.
    """
    return AutoModelForCausalLM.from_pretrained(
        name_or_path,
        device_map=device,
        torch_dtype=torch.bfloat16,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        trust_remote_code=True,
    )


# Reference model: the unmodified checkpoint, in inference mode.
base_model = _load_quantized_model(model_path).eval()

# Fine-tuned model: same checkpoint, embeddings resized to the tokenizer's
# vocabulary, then the SFT adapter attached on top.
model = _load_quantized_model(model_path)
model.resize_token_embeddings(len(tokenizer))
model.load_adapter('/root/autodl-tmp/peft_model_sft')
# Switch to inference mode only after the adapter is attached, so that modules
# added by `load_adapter` are covered as well (an earlier `.eval()` cannot
# affect modules that did not exist yet).
model.eval()

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.49s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it]


In [None]:
# Directory that contains the data repository; the empty string means
# "relative to the current working directory".
REPO_DIRECTORY = r''
# Location of the ABC dictionary CSV inside the data repository.
ABC_DICT_PATH = r'AIST4010-Cantonese-Translator-Data/ABC-Dict/abc_dict.csv'

def load_abc_dataset():
    """Read the ABC dictionary CSV and wrap it in a `datasets.Dataset`.

    The file is looked up at `ABC_DICT_PATH` under `REPO_DIRECTORY`. The two
    parts are combined with `os.path.join`, so `REPO_DIRECTORY` works with or
    without a trailing path separator (and an empty value leaves the relative
    path untouched).

    Returns:
        A `Dataset` built from the CSV's rows via `Dataset.from_pandas`.
    """
    csv_path = os.path.join(REPO_DIRECTORY, ABC_DICT_PATH)
    abc_dict = pd.read_csv(csv_path)
    return Dataset.from_pandas(abc_dict)

# Load the dictionary once, then hold out 10% of the rows as a test split.
# The seed is fixed so the split is the same on every run.
abc_set = load_abc_dataset()
abc_shuffled_set = abc_set.train_test_split(test_size=0.1, seed=42)
abc_train_set, abc_test_set = (abc_shuffled_set[part] for part in ('train', 'test'))

In [None]:

def get_messages(sample):
    """Build the two translation conversations for one sentence pair.

    Args:
        sample: mapping indexed with the keys 'en' and 'yue'.

    Returns:
        A list of two conversations, English->Cantonese first and
        Cantonese->English second. Each conversation is a list holding a
        system turn (the instruction) and a user turn (the source sentence).
    """
    column_for = {'English': 'en', 'Cantonese': 'yue'}
    directions = (('English', 'Cantonese'), ('Cantonese', 'English'))

    conversations = []
    for source, target in directions:
        instruction = f"Translate the given {source} words to {target}."
        sentence = sample[column_for[source]]
        conversations.append([
            {"role": "system", "content": instruction},
            {"role": "user", "content": sentence},
        ])
    return conversations
    

# Take the first 20 rows of each split after a seeded shuffle, so the preview
# set is small and the same on every run.
_preview_rows = range(20)
train_samples = abc_train_set.shuffle(seed=10).select(_preview_rows)
test_samples = abc_test_set.shuffle(seed=10).select(_preview_rows)

# Show the two conversations built for the first sampled training pair.
get_messages(train_samples[0])

[[{'role': 'system',
   'content': 'Translate the given English words to Cantonese.'},
  {'role': 'user',
   'content': "Today my classmate wasn't careful, and kicked my left nut. It was very painful!"}],
 [{'role': 'system',
   'content': 'Translate the given Cantonese words to English.'},
  {'role': 'user', 'content': '今日我同學唔小心，踢中我左面粒蛋蛋，好痛啊！'}]]

In [None]:
def model_output(model, tokenizer, messages, name=None):
    """Generate and print one reply per conversation in `messages`.

    Args:
        model: causal LM exposing `generate`. Inputs are moved to
            `model.device` when the attribute exists; otherwise to CUDA if
            available, else CPU.
        tokenizer: used both to render the chat template and to decode the
            reply, so the decoding always matches the model being queried.
        messages: iterable of conversations, each a list of role/content dicts.
        name: label printed above each reply.

    Returns:
        None. Each reply is printed as "<name>:\\n<reply>\\n".
    """
    device = getattr(model, 'device', None)
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    for prompt in messages:
        input_ids = tokenizer.apply_chat_template(
            conversation=prompt,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors='pt',
        )
        # CUDA autocast as before, but switched off when running on another device.
        with torch.autocast(device_type='cuda', enabled=device.type == 'cuda'):
            output_ids = model.generate(input_ids.to(device), max_new_tokens=100)
        # `generate` returns prompt + continuation; keep only the continuation.
        reply_ids = output_ids[0][input_ids.shape[1]:]
        # Reply length is already bounded by `max_new_tokens`; `decode` needs no length argument.
        response = tokenizer.decode(reply_ids, skip_special_tokens=True)
        print(f"{name}:\n{response}\n")


# `model_output` prints its replies and returns None, so it is called directly;
# wrapping it in `print([...])` only appended a stray `[None]` to the output.
model_output(base_model, base_tokenizer, get_messages(train_samples[0]), 'Base model')



tensor([[    6,  1328,   144,  7759, 14429,   567,  1858,  4750,  3151,   592,
         26212,  2823, 59569,    98,     7, 59568,   144,     6,  2942,   144,
         25585,   826,  1225, 13580,  3613, 59610, 59570, 11369,    97,   597,
         19591,   826,  1999,  6230,    98,   983,   717,  1196, 16465,    99,
             7, 59568,   144,     6, 14135,   144,  3569, 59646, 60588, 54274,
         59706,   534,   453,   453, 13272,   101, 62328, 40994, 60488, 62295,
         59599, 61847, 60751,   101, 59706, 60544, 61459,   103,     7]],
       device='cuda:0')
Base model:
今天我個同學好唔小心，踢到我左邊的卵蛋，好痛呀！

tensor([[    6,  1328,   144,  7759, 14429,   567,  1858, 26212,  2823, 59569,
          3151,   592,  4750,    98,     7, 59568,   144,     6,  2942,   144,
         10721, 59646, 54274,   534,   453,   453, 13272,   101, 62328, 59642,
         59646, 60488, 59724, 61329, 60751, 60751,   101, 59706, 60544, 60530,
           103,     7, 59568,   144,     6, 14135,   144, 25585,   826,  1

In [None]:
def compare_outputs(samples):
    """Print base-model and fine-tuned replies side by side for each sample.

    For every sample this prints the raw record, then for each conversation
    from `get_messages` prints the user turn followed by the reply of
    `base_model` and the reply of `model`.
    """
    for record in samples:
        print(record)
        for conversation in get_messages(record):
            # Index 1 is the user turn, i.e. the sentence to be translated.
            user_turn = conversation[1]
            print(f"Prompt:\n{user_turn['content']}")
            print()
            # `model_output` expects a list of conversations, hence the one-element list.
            single = [conversation]
            model_output(base_model, base_tokenizer, single, 'Base model')
            model_output(model, tokenizer, single, 'Fine-tuned model')

# compare_outputs(train_samples)

In [None]:
# Run the fine-tuned model on both conversations built from the first sampled training pair.
model_output(model, tokenizer, get_messages(train_samples[0]), 'Fine-tuned model')

tensor([[    6,  1328,   144,  7759, 14429,   567,  1858,  4750,  3151,   592,
         26212,  2823, 59569,    98,     7, 59568,   144,     6,  2942,   144,
         25585,   826,  1225, 13580,  3613, 59610, 59570, 11369,    97,   597,
         19591,   826,  1999,  6230,    98,   983,   717,  1196, 16465,    99,
             7, 59568,   144,     6, 14135,   144, 10721, 54274,   534,   453,
           453, 13272,   101, 62328, 59642, 59646, 60488,   537,   438,   476,
           102, 59706, 60544,   534,   449,   455,   103,     7]],
       device='cuda:0')
Fine-tuned model:
今日同學唔小心，踢中我左腫。好痛吖！

tensor([[    6,  1328,   144,  7759, 14429,   567,  1858, 26212,  2823, 59569,
          3151,   592,  4750,    98,     7, 59568,   144,     6,  2942,   144,
         10721, 59646, 54274,   534,   453,   453, 13272,   101, 62328, 59642,
         59646, 60488, 59724, 61329, 60751, 60751,   101, 59706, 60544, 60530,
           103,     7, 59568,   144,     6, 14135,   144, 25585,   826,  1225,
  

In [None]:
# Base vs fine-tuned replies on the sampled training pairs (data the adapter may have seen).
compare_outputs(train_samples)


{'en': "Today my classmate wasn't careful, and kicked my left nut. It was very painful!", 'yue': '今日我同學唔小心，踢中我左面粒蛋蛋，好痛啊！'}
Prompt:
Translate the given English words to Cantonese.



tensor([[    6,  1328,   144,  7759, 14429,   567,  1858,  4750,  3151,   592,
         26212,  2823, 59569,    98,     7, 59568,   144,     6,  2942,   144,
         25585,   826,  1225, 13580,  3613, 59610, 59570, 11369,    97,   597,
         19591,   826,  1999,  6230,    98,   983,   717,  1196, 16465,    99,
             7, 59568,   144,     6, 14135,   144, 10721, 59646, 60588, 54274,
         59706,   534,   453,   453, 13272,   101, 62328, 40994, 60588, 60488,
         62295,   536,   462,   495, 62278,   101, 59706, 60544, 61459,   103,
             7]], device='cuda:0')
Base model:
今日我個同學好唔小心，踢到我個左邊睾丸，好痛呀！

tensor([[    6,  1328,   144,  7759, 14429,   567,  1858,  4750,  3151,   592,
         26212,  2823, 59569,    98,     7, 59568,   144,     6,  2942,   144,
         25585,   826,  1225, 13580,  3613, 59610, 59570, 11369,    97,   597,
         19591,   826,  1999,  6230,    98,   983,   717,  1196, 16465,    99,
             7, 59568,   144,     6, 14135,   144, 10721, 

KeyboardInterrupt: 

In [None]:
# Base vs fine-tuned replies on the sampled held-out test pairs.
compare_outputs(test_samples)

{'en': 'Miss Linda.', 'yue': 'Linda姐。'}
Prompt:
Translate the following words from Cantonese to English:
Linda姐。



tensor([[    6,  2942,   144,  7759, 14429,   567,  1926,  3151,   742, 26212,
          2823, 59569,   592,  4750, 59601,   144, 59620, 27164, 60706,   102,
             7, 59568,   144,     6, 14135,   144, 59620, 27164, 60706,    98,
             7]], device='cuda:0')
Base model:
Linda姐.

tensor([[    6,  2942,   144,  7759, 14429,   567,  1926,  3151,   742, 26212,
          2823, 59569,   592,  4750, 59601,   144, 59620, 27164, 60706,   102,
             7, 59568,   144,     6, 14135,   144, 59620, 27164, 60706,   101,
         62376, 59967, 12666, 60521, 30916, 59568,   534,   457,   438, 11940,
           102,     7]], device='cuda:0')
Fine-tuned model:
Linda姐，係指一個叫 Linda 嘅女士。

Prompt:
Translate the following words from English to Cantonese:
Miss Linda.

tensor([[    6,  2942,   144,  7759, 14429,   567,  1926,  3151,   742,  4750,
           592, 26212,  2823, 59569, 59601,   144, 30874, 30916,    98,     7,
         59568,   144,     6, 14135,   144, 30874, 30916,    98,     7

In [None]:
# Conversation used by the generation cell that follows.
messages = [
    {"role": "user", "content": "Translate the following words into English:\n乜嘢都係波士決定嘅，打工仔啲人淨係得個知字。\n"},
]

# get 5 random samples from train and test dataset
train_sample = abc_train_set.shuffle(seed=42).select(range(5))
test_sample = abc_test_set.shuffle(seed=42).select(range(5))

# NOTE(review): `get_translate_prompt` is not defined in this part of the
# notebook, and the recorded run of this cell failed with "missing 1 required
# positional argument: 'source_text'". The call arity below must be checked
# against the helper's definition before this cell can run.
# Lists (not sets) keep the prompts in dataset order and do not require the
# helper's return value to be hashable.
en_train_messages = [get_translate_prompt('Cantonese', sentence) for sentence in train_sample['en']]
en_test_messages = [get_translate_prompt('Cantonese', sentence) for sentence in test_sample['en']]
yue_train_messages = [get_translate_prompt('English', sentence) for sentence in train_sample['yue']]
yue_test_messages = [get_translate_prompt('English', sentence) for sentence in test_sample['yue']]

# The outer loop variable is deliberately not called `messages`: reusing that
# name would overwrite the conversation defined at the top of this cell.
for message_group in [en_train_messages, en_test_messages, yue_train_messages, yue_test_messages]:
    for message in message_group:
        print(message)


TypeError: get_translate_prompt() missing 1 required positional argument: 'source_text'

In [None]:
# Generate a reply to `messages` with the fine-tuned model, then with the base
# model, using each model's own tokenizer. One loop replaces two copy-pasted
# halves, so both models are queried in exactly the same way.
for label, lm, tok in (
    ("Tuned model:", model, tokenizer),
    ("Base model:", base_model, base_tokenizer),
):
    input_ids = tok.apply_chat_template(
        conversation=messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors='pt',
    )
    # Send the prompt to the device the model lives on instead of assuming CUDA.
    # NOTE(review): no generation length is set here, so the model's default
    # limit applies -- confirm that is long enough for full translations.
    output_ids = lm.generate(input_ids.to(lm.device))
    # `generate` returns prompt + continuation; decode only the continuation.
    response = tok.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
    print(label, response)

UndefinedError: 'str object' has no attribute 'role'