In [1]:
import os
import gc

import pandas as pd
import numpy as np

import torch

import bitsandbytes
from modelscope import snapshot_download
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import BitsAndBytesConfig
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, TaskType

from tqdm import tqdm

from custom_tokenizers import YueTokenizer

2024-03-31 21:53:07,967 - modelscope - INFO - PyTorch version 2.2.1 Found.
2024-03-31 21:53:07,969 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2024-03-31 21:53:08,020 - modelscope - INFO - Loading done! Current index file version is 1.13.1, with md5 b5a2c5fe01f7460b3e700a8ce7e6fc94 and a total number of 972 components indexed
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.is_available()

True

In [3]:
DATA_DIRECTORY = r'/root/autodl-tmp/AIST4010-Cantonese-Translator-Data/'

def load_cantonese_wiki():
    wiki_lines = []
    def load_cantonese_wiki_file(filename='wiki_00'):
        with open(os.path.join(DATA_DIRECTORY, 'Cantonese-Wiki/text', filename), 'r') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]
            lines = [line for line in lines if len(line) > 0]
            lines = [[line[i:i+500] for i in range(0, len(line), 500)] for line in lines]
            lines = [line for sublist in lines for line in sublist]
            return lines
        
    for file in os.listdir(os.path.join(DATA_DIRECTORY, 'Cantonese-Wiki/text')):
        curr_lines = load_cantonese_wiki_file(file)
        wiki_lines.extend(curr_lines)
    
    return wiki_lines

def load_openrice_reviews():
    with open(os.path.join(DATA_DIRECTORY, 'openrice/openrice.txt'), 'r') as f:
        lines = f.readlines()
        lines = [line.strip() for line in lines]
        lines = [line for line in lines if len(line) > 0]
        lines = [[line[i:i+500] for i in range(0, len(line), 500)] for line in lines]
        lines = [line for sublist in lines for line in sublist]
        return lines

yue_wiki_lines = load_cantonese_wiki()
openrice_lines = load_openrice_reviews()

print(len(yue_wiki_lines))
print(len(openrice_lines))

mono_dataset = Dataset.from_dict({
    'text': yue_wiki_lines + openrice_lines
})

print(len(mono_dataset))

#print mean sentence length
sentence_lengths = [len(sentence) for sentence in mono_dataset['text']]
print(np.mean(sentence_lengths))
print(np.sum(sentence_lengths))
print(np.max(sentence_lengths))


602153
14736
616889
48.799967255049125
30104163
500


In [4]:
model_path=r'/root/autodl-tmp/01ai/Yi-6B-Chat'
model_dir = snapshot_download('01ai/Yi-6B-Chat', cache_dir='/root/autodl-tmp', revision='master')

In [5]:
base_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, padding_side='right', max_length=512, return_tensors='pt')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


# Since transformers 4.35.0, the GPT-Q/AWQ model can be loaded using AutoModelForCausalLM.
base_model = AutoModelForCausalLM.from_pretrained(
	 '/root/autodl-tmp/01ai/Yi-6B-Chat',
	 device_map=device,
	 torch_dtype=torch.bfloat16,
    #  quantization_config=BitsAndBytesConfig(load_in_8bit=True),
	#  trust_remote_code=True 
)


# # Prompt content: "hi"
# messages = [
#     {"role": "user", "content": "hi"}
# ]


# input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
# output_ids = model.generate(input_ids.to('cuda'))
# response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

# # Model response: "Hello! How can I assist you today?"
# print(response)

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]


In [6]:
print(device)

cuda


In [7]:
# Prompt content: "hi"
messages = [
    {"role": "user", "content": "hi"}
]


input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
output_ids = base_model.generate(input_ids.to('cuda'))
response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

# Model response: "Hello! How can I assist you today?"
print(response)

Hello! How can I assist you today?


In [8]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules = ["k_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
peft_model = get_peft_model(base_model, 
                            lora_config)
peft_model = peft_model.to(device)

peft_model.print_trainable_parameters()

trainable params: 17,825,792 || all params: 6,078,861,312 || trainable%: 0.293242288071467


In [9]:
tokenizer = YueTokenizer.from_pretrained(model_path, use_fast=True, padding_side='right', max_length=512, return_tensors='pt')

print(len(tokenizer.get_vocab()))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LlamaTokenizer'. 
The class this function is called from is 'YueTokenizer'.


77969


In [10]:
print(tokenizer.tokenize("嗌 呃 畀 啲 嘢 噃"))
print(base_tokenizer.tokenize("嗌 呃 畀 啲 嘢 噃"))
print(tokenizer.tokenize("嗌呃畀啲嘢噃"))
print(base_tokenizer.tokenize("嗌呃畀啲嘢噃"))
print(tokenizer.tokenize("中文字"))
print(base_tokenizer.tokenize("中文字"))
print(tokenizer("Good morning")['input_ids'])
print(base_tokenizer("Good morning")['input_ids'])

['▁嗌', '▁呃', '▁畀', '▁啲', '▁嘢', '▁噃']
['▁', '<0xE5>', '<0x97>', '<0x8C>', '▁', '<0xE5>', '<0x91>', '<0x83>', '▁', '<0xE7>', '<0x95>', '<0x80>', '▁', '<0xE5>', '<0x95>', '<0xB2>', '▁', '<0xE5>', '<0x98>', '<0xA2>', '▁', '<0xE5>', '<0x99>', '<0x83>']
['▁嗌', '▁呃', '▁畀', '▁啲嘢', '▁噃']
['▁', '<0xE5>', '<0x97>', '<0x8C>', '<0xE5>', '<0x91>', '<0x83>', '<0xE7>', '<0x95>', '<0x80>', '<0xE5>', '<0x95>', '<0xB2>', '<0xE5>', '<0x98>', '<0xA2>', '<0xE5>', '<0x99>', '<0x83>']
['▁', '中文', '▁', '字']
['▁中', '文字']
[6076, 4040]
[6076, 4040]


In [11]:
def formatting_prompts_func(examples):
    output_texts = []
    for i, example in enumerate(examples['text']):
        if example.strip() == '' or len(example) <= 1:
            continue
        example_len = len(example)
        random_split = np.random.randint(0.3*example_len, 0.7*example_len)
        random_split = max(min(random_split, example_len-1), 1)
        split1 = example[:random_split]
        split2 = example[random_split:]
        text = f"""<|im_start|> user
        Complete the following text: {split1} <|im_end|> 
        <|im_start|> assistant
        {split2} <|im_end|>"""
        output_texts.append(text)
    return output_texts

In [12]:
prompts = formatting_prompts_func(mono_dataset[:10])
for prompt in prompts:
    print(prompt)

<|im_start|> user
        Complete the following text: 香 <|im_end|> 
        <|im_start|> assistant
        港 <|im_end|>
<|im_start|> user
        Complete the following text: （）係華南一城埠，同時都係一個國際大都會，同紐約、倫敦合稱「紐倫港」，全名中華人民共和國香港特別行政區（；縮寫：或）。香港開埠於1841年，有人叫佢做「百年之城」，曾經係英國嘅殖民地，1997年7月1號開始由中華人民共和國接管，成立特別行政區。按照《中英聯合聲明》同埋《香港基本法》，香港享有高度自治，除咗外交同防務之外，理論上係全面自治，香港亦都有參與國際組織嘅權利。香 <|im_end|> 
        <|im_start|> assistant
        港實行資本主義制度，有別於中國大陸嘅社會主義體制，所以官方稱之為一國兩制（）。但隨住中國共產黨喺習近平上台之後加強對香港嘅操控，2020年更加強行通過港版國安法同顛覆香港選舉制度，大肆侵犯人權、消滅新聞自由、迫害政治異見人士，令一國兩制名存實亡，更加掀起新一波移民潮。 <|im_end|>
<|im_start|> user
        Complete the following text: 香港三面環海，東面同南面係南中國海，西面係珠江口同零丁洋；東北面係大鵬灣，北面同中國大陸隔住條深圳河。香港有唔少島散佈東南西，因為瀕海、水路發達，所以係太平洋到印度洋同埋各國貨船嘅必經之路。空運亦都係南 <|im_end|> 
        <|im_start|> assistant
        洋同中國嘅樞紐，而且全球每日都有唔少飛機途經香港。由於香港鄰近廣東省、澳門兩地，因此又合稱「省港澳」或「粵港澳」。 <|im_end|>
<|im_start|> user
        Complete the following text: 香港原本係一個人煙稀少嘅農村同漁村，不過而家已成為世界上最重要嘅金融中心同商業港口之一，重係世界十大出口國同埋第九大進口國。香港擁有低稅率同自由貿易為特徵嘅資本主義經濟體系，港元係世界第八 <|im_end|> 
 

In [13]:
print(tokenizer.vocab)



In [14]:
# bleu = evaluate.load('bleu')

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    print(predictions.shape, labels.shape)
    return {"bleu": bleu(predictions, labels)}

In [15]:
peft_model.resize_token_embeddings(len(tokenizer))

Embedding(77969, 4096)

In [16]:
training_args = TrainingArguments(
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=3,
    logging_steps=100,
    output_dir="/root/peft_model",
    per_device_train_batch_size=1
)

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=mono_dataset,
    formatting_func=formatting_prompts_func,
    tokenizer=tokenizer,
    # data_collator=data_collator,
)
trainer.train()

Map:  40%|████      | 247000/616889 [00:11<00:18, 20377.71 examples/s]

Map: 100%|██████████| 616889/616889 [00:29<00:00, 21232.40 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
100,2.8996
200,2.1684
300,2.1918
400,2.0496
500,2.1015


Checkpoint destination directory /root/peft_model/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


KeyboardInterrupt: 

In [None]:
trainer.model.save_pretrained("/root/peft_model")



In [None]:
# #get random data from test dataset
# for i in range(5):
#     example = abc_test_set[i]
#     print(example)
#     text1 = f"""Translate the following words into Cantonese: 
#         {example['en']}
#         """
#     text2 = f"""Translate the following words into English:
#         {example['yue']}
#         """
#     texts = [text1, text2]
#     for text in texts:
#         messages = [
#             {"role": "user", "content": text}
#         ]
#         print(messages)
#         #print model outputs for base_model and peft_model
#         base_input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
#         peft_input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
#         print("Base ID:", base_input_ids)
#         print("Base Input:", base_tokenizer.decode(base_input_ids[base_input_ids.shape[1]:], skip_special_tokens=True))
#         print("PEFT ID:", peft_input_ids)
#         print("PEFT Input:", tokenizer.decode(peft_input_ids[peft_input_ids.shape[1]:], skip_special_tokens=True))
#         print(peft_input_ids)
#         base_output_ids = base_model.generate(base_input_ids.to('cuda'), max_new_tokens=100)
#         peft_output_ids = peft_model.generate(peft_input_ids.to('cuda'), max_new_tokens=100)
#         print(base_output_ids.shape, peft_output_ids.shape)
#         print("Base model: ", base_tokenizer.decode(base_output_ids[0][base_input_ids.shape[1]:], skip_special_tokens=True))
#         print("Fine-tuned: ", tokenizer.decode(peft_output_ids[0][peft_input_ids.shape[1]:], skip_special_tokens=True))


In [None]:
print(pd.DataFrame(trainer.state.log_history))

     loss  grad_norm  learning_rate  epoch  step  train_runtime  \
0  1.4801   0.750000       0.000667   0.03   100            NaN   
1  1.0638   0.558594       0.000333   0.06   200            NaN   
2  1.0345   0.507812       0.000000   0.09   300            NaN   
3     NaN        NaN            NaN   0.09   300        66.8764   

   train_samples_per_second  train_steps_per_second    total_flos  train_loss  
0                       NaN                     NaN           NaN         NaN  
1                       NaN                     NaN           NaN         NaN  
2                       NaN                     NaN           NaN         NaN  
3                    35.887                   4.486  8.621312e+15    1.192813  
