In [2]:
import os

import pandas as pd
import numpy as np

import torch


from modelscope import snapshot_download
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, TaskType



2024-03-14 08:42:43,091 - modelscope - INFO - PyTorch version 2.2.1 Found.
2024-03-14 08:42:43,093 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2024-03-14 08:42:43,126 - modelscope - INFO - Updating the files for the changes of local files, first time updating will take longer time! Please wait till updating done!
2024-03-14 08:42:43,128 - modelscope - INFO - AST-Scanning the path "/root/miniconda3/envs/trans/lib/python3.10/site-packages/modelscope" with the following sub folders ['models', 'metrics', 'pipelines', 'preprocessors', 'trainers', 'msdatasets', 'exporters']
2024-03-14 08:42:43,129 - modelscope - INFO - Scanning done! A number of 0 components indexed or updated! Time consumed 0.0010764598846435547s
2024-03-14 08:42:43,163 - modelscope - INFO - Loading done! Current index file version is 1.13.1, with md5 b5a2c5fe01f7460b3e700a8ce7e6fc94 and a total number of 972 components indexed
  from .autonotebook import tqdm as notebook_tqdm


In [8]:
REPO_DIRECTORY = r'/root/'
ABC_DICT_PATH = r'autodl-tmp/AIST4010-Cantonese-Translator-Data/ABC-Dict/abc_dict.csv'

def load_abc_dataset():
    abc_dict = pd.read_csv(REPO_DIRECTORY + ABC_DICT_PATH)
    abc_dataset = Dataset.from_pandas(abc_dict)
    return abc_dataset

abc_set = load_abc_dataset()
abc_shuffled_set = abc_set.shuffle(seed=42).train_test_split(test_size=0.1)
abc_train_set = abc_shuffled_set['train']
abc_test_set = abc_shuffled_set['test']
for (i, example) in enumerate(abc_train_set):
    print(example)
    if i == 5:
        break

{'en': "Stop it, you're being too kind! We're old friends, so there's no need to talk politely to me.", 'yue': '咪講埋啲衰嘢！我哋係老死，唔使講客氣說話。'}
{'en': 'So many bubbles are on the surface of the water. Would there be somebody under the water?', 'yue': '水面有咁多水𦢊，水底會唔會有人啊？'}
{'en': "I don't know how many schemes I had come up with before I was done.", 'yue': '我唔知扭咗幾多六壬至去到。'}
{'en': 'A layer of skin has come off', 'yue': '甩咗一浸皮'}
{'en': 'My point of view is completely different from yours.', 'yue': '我嘅睇法同你嘅完全唔同。'}
{'en': 'The convict was hanged this morning.', 'yue': '犯人喺今朝問吊。'}


In [11]:
def count_dataset_tokens(dataset):
    en_count = 0
    yue_count = 0
    for example in dataset:
        en_count += len(example['en'])
        yue_count += len(example['yue'])
    return en_count, yue_count


counts = np.array(count_dataset_tokens(abc_train_set))
print(counts)
print(counts/len(abc_train_set))


[919534 191751]
[70.59220021 14.72063565]


In [13]:
model_path=r'/root/autodl-tmp/01ai/Yi-6B-Chat'

# model = Model.from_pretrained('01ai/Yi-6B')

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     torch_dtype='auto'
# ).eval()


# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:
model_dir = snapshot_download('01ai/Yi-6B-Chat', cache_dir='/root/autodl-tmp', revision='master')

In [15]:
base_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)



# Since transformers 4.35.0, the GPT-Q/AWQ model can be loaded using AutoModelForCausalLM.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype='auto',
).eval()

# # Prompt content: "hi"
# messages = [
#     {"role": "user", "content": "hi"}
# ]


# input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
# output_ids = model.generate(input_ids.to('cuda'))
# response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

# # Model response: "Hello! How can I assist you today?"
# print(response)

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.24it/s]


In [16]:
messages = [
    {"role": "user", "content": "你識唔識講廣東話?"},
]

input_ids = base_tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
output_ids = base_model.generate(input_ids.to('cuda'))
response = base_tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

# Model response: "Hello! How can I assist you today?"
print(response)

作為一個人工智能，我並不具備人類的語言能力，包括說廣東話。我只能通過程式和算法來理解和生成文本，但我無法發出聲音或進行對話。如果您有其他問題，我很樂意幫助您。


In [21]:
print(input_ids)
print(output_ids)
print(base_tokenizer.decode(input_ids[0]))
print(base_tokenizer.decode(input_ids[0]))

#get text of list of tokens in output_ids stored in array
print([base_tokenizer.decode([token]) for token in output_ids[0]])

tensor([[    6,  3903,   144, 59725, 62028,   534,   453,   453, 62028, 62364,
         62098, 61518, 61845,   100,     7,   144,     6,   765, 13611,   144]])
tensor([[    6,  3903,   144, 59725, 62028,   534,   453,   453, 62028, 62364,
         62098, 61518, 61845,   100,     7,   144,     6,   765, 13611,   144,
         26747, 12666, 13992,   101, 59646, 61239, 59630, 60001, 62070, 49240,
         59599, 53202,  2604,   101,  2887, 60917, 62098, 61518, 61845,   102,
         59646,  5323, 25924, 56241, 59652, 16981, 60599, 48246, 16425, 23884,
           101, 16097, 26912, 60732, 59676, 62203, 60275, 59876, 16827, 60713,
         61845,   102, 33794, 59635,  2711, 14375,   101, 18071, 61726, 59794,
         36293, 60629,   102,     7]], device='cuda:0')
<|im_start|> user
你識唔識講廣東話?<|im_end|> 
<|im_start|>assistant

<|im_start|> user
你識唔識講廣東話?<|im_end|> 
<|im_start|>assistant

['<|im_start|>', 'user', '\n', '你', '識', '�', '�', '�', '識', '講', '廣', '東', '話', '?', '<|im_end|>', '\n', '

In [22]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['en'])):
        text1 = f"""
        <|im_start|> user
        Translate the following words into Cantonese: 
        {example['en'][i]}
        <|im_start|>assistant
        {example['yue'][i]}
        """
        text2 = f"""
        <|im_start|> user
        Translate the following words into English:
        {example['yue'][i]}
        <|im_start|>assistant
        {example['en'][i]}
        """
        output_texts.append(text1)
        output_texts.append(text2)
    return output_texts

In [23]:
prompts = formatting_prompts_func(abc_set[:10])
for prompt in prompts:
    print(prompt)


        <|im_start|> user
        Translate the following words into Cantonese: 
        Scoop up water
        <|im_start|>assistant
        㧾水
        

        <|im_start|> user
        Translate the following words into English:
        㧾水
        <|im_start|>assistant
        Scoop up water
        

        <|im_start|> user
        Translate the following words into Cantonese: 
        Ladle out soup
        <|im_start|>assistant
        㧾湯
        

        <|im_start|> user
        Translate the following words into English:
        㧾湯
        <|im_start|>assistant
        Ladle out soup
        

        <|im_start|> user
        Translate the following words into Cantonese: 
        Third son of a rich family
        <|im_start|>assistant
        三少
        

        <|im_start|> user
        Translate the following words into English:
        三少
        <|im_start|>assistant
        Third son of a rich family
        

        <|im_start|> user
        Translate the follow

In [10]:
# for name, param in base_model.named_parameters():
#     print(f"Parameter name: {name}")
#     print(param)
#     print("-" * 50)

Parameter name: model.embed_tokens.weight
Parameter containing:
tensor([[ 2.5146e-08, -3.2131e-08,  1.5367e-08,  ..., -7.6834e-08,
          8.6613e-08, -4.3306e-08],
        [-8.0909e-09,  7.7416e-09, -8.3237e-09,  ..., -2.2119e-08,
         -4.7497e-08, -3.3760e-08],
        [ 4.3945e-03,  3.1853e-04,  4.3030e-03,  ...,  3.3875e-03,
          5.4550e-04, -1.2451e-02],
        ...,
        [ 2.7100e-02,  1.6724e-02, -3.3447e-02,  ...,  2.8687e-03,
          1.2756e-02,  1.6602e-02],
        [-2.4048e-02, -2.3560e-02,  1.3977e-02,  ..., -3.7689e-03,
          2.5635e-02,  5.3406e-03],
        [ 1.5869e-02,  1.3550e-02,  3.9062e-02,  ...,  3.1006e-02,
         -7.5378e-03, -5.8899e-03]], device='cuda:0', dtype=torch.bfloat16,
       requires_grad=True)
--------------------------------------------------
Parameter name: model.layers.0.self_attn.q_proj.weight
Parameter containing:
tensor([[ 5.0659e-03,  4.0283e-03, -2.5177e-04,  ..., -3.2196e-03,
          1.9836e-03,  1.9073e-03],
       

In [24]:
print(base_model.config)

LlamaConfig {
  "_name_or_path": "/root/autodl-tmp/01ai/Yi-6B-Chat",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 5000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 64000
}



In [26]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules = ["k_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)
peft_model = get_peft_model(base_model, 
                            lora_config)

peft_model.print_trainable_parameters()

trainable params: 17,825,792 || all params: 6,078,861,312 || trainable%: 0.293242288071467


In [13]:


trainer = SFTTrainer(
    base_model,
    train_dataset= abc_set,
    formatting_func=formatting_prompts_func,
)
trainer.train()

Map: 100%|██████████| 14474/14474 [00:01<00:00, 12646.51 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


KeyboardInterrupt: 

In [None]:
trainer.save_model("sft_model")