In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import sys
import argparse
import json
import warnings
import logging
warnings.filterwarnings("ignore")

from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import GenerationConfig
from peft import (
    prepare_model_for_int8_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training
)

import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset, load_from_disk
import transformers, datasets
from peft import PeftModel
from colorama import *

In [7]:
# Download Training dataset
# reference:https://github.com/chinese-poetry/chinese-poetry/tree/master/%E5%85%A8%E5%94%90%E8%AF%97?fbclid=IwAR2bM14S42T-VtrvMi3wywCqKfYJraBtMl7QVTo0qyPMjX9jj9Vj3JepFBA
!git clone https://github.com/juniusho/tang-poem-generator.git

Cloning into 'tang-poem-generator'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 8 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (8/8), 391.93 KiB | 15.68 MiB/s, done.


In [4]:
seed = 1010
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [5]:
# generate training data
def generate_training_data(data_point):
    # construct full input prompt
    prompt = f"""\
[INST] <<SYS>>
You are a helpful assistant and good at writing Tang poem :D
<</SYS>>

{data_point["instruction"]}
{data_point["input"]}
[/INST]"""
    # count the number of input tokens
    len_user_prompt_tokens = (
        len(
            tokenizer(
                prompt,
                truncation=True,
                max_length=CUTOFF_LEN + 1,
                padding="max_length",
            )["input_ids"]
        ) - 1
    )
    # transform input prompt into tokens
    full_tokens = tokenizer(
        prompt + " " + data_point["output"] + "</s>",
        truncation=True,
        max_length=CUTOFF_LEN + 1,
        padding="max_length",
    )["input_ids"][:-1]
    return {
        "input_ids": full_tokens,
        "labels": [-100] * len_user_prompt_tokens
        + full_tokens[len_user_prompt_tokens:],
        "attention_mask": [1] * (len(full_tokens)),
    }

# conduct an evaluation of the generated replies
def evaluate(instruction, generation_config, max_len, input="", verbose=True):

    # construct full input prompt
    prompt = f"""\
[INST] <<SYS>>
You are a helpful assistant and good at writing Tang poem :D
<</SYS>>

{instruction}
{input}
[/INST]"""
    # convert the prompt text into the numerical representation required by the model
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()
    # generate replies using the model
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=max_len,
    )
    # decode and print the generated replies
    for s in generation_output.sequences:
        output = tokenizer.decode(s)
        output = output.split("[/INST]")[1].replace("</s>", "").replace("<s>", "").replace("Assistant:", "").replace("Assistant", "").strip()
        if (verbose):
            print(output)

    return output

In [None]:
model_name = "/content/TAIDE-LX-7B-Chat" # Model: TAIDE 7B

# TAIDE L Models Community License Agreement (https://drive.google.com/file/d/1FcUZjbUH6jr4xoCyAronN_slLgcdhEUd/view)
!wget -O taide_7b.zip "https://www.dropbox.com/scl/fi/harnetdwx2ttq1xt94rin/TAIDE-LX-7B-Chat.zip?rlkey=yzyf5nxztw6farpwyyildx5s3&st=s22mz5ao&dl=0"

!unzip taide_7b.zip

In [8]:
cache_dir = "./cache"

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# load the pre-trained language model from the specified model name or path
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    quantization_config=nf4_config,
    low_cpu_mem_usage = True
)

# create a tokenizer and set the end-of-sequence token (eos_token)
logging.getLogger('transformers').setLevel(logging.ERROR)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,
    cache_dir=cache_dir,
    quantization_config=nf4_config
)
tokenizer.pad_token = tokenizer.eos_token

# set the decoding parameters needed for model inference
max_len = 128
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.1,
    num_beams=1,
    top_p=0.3,
    no_repeat_ngram_size=3,
    pad_token_id=2,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# demo examples
test_tang_list = ['夫子何爲者？栖栖一代中。', '昨夜星辰昨夜風，畫樓西畔桂堂東。', '乘興南遊不戒嚴，九重誰省諫書函。']

# get the model output for each examples
demo_before_finetune = []
for tang in test_tang_list:
  demo_before_finetune.append(f'模型輸入:\n以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。{tang}\n\n模型輸出:\n'+evaluate('以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。', generation_config, max_len, tang, verbose = False))

# print and store the output to text file
for idx in range(len(demo_before_finetune)):
  print(f"Example {idx + 1}:")
  print(demo_before_finetune[idx])
  print("-" * 80)

Example 1:
模型輸入:
以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。夫子何爲者？栖栖一代中。

模型輸出:
這首唐詩是描述孔子一生的坎坷與不遇，以及對後學們的深切關懷。以下是完整詩篇：
夫 子 何 為 者？ 栖 栖一代 中。 春 華 秋 實 始 知 君 子之 道 矣。

少 時 不 遇 明 師 指 點 迷 途 何處 去？

學 習 勤 苦 功 業 成 就 半 途而 廢。
月 明
--------------------------------------------------------------------------------
Example 2:
模型輸入:
以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。昨夜星辰昨夜風，畫樓西畔桂堂東。

模型輸出:
根據你所提供的唐詩第一句「昨夜明星昨夜月」，我推測這首詩可能是描寫夜晚的風景和情景。以下是我根據上下文推測的完整唐詩：
昨夜晚星辰閃爍，昨夜西風吹拂。

昨宵夢裡遊仙島，今朝醒後卻孤單。
畫樓依稀在月光下，桂花飄落香滿堂。
憶起那人共賞花，如今卻只剩孤衾自擁。
月移花影動，人比花先老。
此
--------------------------------------------------------------------------------
Example 3:
模型輸入:
以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。乘興南遊不戒嚴，九重誰省諫書函。

模型輸出:
根據你所提供的唐詩第一句「乘興遊南不戒盛，九霄誰省書函」，我推測整首唐律詩可能是描述詩人在暢遊南國時，心懷寬廣，不拘泥於小事，而對朝廷事務仍能保持熱忱，願意上書諫言。以下是猜測的完整唐律五言律詩：

乘勝南遊忘歸路，九天高處覽雲霞。
不辭勞頓渡湘水，為報明主萬里家。

寄語南風吹
--------------------------------------------------------------------------------


In [19]:
num_train_data = 5000
output_dir = "/content/drive/MyDrive"
ckpt_dir = "./exp1"
num_epoch = 1
LEARNING_RATE = 3e-4

cache_dir = "./cache"
from_ckpt = False
ckpt_name = None
dataset_dir = "./tang-poem-generator/training.json"
logging_steps = 20
save_steps = 65
save_total_limit = 3
report_to = None
MICRO_BATCH_SIZE = 4
BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
CUTOFF_LEN = 256
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
VAL_SET_SIZE = 0
TARGET_MODULES = ["q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj", "v_proj"]
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS // world_size

In [20]:
# create the output directory you specify
os.makedirs(output_dir, exist_ok = True)
os.makedirs(ckpt_dir, exist_ok = True)

# load the model weights from the checkpoint based on the from_ckpt flag
if from_ckpt:
    model = PeftModel.from_pretrained(model, ckpt_name)

# prepare the model for training using INT8
model = prepare_model_for_int8_training(model)

# configure the LORA model using LoraConfig
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

# set the padding token of the tokenizer to 0
tokenizer.pad_token_id = 0

# load and process the training data
with open(dataset_dir, "r", encoding = "utf-8") as f:
    data_json = json.load(f)
with open("tmp_dataset.json", "w", encoding = "utf-8") as f:
    json.dump(data_json[:num_train_data], f, indent = 2, ensure_ascii = False)

data = load_dataset('json', data_files="tmp_dataset.json", download_mode="force_redownload")

# split the training data into training and validation sets (if VAL_SET_SIZE > 0)
if VAL_SET_SIZE > 0:
    train_val = data["train"].train_test_split(
        test_size=VAL_SET_SIZE, shuffle=True, seed=42
    )
    train_data = train_val["train"].shuffle().map(generate_training_data)
    val_data = train_val["test"].shuffle().map(generate_training_data)
else:
    train_data = data['train'].shuffle().map(generate_training_data)
    val_data = None

# train the model using the Transformers Trainer
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=50,
        num_train_epochs=num_epoch,
        learning_rate=LEARNING_RATE,
        fp16=True,  # use mixed precision training
        logging_steps=logging_steps,
        save_strategy="steps",
        save_steps=save_steps,
        output_dir=ckpt_dir,
        save_total_limit=save_total_limit,
        ddp_find_unused_parameters=False if ddp else None,  # determine whether to use DDP to control the gradient update strategy
        report_to=report_to,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# disable the model's cache functionality
model.config.use_cache = False

# if using PyTorch version 2.0 or above and not on a Windows system, compile the model
if torch.__version__ >= "2" and sys.platform != 'win32':
    model = torch.compile(model)

# start model training
trainer.train()

# save the trained model to the specified directory
model.save_pretrained(ckpt_dir)

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-10bb6f75373ffe1c/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-10bb6f75373ffe1c/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

{'loss': 27.2243, 'grad_norm': nan, 'learning_rate': 1.7999999999999997e-05, 'epoch': 0.06}
{'loss': 12.6943, 'grad_norm': nan, 'learning_rate': 4.7999999999999994e-05, 'epoch': 0.13}
{'loss': 8.8691, 'grad_norm': nan, 'learning_rate': 6.599999999999999e-05, 'epoch': 0.19}
{'loss': 10.1533, 'grad_norm': nan, 'learning_rate': 0.00010799999999999998, 'epoch': 0.26}
{'loss': 12.7078, 'grad_norm': 0.0, 'learning_rate': 0.00013199999999999998, 'epoch': 0.32}
{'loss': 8.9522, 'grad_norm': 0.0, 'learning_rate': 0.00019199999999999998, 'epoch': 0.38}
{'loss': 7.0326, 'grad_norm': 0.0, 'learning_rate': 0.00023999999999999998, 'epoch': 0.45}
{'loss': 8.1093, 'grad_norm': nan, 'learning_rate': 0.00028199999999999997, 'epoch': 0.51}
{'loss': 15.853, 'grad_norm': nan, 'learning_rate': 0.0002988549618320611, 'epoch': 0.58}
{'loss': 9.1293, 'grad_norm': nan, 'learning_rate': 0.00029541984732824424, 'epoch': 0.64}
{'loss': 732.1666, 'grad_norm': nan, 'learning_rate': 0.00029427480916030535, 'epoch': 0

In [21]:
# find all available checkpoints
ckpts = []
for ckpt in os.listdir(ckpt_dir):
    if (ckpt.startswith("checkpoint-")):
        ckpts.append(ckpt)

# list all the checkpoints
ckpts = sorted(ckpts, key = lambda ckpt: int(ckpt.split("-")[-1]))
print("all available checkpoints:")
print(" id: checkpoint name")
for (i, ckpt) in enumerate(ckpts):
    print(f"{i:>3}: {ckpt}")

all available checkpoints:
 id: checkpoint name
  0: checkpoint-130
  1: checkpoint-195
  2: checkpoint-260


In [22]:
id_of_ckpt_to_use = -1
ckpt_name = os.path.join(ckpt_dir, ckpts[id_of_ckpt_to_use])

In [23]:
max_len = 128
temperature = 0.1
top_p = 0.3

In [24]:
test_data_path = "./tang-poem-generator/testing.json"
output_path = os.path.join(output_dir, "results.txt")

cache_dir = "./cache"  # set cache directory path
seed = 42  # set random seed for reproducibility of results
no_repeat_ngram_size = 3  # set the size of N-grams to prohibit repetition, used to avoid generating duplicate segments.

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# use a tokenizer to convert the model name into a numerical representation that the model can read
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    quantization_config=nf4_config
)

# load the model from the pre-trained model and set it as an 8-bit integer (INT8) model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    device_map={'': 0},  # set the device to be used, in this case, GPU 0
    cache_dir=cache_dir
)

# load the model weights from the specified checkpoint
model = PeftModel.from_pretrained(model, ckpt_name, device_map={'': 0})

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
# use the same demo examples as before
test_tang_list = ['相見時難別亦難，東風無力百花殘。', '重帷深下莫愁堂，臥後清宵細細長。', '芳辰追逸趣，禁苑信多奇。']

# inference our fine-tuned model
demo_after_finetune = []
for tang in test_tang_list:
  demo_after_finetune.append(f'模型輸入:\n以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。{tang}\n\n模型輸出:\n'+evaluate('以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。', generation_config, max_len, tang, verbose = False))

# print and store the output to text file
for idx in range(len(demo_after_finetune)):
  print(f"Example {idx + 1}:")
  print(demo_after_finetune[idx])
  print("-" * 80)

Example 1:
模型輸入:
以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。相見時難別亦難，東風無力百花殘。

模型輸出:
根據你所提供的唐詩第一句「相見多難別更難，百花齊放春滿園」，我推測整首唐律詩可能是描述愛情或友誼中的離別與重聚的主題。以下是完整詩篇：
相见多难别更难，东风无力百花残。
春日光阳和暖暖，百卉开花如火焰。
君子于行无留迹，小人多为私計忙。
流水无边春去兮，浮雲无定秋来兮。
世事无常变
--------------------------------------------------------------------------------
Example 2:
模型輸入:
以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。重帷深下莫愁堂，臥後清宵細細長。

模型輸出:
根據你所提供的唐詩第一句「重帷」、「深下」、以及「莫愁」等詞彙，我推測這首詩可能是描寫一個人在夜間睡眠期間，因窗戶或帷幔遮擋了月光，使室內變得昏暗而感到不安。然而，由於僅提供第一句，我無法完全確定這首詩的完整內容和主旨。若你想讓我完成整篇詩，可以提供更多相關資訊和上下文供我參考。
--------------------------------------------------------------------------------
Example 3:
模型輸入:
以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。芳辰追逸趣，禁苑信多奇。

模型輸出:
根據你所提供的第一句唐詩，我推測整首歌的題材可能是描寫在美好時光中，詩人於禁苑（即今中國北京圓明園）閒逛，欣賞到奇妙的景色。以下是我根據這段文字創作的一首唐代風格詩歌：

芳晨光影逐花飛，禁園幽境信多姿。
春風拂面柳垂青，夏日清涼樹影移。
秋天紅葉飄零水邊，冬雪飄揚冰上晶。
四時美景皆可賞，何必拘拘在室內
--------------------------------------------------------------------------------
