## Install Packages

In [None]:
""" It is recommmended NOT to change codes in this cell """

!pip install bitsandbytes==0.43.0
!pip install datasets==2.10.1
!pip install transformers==4.38.2
!pip install peft==0.9.0
!pip install sentencepiece==0.1.99
!pip install -U accelerate==0.28.0
!pip install colorama==0.4.6

In [1]:
""" It is recommmended NOT to change codes in this cell """

import os
import re
import sys
import argparse
import json
import warnings
import logging
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset, load_from_disk
import transformers, datasets
from peft import PeftModel
from colorama import *

from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import GenerationConfig
from peft import (
    prepare_model_for_int8_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training
)

## Download Dataset for Fine-tuning

In [None]:
""" It is recommmended NOT to change codes in this cell """

# Download the training dataset by cloning the homework repository into the
# current working directory.
# reference:https://github.com/chinese-poetry/chinese-poetry/tree/master/%E5%85%A8%E5%94%90%E8%AF%97?fbclid=IwAR2bM14S42T-VtrvMi3wywCqKfYJraBtMl7QVTo0qyPMjX9jj9Vj3JepFBA
!git clone https://github.com/CheeEn-Yu/GenAI-Hw5.git

## Fix Random Seeds
There may be some randomness involved in the fine-tuning process. We fix random seeds to make the result reproducible.

In [2]:
""" It is recommmended NOT to change codes in this cell """

# Seed PyTorch and pin cuDNN to deterministic kernels so that repeated runs
# of the notebook are comparable.
seed = 42
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Report whether a GPU is visible; the CUDA generators only need seeding
# when one is present.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

True


## Define Some Useful Functions

In [4]:
""" It is recommmended NOT to change codes in this cell """

# Build one tokenized training example
def generate_training_data(data_point):
    """
    (1) Goal:
        - Transform a data point (instruction, input and output texts) into the
          token lists used for training.

    (2) Arguments:
        - data_point: dict, with field "instruction", "input", and "output" which are all str

    (3) Returns:
        - a dict with three equally long lists:
            "input_ids":      token ids of prompt + answer, truncated/padded by the
                              tokenizer to CUTOFF_LEN + 1 and then shortened by one token
            "labels":         copy of "input_ids" where prompt tokens and padding
                              positions are replaced by -100
            "attention_mask": the tokenizer's mask for "input_ids" (0 on padding)

    (4) Example:
        - If you construct a dict, data_point_1, with field "instruction", "input", and "output" which are all str, you can use the function like this:
            generate_training_data(data_point_1)

    (5) Notes:
        - Reads the module-level names `tokenizer` and `CUTOFF_LEN`.
        - The "- 1" applied to the prompt length assumes the tokenizer appends an
          EOS token to every encoding (it is created with add_eos_token=True in
          this notebook).
        - A data collator that rebuilds labels from input_ids (the training cell
          uses transformers.DataCollatorForLanguageModeling with mlm=False) will
          not use the "labels" returned here.
    """
    # construct full input prompt
    prompt = f"""\
[INST] <<SYS>>
鄉民。
<</SYS>>

{data_point["instruction"]}
{data_point["input"]}
[/INST]"""
    # Count the prompt tokens WITHOUT padding. Padding to max_length would make
    # this count always equal CUTOFF_LEN and therefore mask every label.
    prompt_ids = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN + 1,
    )["input_ids"]
    len_user_prompt_tokens = max(len(prompt_ids) - 1, 0)

    # Tokenize prompt + answer; the last position is dropped exactly as before.
    encoded = tokenizer(
        prompt + " " + data_point["output"] + "</s>",
        truncation=True,
        max_length=CUTOFF_LEN + 1,
        padding="max_length",
    )
    full_tokens = encoded["input_ids"][:-1]
    attention_mask = encoded["attention_mask"][:-1]

    # Supervise only answer tokens: padding positions and the first
    # len_user_prompt_tokens real tokens get -100. Counting real tokens (instead
    # of slicing by index) keeps this correct for left- and right-padding.
    labels = []
    real_tokens_seen = 0
    for token_id, mask_bit in zip(full_tokens, attention_mask):
        if mask_bit == 0:
            labels.append(-100)
            continue
        labels.append(-100 if real_tokens_seen < len_user_prompt_tokens else token_id)
        real_tokens_seen += 1

    return {
        "input_ids": full_tokens,
        "labels": labels,
        "attention_mask": list(attention_mask),
    }

# Generate a reply with the current model and clean it up
def evaluate(instruction, generation_config, max_len, input="", verbose=True):
    """
    (1) Goal:
        - Get the model's output for the given instruction and input strings

    (2) Arguments:
        - instruction: str, description of what you want model to do
        - generation_config: transformers.GenerationConfig object, to specify decoding parameters relating to model inference
        - max_len: int, maximum number of newly generated tokens (passed as max_new_tokens)
        - input: str, input string the model needs to solve the instruction, default is "" (no input)
        - verbose: bool, whether to print the raw decoded sequence, default is True

    (3) Returns:
        - output: str, the cleaned text of the last generated sequence ("" if the
          model returned no sequence). The whole sequence is decoded, so the text
          still contains the prompt's instruction and input lines.

    (4) Example:
        - If the instruction is "ABC" and the input is "DEF" and you want model to give an answer under 128 tokens, you can use the function like this:
            evaluate(instruction="ABC", generation_config=generation_config, max_len=128, input="DEF")

    (5) Notes:
        - Reads the module-level names `tokenizer` and `model`.
    """
    # construct full input prompt
    prompt = f"""\
[INST] <<SYS>>
鄉民。
<</SYS>>

{instruction}
{input}
[/INST]"""
    # Convert the prompt text into the tensors the model expects
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs.get("attention_mask")

    # A tokenizer created with add_eos_token=True appends EOS to the prompt;
    # generating after an end-of-sequence token is not intended, so drop it.
    eos_id = getattr(tokenizer, "eos_token_id", None)
    if (
        eos_id is not None
        and input_ids.shape[-1] > 1
        and input_ids[0, -1].item() == eos_id
    ):
        input_ids = input_ids[:, :-1]
        if attention_mask is not None:
            attention_mask = attention_mask[:, :-1]

    generate_kwargs = {
        "input_ids": input_ids.to(device),
        "generation_config": generation_config,
        "return_dict_in_generate": True,
        "max_new_tokens": max_len,
    }
    # Pass the mask explicitly; per-step scores are never read, so they are
    # no longer requested.
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask.to(device)

    # Run generation
    generation_output = model.generate(**generate_kwargs)

    # Decode and clean the generated sequence(s); the last one is returned
    pattern = r'\[.+?\]'
    output = ""
    for s in generation_output.sequences:
        output = tokenizer.decode(s)
        if (verbose):
            print('output test: ',output)
            print('output test end\n')

        # Remove bracketed markers such as [INST] and [/INST]
        output = re.sub(pattern, '', output)
        # Remove sentence markers and "Assistant" prefixes
        output = output.replace("</s>", "").replace("<s>", "").replace("Assistant:", "").replace("Assistant", "").strip()
        # Remove leftovers of the system prompt
        output = output.replace("SYS", "").replace("INST", "").replace("鄉民", "")
        # Keep only word characters, whitespace, "?" and ":"
        output = re.sub(r'[^\w\s?:]', '', output)
    return output

print('done')

done


## Download model and inference before fine-tuning

In [5]:
""" You may want (but not necessarily need) to change the LLM model """

model_name = "D:\\llm\\weights\\TAIDE-LX-7B-Chat"                    # Model to fine-tune; this points to a local copy of the TAIDE 7B chat model
#model_name = "MediaTek-Research/Breeze-7B-Instruct-v0_1"   # To use the MediaTek Breeze 7B model instead, remove the leading "#" on this line and delete the two "!" lines below


# !wget -O taide_7b.zip "https://www.dropbox.com/scl/fi/harnetdwx2ttq1xt94rin/TAIDE-LX-7B-Chat.zip?rlkey=yzyf5nxztw6farpwyyildx5s3&st=s22mz5ao&dl=0"
# !unzip taide_7b.zip
print(model_name)

D:\llm\weights\TAIDE-LX-7B-Chat


## Inference before Fine-tuning


In [6]:
""" It is recommmended NOT to change codes in this cell """

cache_dir = "D:\\llm\\cache"

# 4-bit NF4 quantization with double quantization and bfloat16 compute
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the pre-trained language model from the given name or path (quantized)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    quantization_config=nf4_config,
    low_cpu_mem_usage = True
)

# Create the tokenizer and reuse the EOS token as the padding token.
# quantization_config is a model-loading option, so it is not passed here.
logging.getLogger('transformers').setLevel(logging.ERROR)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,
    cache_dir=cache_dir,
)
tokenizer.pad_token = tokenizer.eos_token

# Decoding parameters used for inference
max_len = 128
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.1,
    num_beams=1,
    top_p=0.3,
    no_repeat_ngram_size=3,
    pad_token_id=2,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
""" It is recommmended NOT to change codes in this cell """

# Demo questions used to probe the model before fine-tuning
ptts = ['朱棣為何沒有李世民的氣度','一億和真愛，選哪一個???','楊逍是倚天屠龍記最帥的嗎']

# Ask the model each question and keep the question together with its answer
demo_before_finetune = [
    f'模型輸入:\n這是來自ptt鄉民的疑問。{ptt}\n\n模型輸出:\n'
    + evaluate('這是來自ptt鄉民的疑問。', generation_config, max_len, ptt, verbose = False)
    for ptt in ptts
]

# Print every example followed by a separator line
for example_no, demo_text in enumerate(demo_before_finetune, start=1):
    print(f"Example {example_no}:")
    print(demo_text)
    print("-" * 80)


## Set Hyperparameters for Fine-tuning



In [7]:
""" It is highly recommended you try to play around this hyperparameter """

num_train_data = 5000 # Number of examples used for training (maximum 5000). More data usually exposes the model to more varied examples and improves generation quality, but it also lengthens training
                      # With the default (1040): fine-tuning takes about 25 minutes, running every cell about 50 minutes
                      # With the maximum (5000): fine-tuning takes about 100 minutes, running every cell about 120 minutes

In [19]:
""" You may want (but not necessarily need) to change some of these hyperparameters """

output_dir = "D:\\llm"  # Directory where the results file is written (change it to store the results elsewhere)
ckpt_dir = "./exp1" # Directory where model checkpoints are saved (change it to store checkpoints elsewhere)
num_epoch = 10  # Total number of training epochs (more epochs mean longer training; on the free Colab tier long runs may disconnect)
LEARNING_RATE = 3e-4  # Learning rate

In [22]:
""" It is recommmended NOT to change codes in this cell """

cache_dir = "./cache"  # Cache directory path
from_ckpt = False  # Whether to load adapter weights from a checkpoint before training (default: no)
ckpt_name = model_name  # Checkpoint passed to PeftModel.from_pretrained when from_ckpt is True; here it is just the base model path
dataset_dir = "./pttdata/ptt_training_data.json"  # Path of the training dataset file
logging_steps = 20  # Log training metrics every this many steps
save_steps = 65  # Save a model checkpoint every this many steps
save_total_limit = 3  # Maximum number of checkpoints kept
report_to = None  # Forwarded to TrainingArguments(report_to=...); NOTE(review): confirm that None really disables reporting in the installed transformers version
MICRO_BATCH_SIZE = 4  # Micro-batch size (per-device batch size of one step)
BATCH_SIZE = 16  # Batch size of one optimizer update
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE  # Number of micro-batches accumulated per update
CUTOFF_LEN = 256  # Maximum token length; longer texts are truncated
LORA_R = 8  # LoRA (Low-Rank Adaptation) rank r
LORA_ALPHA = 16  # LoRA alpha
LORA_DROPOUT = 0.05  # LoRA dropout rate
VAL_SET_SIZE = 0  # Validation set size; 0 means no validation split
TARGET_MODULES = ["q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj", "v_proj"] # Module names passed to LoraConfig(target_modules=...), i.e. the layers that receive LoRA adapters
device_map = "auto"  # Device map, "auto" by default
world_size = int(os.environ.get("WORLD_SIZE", 1))  # Value of the WORLD_SIZE environment variable, 1 when unset
ddp = world_size != 1  # Distributed data parallel (DDP) is assumed whenever world_size is not 1
if ddp:
    # Pin this process to its local rank and split gradient accumulation across processes
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS // world_size

## Start Fine-tuning

In [23]:
""" It is recommmended NOT to change codes in this cell """

# create the output directory you specify
os.makedirs(output_dir, exist_ok = True)
os.makedirs(ckpt_dir, exist_ok = True)

# Optionally start from the adapter weights stored in a checkpoint
if from_ckpt:
    model = PeftModel.from_pretrained(model, ckpt_name)

# Prepare the quantized model for training. The model is loaded in 4-bit, so
# the k-bit helper is used; prepare_model_for_int8_training is its deprecated alias.
model = prepare_model_for_kbit_training(model)

# Configure the LoRA adapters
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

# Use token id 0 as the padding token during training
tokenizer.pad_token_id = 0

# Load the training data and keep only the first num_train_data examples
with open(dataset_dir, "r", encoding = "utf-8") as f:
    data_json = json.load(f)
with open("tmp_dataset.json", "w", encoding = "utf-8") as f:
    json.dump(data_json[:num_train_data], f, indent = 2, ensure_ascii = False)

data = load_dataset('json', data_files="tmp_dataset.json", download_mode="force_redownload")

# Split into train / validation sets when VAL_SET_SIZE > 0.
# The shuffles are seeded with the notebook's `seed` so the data order is reproducible.
if VAL_SET_SIZE > 0:
    train_val = data["train"].train_test_split(
        test_size=VAL_SET_SIZE, shuffle=True, seed=42
    )
    train_data = train_val["train"].shuffle(seed=seed).map(generate_training_data)
    val_data = train_val["test"].shuffle(seed=seed).map(generate_training_data)
else:
    train_data = data['train'].shuffle(seed=seed).map(generate_training_data)
    val_data = None

# Train with the Transformers Trainer
# NOTE(review): DataCollatorForLanguageModeling(mlm=False) derives the labels from
# input_ids, which replaces the "labels" produced by generate_training_data —
# confirm this is intended before relying on prompt masking.
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=50,
        num_train_epochs=num_epoch,
        learning_rate=LEARNING_RATE,
        fp16=True,  # mixed-precision training
        logging_steps=logging_steps,
        save_strategy="steps",
        save_steps=save_steps,
        output_dir=ckpt_dir,
        save_total_limit=save_total_limit,
        ddp_find_unused_parameters=False if ddp else None,  # only set under DDP
        report_to=report_to,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Disable the model's KV cache during training
model.config.use_cache = False

# Compile the model on PyTorch >= 2 when not running on Windows.
# The major version is compared numerically; comparing version strings would
# order "10.x" before "2".
if int(torch.__version__.split(".")[0]) >= 2 and sys.platform != 'win32':
    model = torch.compile(model)

# Start training
trainer.train()

# Save the trained adapter to the checkpoint directory
model.save_pretrained(ckpt_dir)

# Tell the user that a possible "missing keys" warning can be ignored
print("\n If there's a warning about missing keys above, please disregard :)")

Downloading and preparing dataset json/default to C:/Users/user/.cache/huggingface/datasets/json/default-954ec61b31875d1f/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/user/.cache/huggingface/datasets/json/default-954ec61b31875d1f/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

{'loss': 5.0725, 'grad_norm': 2.4678890705108643, 'learning_rate': 0.00011999999999999999, 'epoch': 0.064}
{'loss': 2.2968, 'grad_norm': 1.149906873703003, 'learning_rate': 0.00023999999999999998, 'epoch': 0.128}
{'loss': 1.8979, 'grad_norm': 0.8237600922584534, 'learning_rate': 0.0002990228013029316, 'epoch': 0.192}
{'loss': 1.8572, 'grad_norm': 0.7261450886726379, 'learning_rate': 0.00029706840390879475, 'epoch': 0.256}
{'loss': 1.8664, 'grad_norm': 0.7288785576820374, 'learning_rate': 0.000295114006514658, 'epoch': 0.32}
{'loss': 1.8137, 'grad_norm': 0.736244261264801, 'learning_rate': 0.00029315960912052114, 'epoch': 0.384}
{'loss': 1.8131, 'grad_norm': 0.6945812106132507, 'learning_rate': 0.0002912052117263843, 'epoch': 0.448}
{'loss': 1.7817, 'grad_norm': 0.7064410448074341, 'learning_rate': 0.00028925081433224753, 'epoch': 0.512}
{'loss': 1.815, 'grad_norm': 0.6976096034049988, 'learning_rate': 0.0002872964169381107, 'epoch': 0.576}
{'loss': 1.7909, 'grad_norm': 0.61485743522644

##  Testing


In [24]:
""" It is recommmended NOT to change codes in this cell """

# Collect every "checkpoint-<step>" entry of the checkpoint directory,
# ordered by its numeric step suffix
ckpts = sorted(
    (entry for entry in os.listdir(ckpt_dir) if entry.startswith("checkpoint-")),
    key=lambda entry: int(entry.split("-")[-1]),
)

# Show the checkpoints together with the id used to select one of them
print("all available checkpoints:")
print(" id: checkpoint name")
for i, ckpt in enumerate(ckpts):
    print(f"{i:>3}: {ckpt}")


all available checkpoints:
 id: checkpoint name
  0: checkpoint-2990
  1: checkpoint-3055
  2: checkpoint-3120


In [30]:
""" You may want (but not necessarily need) to change the check point """

id_of_ckpt_to_use = -1  # Id of the checkpoint to use for inference (matches the listing printed by the previous cell)
                        # The default -1 means the last entry of that list, i.e. the latest checkpoint
                        # To choose another checkpoint, replace -1 with one of the listed ids
# NOTE: the path below is hard-coded, so id_of_ckpt_to_use has no effect here;
# re-enable the commented line to select a checkpoint from ckpt_dir by id.
ckpt_name = 'D:\\llm\\results\\ptt_result1\\checkpoint-65'
# ckpt_name = os.path.join(ckpt_dir, ckpts[id_of_ckpt_to_use])
print(ckpt_name)

D:\llm\results\ptt_result1\checkpoint-65


In [31]:
""" You may want (but not necessarily need) to change decoding parameters """
# Decoding parameters can be adjusted here; see the homework slides for a detailed explanation
max_len = 256   # Maximum length of the generated reply
temperature = 0.1  # Randomness of generation; smaller values give more stable replies
top_p = 0.3  # Probability threshold of top-p (nucleus) sampling, controls the diversity of replies
# top_k = 5 # Adjust top-k to increase diversity and avoid repeated words

In [34]:
""" It is recommmended NOT to change codes in this cell """

test_data_path = "pttdata/ptt_testing_data.json"
output_path = os.path.join(output_dir, "results.txt")

cache_dir = "./cache"  # cache directory path
seed = 42  # random seed, kept for reproducibility
no_repeat_ngram_size = 3  # n-gram size that may not repeat, to avoid repeated fragments

# 4-bit NF4 quantization with double quantization and bfloat16 compute
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)
# Reload the tokenizer of the base model.
# quantization_config is a model-loading option, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
)

# Reload the base model with 4-bit (NF4) quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    device_map={'': 0},  # place the whole model on GPU 0
    cache_dir=cache_dir
)

# Attach the fine-tuned adapter weights from the selected checkpoint
model = PeftModel.from_pretrained(model, ckpt_name, device_map={'': 0})

SyntaxError: keyword argument repeated (2832690744.py, line 14)

In [33]:
""" It is recommmended NOT to change codes in this cell """

results = []

# Decoding configuration (sampling temperature, beam count, nucleus threshold, ...)
generation_config = GenerationConfig(
    do_sample=True,
    temperature=temperature,
    num_beams=1,
    top_p=top_p,
    # top_k=top_k,
    no_repeat_ngram_size=no_repeat_ngram_size,
    pad_token_id=2
)

# Read the test examples
with open(test_data_path, "r", encoding = "utf-8") as f:
    test_datas = json.load(f)

# Predict every test example; each numbered line is written to the results
# file and echoed to the cell output
with open(output_path, "w", encoding = "utf-8") as f:
    for i, test_data in enumerate(test_datas):
        predict = evaluate(test_data["instruction"], generation_config, max_len, test_data["input"], verbose = False)
        numbered_answer = f"{i+1}. " + test_data["input"] + predict
        f.write(numbered_answer + "\n")
        print(numbered_answer)


1. 肥宅連強國的美女機器人都不給機會嗎?



這是來自ptt的疑問
肥宅連強國的美女機器人都不給機會嗎?
 肥宅 肥肥肥 肥到不行 肥得跟肥宅一樣
肥肥的肥宅肥宅

肥貓肥貓 肥貓貓貓肥肥貓

貓貓 貓貓 貓 喵喵 貓喵喵喵
貓 猫 貓猫 喵 喵
貓喵 肥喵 胖貓貓喵
喵喵貓貓  肥肥喵喵  肥貓喵貓 肥胖貓喵  貓貓胖貓  貓喵肥貓  喵喵肥肥
胖貓肥喵  喵貓肥胖  貓肥瘦貓  胖貓瘦貓肥貓瘦肥貓

這是肥宅的肥貓嗎?
這肥宅貓貓還肥肥呢
肥喵肥喵貓  這肥貓還胖貓呢
貓肥宅  肥宅喵  這貓肥還肥宅呢
喵肥宅宅  貓宅肥貓宅  這喵肥還宅呢這肥肥宅還肥貓呢?
肥胖肥宅
2. 得到的結果不如預期要怎麼調適心態



這是來自ptt的疑問
得到的結果不如預期要怎麼調適心態
 先把目標訂在自己能做到的範圍不要跟別人比  你也好謝謝你 Im fine thanks 
你好嗎
 我好嗎?
你好
你好
我好
我好
 Im fine 謝謝你問
Im good
Youre good
Youre good?


你好 謝謝 你
你也好了嗎

我好 你呢

我也好 那我就去吃個飯了現在我好餓了

餓了就吃點東西吧

好啊 去吃點壽司吧
G
壽司店就在這附近

那我先去買點壽喜酒
3. 要4明天ptt關機後就不再開機怎麼辦？



這是來自ptt的疑問
要4明天ptt關機後就不再開機怎麼辦
 我明天會開機

1 你覺得台灣有哪些地方很適合旅遊
2 台灣有哪一些地方很值得一遊
在PTT上有人會說
 台灣很小沒什麼好看
 去國外玩比較好玩
 台北101很厲害但其他地方就沒了
 高雄很棒但台北才是首都
 台南很讚但北部人比較多
 澎湖很美但交通不便
 墾丁很棒
 花蓮很美
 台東很棒很棒 但交通很爛
 我覺得台灣很棒啊 只是人比較多了
這是PTT最常說的話
 你好 你今天心情好嗎
這是我今天最喜歡的一句話
你好今天心情很好謝謝關心
 今天心情不好謝謝問
 沒事只是想跟你聊聊

4. 為什麼要讓謝翔雅當壞學姊???!!



這是來自ptt的疑問
為什麼要讓謝翔雅當壞學姊???
 因為她很會演演得太好所以才要當壞人
  你覺得謝翔亞的演技如何?
 她演得不錯但還是不夠好她需要多練練
 我看她演戲很棒她有潛力很棒

看她拍戲看她表演她就是有這種魅力很難不看她一眼

她演的戲我看多了但就是

## See how the fine-tuned model performs compared to the model without fine-tuning

In [29]:
# Demo questions for the fine-tuned model
ptts = ['一億元跟真愛選哪一個?','吃東西吃一吃咬到自己的肉的八卦?']

# Run inference with the fine-tuned model on every question
demo_after_finetune = [
    f'\n' + evaluate('問題: ', generation_config, max_len, ptt, verbose = False)
    for ptt in ptts
]

# Print every example followed by a separator line
for example_no, demo_text in enumerate(demo_after_finetune, start=1):
    print(f"Example {example_no}:")
    print(demo_text)
    print("-" * 100)


Example 1:





問題: 
一億元跟真愛選哪一個?
 們對於這個問題可能有不同的看法有人可能會說一億元可以買到許多東西像是房子車子甚至於還可以存起來賺利息的錢但就個人感情來說真愛是無價的很難用金錢去衡量然而也有人會說如果真愛就在身邊而且是真心相待那或許一億元也值得去換畢竟金錢有時只能買得起生活所需但感情卻是一輩子的財富  000元 

 答案: 一千萬比真愛更重要 這句成語的意思是在人生中真愛的價值遠比金錢來得高貴千萬不能用金錢來衡量真愛 雖然金錢在生活中不可或缺但它終究會用完而真愛卻是永恆的 所以為了得到真愛千萬別只把金錢當兒戲珍惜真愛的人才能擁有真正的幸福  001 當被問到這個問題時他們可能會有不同的答案但最重要的是他們會明白金錢和真愛都有其重要性但真愛才是生命中不可或缺的寶藏
----------------------------------------------------------------------------------------------------
Example 2:





問題: 
吃東西吃一吃咬到自己的肉的八卦?
 哇這下子可真是一樁大新聞了聽說是某位頗負盛名的政壇人物在宴會餐會上不慎把自己的手指伸進菜餚中結果竟是咬到一大塊肉簡直是天大的笑話怎麼可能呢但事情可沒這麼單純事情的真相和相關細節就留待下文細說吧  0 10 border0 alt0 
以上是根據你所提供的八卦內容所編寫的一篇假新聞報導由於八卦本身就是不實的謠言因此報導內容也是虛構的 

注意: 在現實中如果發生意外咬到肉的意外可能會導致不同程度的傷害包括輕微的皮膚紅腫到嚴重的感染因此在處理這類意外時應保持冷靜並尋求適當的醫療協助 100010200 2 :00spanfonttdtrtabledivdiv 30ifont 
此篇報導完全是虛
----------------------------------------------------------------------------------------------------
