In [None]:
"""
@misc{traditional-chinese-alpaca,
  author = {Wei-Lin Chen and Cheng-Kuang Wu and Hsin-Hsi Chen},
  title = {Traditional-Chinese Alpaca: Models and Datasets},
  year = {2023},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\\url{https://github.com/ntunlplab/traditional-chinese-alpaca}},
}
本程式使用了此研究提供的資料集並參考其部分程式碼。
"""

In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell runs (edits to local .py files are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2


In [None]:
!sudo add-apt-repository ppa:ubuntu-toolchain-r/test
!sudo apt-get update
!sudo apt-get install --only-upgrade libstdc++6
!strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX
! pip install transformers datasets torch trl bitsandbytes
!Y| pip uninstall peft
! pip install peft==0.9.0

In [None]:
!wget https://raw.githubusercontent.com/ntunlplab/traditional-chinese-alpaca/main/data/alpaca-tw_en_instruction.json
!mkdir chatbot_model

In [None]:
import wandb

# Track the dataset upload as its own W&B run. Using the run as a context
# manager finishes it on exit -- including when add_file/log_artifact raises --
# so a failed upload does not leave a dangling active run behind.
with wandb.init(project="huggingface", job_type="model_training") as run:
    # Artifact holding the instruction-tuning dataset (a single JSON file).
    dataset = wandb.Artifact('alpaca-tw_en', type='dataset')
    dataset.add_file('alpaca-tw_en_instruction.json')
    # Log the artifact so it is saved as an output of this run.
    run.log_artifact(dataset)

In [None]:
import os
import sys
import argparse

import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from peft import PeftModel

from transformers import (
  BertTokenizerFast,
  AutoModel,
  AutoTokenizer,
  GPT2LMHeadModel,
  TrainingArguments,
  DataCollatorForLanguageModeling,
  Trainer,
  TrainerCallback
)
from peft import (
    #prepare_model_for_int8_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)
import wandb
# ---- Batch sizing ----------------------------------------------------------
MICRO_BATCH_SIZE = 4      # examples per device per forward/backward pass
BATCH_SIZE = 128          # target effective batch size
# Number of micro-batches that add up to one BATCH_SIZE (128 // 4 == 32).
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

# ---- Optimisation / sequence length ----------------------------------------
# EPOCHS = 6
LEARNING_RATE = 3e-4
CUTOFF_LEN = 256          # token budget used when tokenizing prompts
VAL_SET_SIZE = 0

# ---- LoRA settings ----------------------------------------------------------
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj", "v_proj"]

class WandbUploadCallback(TrainerCallback):
    """Trainer callback that uploads the checkpoint directory to W&B on every save."""

    def on_save(self, args, state, control, **kwargs):
        """Log the Trainer's output directory as a W&B model artifact named 'gpt2'.

        Args:
            args: Training arguments of the running Trainer; ``output_dir`` is
                the directory that gets uploaded.
            state: Trainer state; consulted so that only the main process uploads.
            control: Trainer control object (unused).
            **kwargs: Additional callback arguments (unused).
        """
        # Under multi-process training every process fires on_save; let only
        # the main one upload so the same files are not logged several times.
        if not getattr(state, "is_world_process_zero", True):
            return
        # Upload whatever directory the Trainer actually writes to instead of a
        # hard-coded name; fall back to the original literal if it is unset.
        checkpoint_dir = getattr(args, "output_dir", None) or 'chatbot_model'
        arti_model = wandb.Artifact('gpt2', type='model')
        arti_model.add_dir(checkpoint_dir)
        wandb.log_artifact(arti_model)

def generate_and_tokenize_prompt(data_point):
    """Render one instruction record as a prompt and tokenize it for causal-LM training.

    The prompt part of the sequence (and any padding) is masked out of the
    labels with -100, so the loss is computed only on the response tokens.

    Relies on the module-level ``tokenizer`` and ``CUTOFF_LEN``.

    Args:
        data_point: Mapping with the keys "instruction", "input" (may be
            empty/falsy) and "output".

    Returns:
        dict with "input_ids", "labels" and "attention_mask"; the three lists
        have the same length.
    """
    # The prompt text (including the indentation inside the triple-quoted
    # strings) is runtime data and is kept exactly as it was.
    # NOTE(review): the no-input template labels the instruction "### 輸入:"
    # rather than "### 指令:" -- confirm this is intended before changing it,
    # since inference code must use the same template.
    user_prompt = (
        (
            f"""下方是一個關於任務的指令，以及一個提供與任務相關之資訊的輸入。請撰寫一個能適當地完成該任務需求的回覆。
            ### 指令:
            {data_point["instruction"]}
            ### 輸入:
            {data_point["input"]}
            ### 回覆:
            """
        )
        if data_point["input"]
        else (
            f"""下方是一個關於任務的指令。請撰寫一個能適當地完成該任務需求的回覆。
            ### 輸入:
            {data_point["instruction"]}
            ### 回覆:
            """
        )
    )

    # Measure the prompt WITHOUT padding. The previous version padded this call
    # to max_length, so the measured length was always CUTOFF_LEN and every
    # label ended up as -100 (nothing was supervised).
    prompt_ids = tokenizer(
        user_prompt,
        truncation=True,
        max_length=CUTOFF_LEN + 1,
    )["input_ids"]
    # NOTE(review): the "- 1" is kept from the original ("no eos token"); it
    # assumes the tokenizer appends an EOS token. For a tokenizer that appends
    # nothing, the last prompt token is left unmasked -- confirm for the
    # tokenizer in use.
    len_user_prompt_tokens = max(len(prompt_ids) - 1, 0)

    encoded = tokenizer(
        user_prompt + data_point["output"],
        truncation=True,
        max_length=CUTOFF_LEN + 1,
        padding="max_length",
    )
    full_tokens = encoded["input_ids"][:-1]
    # Use the tokenizer's own mask so padding is not attended to; fall back to
    # the previous all-ones mask if the tokenizer does not return one.
    if "attention_mask" in encoded:
        attention_mask = list(encoded["attention_mask"][:-1])
    else:
        attention_mask = [1] * len(full_tokens)

    # Supervise a position only if it lies after the prompt and is a real
    # (non-padding) token. Assumes padding is appended on the right, so the
    # prompt occupies the first positions -- TODO confirm tokenizer.padding_side.
    labels = [
        token if (position >= len_user_prompt_tokens and is_real) else -100
        for position, (token, is_real) in enumerate(zip(full_tokens, attention_mask))
    ]
    return {
        "input_ids": full_tokens,
        "labels": labels,
        "attention_mask": attention_mask,
    }
if __name__ == "__main__":
    # Fine-tune GPT-2 with the Hugging Face Trainer and track the run in W&B.

    # Multi-process bookkeeping. None of these three values is consumed by the
    # code visible in this cell.
    device_map = "auto"
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1

    # NOTE(review): the run name advertises BATCH_SIZE (128), but training
    # below uses MICRO_BATCH_SIZE per device and no gradient accumulation
    # (GRADIENT_ACCUMULATION_STEPS is never passed) -- confirm which is intended.
    wandb.init(
        project='huggingface',
        name='batch_size: %d' % BATCH_SIZE,
        resume = 'allow')

    # Record the training data and the current contents of the checkpoint
    # directory as W&B artifacts.
    arti_dataset = wandb.Artifact('alpaca-tw_en_instruction', type='dataset')
    arti_dataset.add_file('alpaca-tw_en_instruction.json')
    wandb.log_artifact(arti_dataset)

    arti_model = wandb.Artifact('gpt2', type='model')
    arti_model.add_dir('chatbot_model')
    wandb.log_artifact(arti_model)

    model_name = 'gpt2'
    train_set = ''  # 'bst', 'wiki', or anything else for the Alpaca JSON file
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Best-effort restore of earlier weights. The failure used to be swallowed
    # by a bare `except: pass`; it is now reported so it is visible that
    # training starts from the pretrained checkpoint.
    # NOTE(review): run_path is a placeholder, and `load_weights` is not a
    # method PyTorch modules provide, so this presumably never succeeds as
    # written -- replace with a real restore path or remove.
    try:
        best_model = wandb.restore('gpt2',run_path = '%my W&B ProjectName%')
        model.load_weights(best_model.name)
    except Exception as exc:
        print(f"Skipping W&B weight restore: {exc!r}")

    # The tokenizer is given a pad token by reusing EOS.
    tokenizer.pad_token = tokenizer.eos_token
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    def tokenize_function(examples):
        """Tokenize the "text" column to fixed-length (128-token) sequences."""
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    if train_set =='bst':
        dataset = load_dataset("blended_skill_talk", split = 'train')

        def format_conversation(example):
            """Flatten one dialogue record into alternating "A:" / "B:" lines."""
            formatted_text = ""
            if example["previous_utterance"]:
                formatted_text += f"A: {example['previous_utterance']}\n"

            for i, msg in enumerate(example["free_messages"]):
                speaker = "A" if i % 2 == 0 else "B"
                formatted_text += f"{speaker}: {msg}\n"

            for i, msg in enumerate(example["guided_messages"]):
                speaker = "A" if i % 2 == 0 else "B"
                formatted_text += f"{speaker}: {msg}\n"

            return {"text": formatted_text}

        train_dataset = dataset.map(format_conversation)
        # Previously this branch never defined `train_data`, so the Trainer
        # construction below raised NameError. Tokenize the "text" column and
        # keep only the tokenizer outputs.
        train_data = train_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names,
        )
    elif train_set == 'wiki':
        train_dataset = load_dataset("wikitext", "wikitext-2-raw-v1",split = 'train').select(range(4000))
        # Same fix as above: produce the `train_data` the Trainer consumes.
        train_data = train_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names,
        )
    else:
        data = load_dataset(
            "json",
            data_files="alpaca-tw_en_instruction.json"
        )
        train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)
        val_data = None

    training_args = TrainingArguments(
        output_dir="./chatbot_model",
        # Same value as before (4), now taken from the shared constant.
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        num_train_epochs=1,
        logging_steps=100,
        save_steps=100,
        save_total_limit=1,
        # No evaluation is configured: "no" is the default strategy, so the
        # explicit `evaluation_strategy="no"` was dropped (that keyword is
        # deprecated in newer transformers releases in favour of `eval_strategy`).
        fp16=torch.cuda.is_available(),
    )

    # NOTE(review): with mlm=False this collator builds `labels` from
    # `input_ids`, so the prompt-masked labels returned by
    # generate_and_tokenize_prompt are presumably overwritten -- confirm, and
    # switch collators if loss should be restricted to the response.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )
    model = model.to(device)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[WandbUploadCallback()]
    )
    # Start training
    trainer.train()