In [1]:
import argparse
import os

import huggingface_hub
import torch
from accelerate import DataLoaderConfiguration

import wandb
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [2]:
# --- Run configuration: every value a reader might want to tune lives here ---

# Model and tokenizer identifiers, plus an optional name for the tuned model.
# NOTE(review): `tokenizer` is only an identifier string at this point; a later
# cell rebinds the same name to the loaded tokenizer object.
base_model: str = "meta-llama/Llama-2-7b-chat-hf"
tokenizer: str = "meta-llama/Llama-2-7b-chat-hf"
name_or_path_for_fine_tuned_model: str = ""

# Experiment tracking.
experiment_detail: str = ""
wandb_mode: str = "disabled"

# Training switches.
num_epochs: int = 1
enable_flash_attention_2: bool = False

# Text file holding the chat-template definition (read by the next cell).
chat_template_file: str = "./chat_template/gemma.txt"

# System prompt selection:
#   "disabled" -> None (no system turn), "default" -> empty string,
#   any other mode -> the custom prompt text.
system_prompt_mode: str = "disabled"
system_prompt = {"disabled": None, "default": ""}.get(system_prompt_mode, "some what")

In [3]:
# Read the chat-template definition from disk. The context manager closes the
# file handle deterministically; the previous one-liner never closed it.
with open(chat_template_file, "r", encoding="utf-8") as template_file:
    chat_template_source: str = template_file.read()

# SECURITY NOTE(review): eval() executes whatever Python expression the file
# contains, so the template file must be trusted. If the file only ever holds
# a plain dict literal, ast.literal_eval would be a safer drop-in — confirm
# against the template files before switching.
# Later cells read the keys "template", "instruction", "response" and
# "special_tokens" from the resulting dict.
chat_template: dict = eval(chat_template_source)

In [4]:
# Load the python-dotenv IPython extension and run it, so variables defined in
# a local .env file are available through os.environ before the login cell
# reads HF_TOKEN and WANDB_API_KEY.
%load_ext dotenv
%dotenv

In [5]:
# Authenticate against the Hugging Face Hub and Weights & Biases.
# Both secrets come from the environment; an unset variable falls back to "".
huggingface_hub.login(
    token=os.getenv("HF_TOKEN", ""),
    add_to_git_credential=True,
)
wandb.login(
    key=os.getenv("WANDB_API_KEY", ""),
    relogin=True,
)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/hermeschen/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/hermeschen/.netrc


True

In [6]:
# Snapshot of the run inputs that gets attached to the W&B run.
wandb_config: dict = {
    "base_model": base_model,
    "tokenizer": tokenizer,
    "name_or_path_for_fine_tuned_model": name_or_path_for_fine_tuned_model,
    "system_prompt": system_prompt,
}

# Config key -> key inside the chat-template definition.
template_fields = {
    "chat_template": "template",
    "instruction_template": "instruction",
    "response_template": "response",
    "special_tokens": "special_tokens",
}
wandb_config.update({
    config_key: chat_template[template_key]
    for config_key, template_key in template_fields.items()
})

# Start (or auto-resume) the tracking run; `wandb_mode` controls whether
# anything is actually sent.
wandb.init(
    project="emotion-chat-bot-ncu",
    group="Response Generator",
    job_type="fine-tuning",
    notes=experiment_detail,
    config=wandb_config,
    mode=wandb_mode,
    resume="auto",
)



## Load Dataset

In [7]:
# Load the first 10 training and first 10 validation rows of "daily_dialog",
# then drop the "act" column.
dataset = load_dataset(
    "daily_dialog",
    split="train[:10]+validation[:10]",
    num_proc=16,
    trust_remote_code=True,
)
dataset = dataset.remove_columns("act")

In [8]:
# Keep the integer labels under "emotion_id" so "emotion" can hold label names.
dataset = dataset.rename_column("emotion", "emotion_id")

# Label names are read from the column's feature metadata; index 0 is
# relabelled "neutral".
emotion_labels: list = dataset.features["emotion_id"].feature.names
emotion_labels[0] = "neutral"


def _ids_to_emotion_names(batch):
    """Translate each conversation's emotion ids into their label names."""
    return {
        "emotion": [
            [emotion_labels[label_id] for label_id in conversation]
            for conversation in batch
        ]
    }


dataset = dataset.map(
    _ids_to_emotion_names,
    input_columns="emotion_id",
    remove_columns="emotion_id",
    batched=True,
    num_proc=16,
)

Map (num_proc=16):   0%|          | 0/20 [00:00<?, ? examples/s]

In [9]:
def _strip_utterances(batch):
    """Trim leading/trailing whitespace from every utterance in the batch."""
    return {
        "dialog": [
            [utterance.strip() for utterance in conversation]
            for conversation in batch
        ]
    }


dataset = dataset.map(
    _strip_utterances,
    input_columns="dialog",
    batched=True,
    num_proc=16,
)

Map (num_proc=16):   0%|          | 0/20 [00:00<?, ? examples/s]

In [10]:
def _to_chat_turns(batch):
    """Pair each utterance with its emotion and assign alternating roles.

    Even positions (0, 2, ...) become "user" turns, odd positions "assistant".
    """
    conversations = []
    for emotions, utterances in zip(batch["emotion"], batch["dialog"]):
        turns = []
        for position, (emotion, utterance) in enumerate(zip(emotions, utterances)):
            turns.append({
                "role": "user" if position % 2 == 0 else "assistant",
                "content": {"emotion": emotion, "dialog": utterance},
            })
        conversations.append(turns)
    return {"prompt": conversations}


dataset = dataset.map(
    _to_chat_turns,
    remove_columns=["emotion", "dialog"],
    batched=True,
    num_proc=16,
)

Map (num_proc=16):   0%|          | 0/20 [00:00<?, ? examples/s]

In [11]:
def _drop_unpaired_last_turn(batch):
    """Remove the final turn of any conversation with an odd number of turns.

    Turns alternate user/assistant starting with user, so an odd length means
    the conversation ends on a user turn with no assistant reply.
    """
    return {
        "prompt": [
            turns if len(turns) % 2 == 0 else turns[:-1]
            for turns in batch
        ]
    }


dataset = dataset.map(
    _drop_unpaired_last_turn,
    input_columns="prompt",
    batched=True,
    num_proc=16,
)

Map (num_proc=16):   0%|          | 0/20 [00:00<?, ? examples/s]

In [12]:
# Only add a system turn when a system prompt mode is enabled.
if system_prompt_mode != "disabled":

    def _prepend_system_turn(batch):
        """Put a system turn (with no emotion) in front of every conversation."""
        return {
            "prompt": [
                [{
                    "role": "system",
                    "content": {"emotion": None, "dialog": system_prompt},
                }, *turns]
                for turns in batch
            ]
        }

    dataset = dataset.map(
        _prepend_system_turn,
        input_columns="prompt",
        batched=True,
        num_proc=16,
    )

In [13]:
# NOTE(review): the tokenizer is loaded from `base_model`; the `tokenizer`
# identifier string from the configuration cell is overwritten here without
# being used for loading — confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Install the project's chat template and tokenization settings.
tokenizer.chat_template = chat_template["template"]
tokenizer.clean_up_tokenization_spaces = True
tokenizer.padding_side = "right"

# Register the template's special tokens; the call's return value is the
# cell's displayed output.
template_special_tokens = chat_template["special_tokens"]
tokenizer.add_special_tokens(template_special_tokens, replace_additional_special_tokens=True)

7

In [14]:
def prompt_compose(sample: list) -> str:
    """Render one conversation into prompt text using the tokenizer's chat template.

    Args:
        sample: The conversation to render — the list of message dicts
            ("role" / "content") stored in the dataset's "prompt" column.

    Returns:
        The rendered prompt as plain, untokenized text.
    """
    # `padding`, `max_length` and `return_tensors` only take effect when the
    # rendered template is tokenized. With tokenize=False they were no-ops,
    # so they are no longer passed.
    return tokenizer.apply_chat_template(sample, tokenize=False)

In [15]:
# Replace each conversation with its rendered prompt text (row-wise map).
dataset = dataset.map(
    lambda conversation: {"prompt": prompt_compose(conversation)},
    input_columns="prompt",
    num_proc=16,
)

# Record the first rendered prompt in the run config as a reference example.
first_row = dataset[0]
wandb.config["example_prompt"] = first_row["prompt"]

Map (num_proc=16):   0%|          | 0/20 [00:00<?, ? examples/s]

In [16]:
# Hold out 10% of the rows for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test membership, reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

## Configurations

In [17]:
# 4-bit NF4 quantization with float16 compute and no double quantization.
nf4_quantization = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Quantization is kept only when CUDA is available; otherwise the model is
# loaded without a quantization config.
quantization_config = nf4_quantization if torch.cuda.is_available() else None

# Log the effective settings (an empty dict when quantization is off).
wandb.config["quantization_configuration"] = (
    {} if quantization_config is None else quantization_config.to_dict()
)

In [18]:
# LoRA adapter settings for causal language modelling: rank 8, alpha 16,
# dropout 0.1, no bias parameters trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Keep the adapter settings alongside the run for later comparison.
wandb.config["lora_configuration"] = lora_config.to_dict()

In [None]:
# dataloader_config = DataLoaderConfiguration(
#     dispatch_batches=None,
#     split_batches=False,
#     even_batches=True,
#     use_seedable_sampler=True
# )

In [19]:
# Training hyper-parameters, grouped by concern.
trainer_arguments = TrainingArguments(
    # --- Checkpointing / model selection ---
    output_dir="./checkpoints",
    overwrite_output_dir=True,
    save_steps=25,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # NOTE(review): per the TrainingArguments docs this field is not consumed
    # by Trainer itself, and the training cell calls train() without a
    # checkpoint argument — confirm whether resuming is actually intended.
    resume_from_checkpoint=True,

    # --- Run length and batching ---
    num_train_epochs=num_epochs,
    max_steps=-1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=1,
    auto_find_batch_size=True,
    group_by_length=True,
    dataloader_num_workers=16,

    # --- Optimisation ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # NOTE(review): a warmup ratio is set together with the plain "constant"
    # schedule — confirm warmup is applied ("constant_with_warmup" also exists).
    lr_scheduler_type="constant",
    warmup_ratio=0.03,

    # --- Precision / memory ---
    bf16=False,
    fp16=False,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    torch_compile=False,

    # --- Evaluation and logging ---
    evaluation_strategy="steps",
    # NOTE(review): with the small dataset slice used in this notebook the run
    # ends well before 500 steps — confirm evaluation is expected to trigger.
    eval_delay=500,
    logging_steps=25,
    report_to=["wandb"],
)

# Log the full argument set with the run.
wandb.config["trainer_arguments"] = trainer_arguments.to_dict()

## Load Model

In [20]:
# Attention implementation passed to the model loader: "flash_attention_2"
# when enabled in the configuration cell, otherwise None. The annotation is
# `str | None` because the disabled case assigns None.
flash_attention: str | None = "flash_attention_2" if enable_flash_attention_2 else None

In [21]:
# Keyword options for loading the causal-LM checkpoint.
model_load_options = dict(
    quantization_config=quantization_config,
    attn_implementation=flash_attention,
    pretraining_tp=1,
    use_cache=False,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# NOTE: `base_model` is rebound here from the checkpoint identifier (str) to
# the loaded model object, so this cell cannot be re-run without first
# re-running the configuration cell.
base_model = AutoModelForCausalLM.from_pretrained(base_model, **model_load_options)

# Resize the embedding matrix to the tokenizer's current vocabulary size
# (special tokens were added to the tokenizer in an earlier cell).
base_model.resize_token_embeddings(len(tokenizer))

# Log the resulting model configuration with the run.
wandb.config["base_model_configuration"] = base_model.config.to_dict()

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

## Setup Tuner

In [22]:
# Completion-only collator, configured with the instruction and response
# markers taken from the chat-template definition.
response_marker = chat_template["response"]
instruction_marker = chat_template["instruction"]

data_collator = DataCollatorForCompletionOnlyLM(
    response_template=response_marker,
    instruction_template=instruction_marker,
    tokenizer=tokenizer,
)

In [23]:
# Assemble the supervised fine-tuning trainer from the pieces built above.
tuner = SFTTrainer(
    # Model and adapter
    model=base_model,
    peft_config=lora_config,
    tokenizer=tokenizer,
    # Data: the rendered prompt text lives in the "prompt" column
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="prompt",
    dataset_num_proc=16,
    max_seq_length=4096,
    data_collator=data_collator,
    # Training hyper-parameters
    args=trainer_arguments,
)

Map (num_proc=16):   0%|          | 0/16 [00:00<?, ? examples/s]

num_proc must be <= 4. Reducing num_proc to 4 for dataset of size 4.


Map (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [24]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
# NOTE(review): the recorded output of this cell reports training_loss=0.0 —
# confirm the loss computation (e.g. that the collator's response marker is
# found in the rendered prompts) before trusting this run.
tuner.train()

[2024-04-14 23:30:45,351] [16/0] torch._dynamo.variables.higher_order_ops: [ERROR] hasattr TupleVariable to
[2024-04-14 23:30:45,351] [16/0] torch._dynamo.variables.higher_order_ops: [ERROR] Traceback (most recent call last):
[2024-04-14 23:30:45,351] [16/0] torch._dynamo.variables.higher_order_ops: [ERROR]   File "/home/hermeschen/.cache/pypoetry/virtualenvs/chat-bot-uhayQKRl-py3.11/lib/python3.11/site-packages/torch/_dynamo/variables/higher_order_ops.py", line 242, in speculate_subgraph
[2024-04-14 23:30:45,351] [16/0] torch._dynamo.variables.higher_order_ops: [ERROR]     output = f.call_function(tx, args, sub_kwargs)
[2024-04-14 23:30:45,351] [16/0] torch._dynamo.variables.higher_order_ops: [ERROR]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[2024-04-14 23:30:45,351] [16/0] torch._dynamo.variables.higher_order_ops: [ERROR]   File "/home/hermeschen/.cache/pypoetry/virtualenvs/chat-bot-uhayQKRl-py3.11/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py", line 294,

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


TrainOutput(global_step=4, training_loss=0.0, metrics={'train_runtime': 31.7906, 'train_samples_per_second': 0.503, 'train_steps_per_second': 0.126, 'total_flos': 200885839331328.0, 'train_loss': 0.0, 'epoch': 1.0})

In [25]:
# Close the W&B run that was opened by wandb.init() earlier in the notebook.
wandb.finish()