In [None]:
import os
import json

import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel


## Convert the CSV file to JSONL (90% train / 10% validation)

In [None]:
# Source CSV and the two JSON Lines files derived from it.
CSV_FILE = "data/augmented_dataset.csv"
TRAIN_FILE = "train.jsonl"
VAL_FILE = "val.jsonl"

df = pd.read_csv(CSV_FILE)

# Both text columns are needed to build the records written further down.
required_columns = {"comment", "response"}
if not required_columns <= set(df.columns):
    raise ValueError("CSV phải có hai cột: 'comment' và 'response'!")

# Positional split in file order: the first 90% of the rows go to training,
# the remaining rows to validation (no shuffling is applied).
split = int(0.9 * len(df))
train_df = df.iloc[:split]
val_df = df.iloc[split:]

def convert_to_jsonl(df, out_file):
    """Write instruction-tuning records built from ``df`` to ``out_file`` as JSON Lines.

    Every kept row becomes one line holding a JSON object with three keys:
    ``"instruction"`` (a fixed prompt), ``"input"`` (the row's ``comment``)
    and ``"output"`` (the row's ``response``).

    Rows whose ``comment`` or ``response`` is missing are skipped. A missing
    cell would otherwise be serialised as the bare token ``NaN``, which is not
    valid JSON, and would produce a record with no usable text. Values are
    converted with ``str`` so that non-string cells (for example NumPy
    scalars from a numeric column) cannot make ``json.dumps`` fail.

    Args:
        df: DataFrame containing at least the columns ``comment`` and
            ``response``.
        out_file: Path of the file to create or overwrite (UTF-8).

    Raises:
        KeyError: If ``df`` lacks one of the two required columns.
    """
    instruction = "Hãy an ủi và đồng cảm với lời tâm sự sau:"
    complete_rows = df[["comment", "response"]].dropna()
    with open(out_file, "w", encoding="utf-8") as f:
        # Stream the rows straight to disk instead of materialising a list first.
        for comment, response in complete_rows.itertuples(index=False, name=None):
            record = {
                "instruction": instruction,
                "input": str(comment),
                "output": str(response),
            }
            # ensure_ascii=False keeps the Vietnamese text human-readable in the file.
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Materialise both splits on disk as JSON Lines, training split first.
for _frame, _path in ((train_df, TRAIN_FILE), (val_df, VAL_FILE)):
    convert_to_jsonl(_frame, _path)


In [None]:
# Base checkpoint to load and the directory holding the LoRA adapter files.
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
OUTPUT_DIR = "./lora_output"

# Text-overlap metrics used by compute_metrics.
rouge, bleu = (load_metric(metric_name) for metric_name in ("rouge", "bleu"))

# use_fast=False requests the slow (Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
# Reuse EOS as the padding token — presumably because the checkpoint ships
# without a dedicated pad token (TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

def compute_metrics(eval_pred):
    """Score decoded predictions against decoded references with ROUGE-L and BLEU.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, raw logits with a trailing vocabulary axis, or a tuple
            whose first element is one of those. ``labels`` are token ids in
            which ignored positions are marked with ``-100``.

    Returns:
        dict: ``{"rougeL": <mid F-measure>, "bleu": <BLEU score>}``, both
        rounded to four decimals.
    """
    preds, labels = eval_pred

    # Some models return a tuple of outputs; the first entry carries the
    # logits / token ids.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Raw logits of shape (batch, seq, vocab) must be reduced to token ids
    # before they can be decoded.
    # NOTE(review): for a causal LM the logit at position i scores token i+1,
    # so predictions and labels may be offset by one position — confirm how
    # the caller aligns them.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # -100 marks positions ignored by the loss (and is used as padding when
    # batches of different lengths are gathered); it is not a valid token id,
    # so swap it for the pad id, which skip_special_tokens then drops.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_result = rouge.compute(predictions=preds, references=labels)
    # BLEU expects whitespace-tokenised predictions and, per prediction, a
    # list of tokenised references.
    bleu_result = bleu.compute(
        predictions=[p.split() for p in preds],
        references=[[l.split()] for l in labels]
    )
    result = {
        "rougeL": rouge_result["rougeL"].mid.fmeasure,
        "bleu": bleu_result["bleu"]
    }
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the base checkpoint with 8-bit weights. The quantization settings are
# passed through `quantization_config`; the bare `load_in_8bit=True` keyword
# on `from_pretrained` is deprecated in favour of this form.
base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
# Attach the LoRA adapter stored in OUTPUT_DIR on top of the base model.
model = PeftModel.from_pretrained(base, OUTPUT_DIR)
# Inference only from here on: make sure dropout layers are disabled.
model.eval()

def gen_reply(user_text, instruction="Hãy an ủi và đồng cảm với người nói"):
    """Generate one sampled reply to ``user_text`` with the adapter-equipped model.

    The prompt is laid out as ``### Instruction`` / ``### Input`` /
    ``### Response`` sections and the model continues after the response
    header.

    Args:
        user_text: The message to respond to.
        instruction: Task description placed in the instruction section.
            NOTE(review): this default differs from the instruction string
            written into the JSONL records by ``convert_to_jsonl`` — confirm
            which wording the adapter was trained with.

    Returns:
        str: The generated continuation, stripped of surrounding whitespace.
        Sampling is enabled, so repeated calls can return different text.
    """
    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{user_text}\n\n### Response:\n"
    # Follow the model's own device instead of assuming "cuda", so the call
    # also works when the weights were placed elsewhere.
    ids = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**ids, max_new_tokens=256, do_sample=True, temperature=0.8)
    # The returned sequence starts with the prompt tokens. Decode only what
    # was generated after them, rather than splitting the decoded text on
    # "### Response:", which picks the wrong span whenever that marker occurs
    # in the user text or in the generation itself.
    prompt_len = ids["input_ids"].shape[1]
    new_tokens = out[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Quick manual check: generate and show one reply for a sample message.
test_text = "Mình cảm thấy rất cô đơn, không ai quan tâm đến mình cả."
reply = gen_reply(test_text)
print(reply)