In [1]:
from datasets import load_dataset

# Lệnh ngắn gọn để tải toàn bộ dataset
# Dữ liệu sẽ được lưu vào bộ nhớ (memory/cache) của session Kaggle
alpaca_dataset = load_dataset("tatsu-lab/alpaca")

# In cấu trúc để xác nhận
print(alpaca_dataset) 

# Lấy 5 ví dụ đầu tiên từ split 'train' để xem dữ liệu
train_split = alpaca_dataset['train']
first_5_examples_dict = train_split[:5]
print(first_5_examples_dict)
# một tập hợp các cặp chỉ dẫn/câu hỏi và phản hồi/câu trả lời (Instruction-Output Pairs).

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})
{'instruction': ['Give three tips for staying healthy.', 'What are the three primary colors?', 'Describe the structure of an atom.', 'How can we reduce air pollution?', 'Describe a time when you had to make a difficult decision.'], 'input': ['', '', '', '', ''], 'output': ['1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'The three primary colors are red, blue, and yellow.', 'An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the

In [2]:
def format_alpaca_to_chatbot_text(example):
    instruction = example['instruction']
    input_text = example['input']
    output_text = example['output']

    # --- KHUÔN MẪU TẠO CHUỖI ĐỂ FINETUNE LLM ---
    # Sử dụng định dạng tiêu chuẩn (Standard Prompting Format)
    
    if input_text:
        # Trường hợp có Input/Ngữ cảnh
        text = (
            f"### User Instruction:\n{instruction}\n\n"
            f"### Context Input:\n{input_text}\n\n"
            f"### Assistant Response:\n{output_text}"
        )
    else:
        # Trường hợp chỉ có Instruction (thường thấy ở Alpaca)
        text = (
            f"### User Instruction:\n{instruction}\n\n"
            f"### Assistant Response:\n{output_text}"
        )
        
    return {"text": text}

# Sau đó áp dụng cho dataset:
# alpaca_formatted = alpaca_dataset.map(format_alpaca_to_chatbot_text)

In [3]:
# Định nghĩa hàm tokenizer
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer(
    vocab_file="/kaggle/input/final2-data/tinyshakespeare.model",  # dùng model luôn
    sp_model_file="/kaggle/input/final2-data/tinyshakespeare.model",
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="</s>"
)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


In [4]:
#Test
text = "This is a test sentence."
tokens = tokenizer.tokenize(text)
print(tokens)
print(tokenizer.convert_tokens_to_ids(tokens))

['▁This', '▁is', '▁a', '▁test', '▁sentence', '.']
[470, 78, 5, 4224, 2381, 7961]


In [5]:
# Tokenize dữ liệu mới dựa trên tokenize cũ 
def tokenize(example):
    return tokenizer(
        example["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=512
    )

tokenized_dataset = alpaca_dataset.map(tokenize, batched=True, remove_columns=["text"])


Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("/kaggle/input/final2-data/")



2025-10-18 14:51:44.929474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760799105.118444      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760799105.169351      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at /kaggle/input/final2-data/ and are newly initialized: ['lm_head.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.ln_2.weight', 'transformer.h

In [7]:
# compatible_training_verbose.py
import os
import math
import torch
import inspect
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback,
)

# --- 0️⃣ Kiểm tra thiết bị ---
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    print(f"🚀 Using GPU: {device_name}")
else:
    print("⚠️ CUDA not available, using CPU")

# --- 1️⃣ Chuẩn bị dataset ---
train_dataset = (
    tokenized_dataset["train"]
    if isinstance(tokenized_dataset, dict) and "train" in tokenized_dataset
    else tokenized_dataset
)

# --- 2️⃣ Cấu hình TrainingArguments ---
args_dict = dict(
    output_dir="./alpaca_finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=3,
    fp16=True,
    save_strategy="epoch",          # 💾 lưu mỗi epoch
    save_total_limit=3,             # chỉ giữ 3 checkpoint gần nhất
    logging_steps=50,
    report_to="none",               # tránh lỗi khi không có wandb
    load_best_model_at_end=False,   # có thể bật nếu có eval
)

# --- 3️⃣ Tắt fp16 nếu không có GPU ---
if not torch.cuda.is_available():
    args_dict.pop("fp16", None)

# --- 4️⃣ Lọc key hợp lệ ---
sig = inspect.signature(TrainingArguments.__init__)
supported_keys = set(sig.parameters.keys()) - {"self"}
filtered_args = {k: v for k, v in args_dict.items() if k in supported_keys}
training_args = TrainingArguments(**filtered_args)

# --- 5️⃣ Data collator ---
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- 6️⃣ Callback in loss + perplexity ---
class LossLoggingCallback(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print("\n🔥 Training bắt đầu...\n")

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            loss = logs["loss"]
            ppl = math.exp(loss) if loss < 20 else float("inf")
            step = logs.get("step", state.global_step)
            epoch = state.epoch if state.epoch is not None else 0
            print(f"🧮 [Epoch {epoch:.2f}] Step {step:>5}: loss = {loss:.4f} | ppl = {ppl:.2f}")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics and "eval_loss" in metrics:
            ppl = math.exp(metrics["eval_loss"]) if metrics["eval_loss"] < 20 else float("inf")
            print(f"\n📊 Evaluation - Loss: {metrics['eval_loss']:.4f} | Perplexity: {ppl:.2f}\n")

# --- 7️⃣ Tạo Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[LossLoggingCallback()],
)

# --- 8️⃣ In loss trung bình ban đầu ---
try:
    model.eval()
    with torch.no_grad():
        total_loss = 0
        n_batches = 0
        for i, batch in enumerate(train_dataset):
            if i >= 10:
                break
            inputs = {k: torch.tensor(v).unsqueeze(0).to(model.device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss.item()
            total_loss += loss
            n_batches += 1
        if n_batches > 0:
            avg_loss = total_loss / n_batches
            print(f"\n📍 Average initial loss (sampled): {avg_loss:.4f} | PPL ≈ {math.exp(avg_loss):.2f}\n")
except Exception as e:
    print(f"⚠️ Could not compute initial loss preview: {e}")

# --- 9️⃣ Kiểm tra checkpoint mới nhất ---
latest_checkpoint = None
if os.path.exists(training_args.output_dir):
    checkpoints = [
        os.path.join(training_args.output_dir, d)
        for d in os.listdir(training_args.output_dir)
        if d.startswith("checkpoint")
    ]
    if checkpoints:
        latest_checkpoint = max(checkpoints, key=os.path.getmtime)
        print(f"📦 Found checkpoint: {latest_checkpoint}")

# --- 🔟 Resume training ---
if latest_checkpoint:
    print(f"🔁 Resuming from {latest_checkpoint} ...")
    trainer.train(resume_from_checkpoint=latest_checkpoint)
else:
    print("🆕 No checkpoint found, starting fresh training...")
    trainer.train()

print("\n✅ Training finished successfully!\n")


🚀 Using GPU: Tesla P100-PCIE-16GB


  trainer = Trainer(


⚠️ Could not compute initial loss preview: new(): invalid data type 'str'
🆕 No checkpoint found, starting fresh training...

🔥 Training bắt đầu...



`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,5.7368
100,4.7662
150,4.316
200,4.1938
250,4.0327
300,3.8678
350,3.8564
400,3.6644
450,3.5305
500,3.618


🧮 [Epoch 0.02] Step    50: loss = 5.7368 | ppl = 310.07
🧮 [Epoch 0.03] Step   100: loss = 4.7662 | ppl = 117.47
🧮 [Epoch 0.05] Step   150: loss = 4.3160 | ppl = 74.89
🧮 [Epoch 0.06] Step   200: loss = 4.1938 | ppl = 66.27
🧮 [Epoch 0.08] Step   250: loss = 4.0327 | ppl = 56.41
🧮 [Epoch 0.09] Step   300: loss = 3.8678 | ppl = 47.84
🧮 [Epoch 0.11] Step   350: loss = 3.8564 | ppl = 47.29
🧮 [Epoch 0.12] Step   400: loss = 3.6644 | ppl = 39.03
🧮 [Epoch 0.14] Step   450: loss = 3.5305 | ppl = 34.14
🧮 [Epoch 0.15] Step   500: loss = 3.6180 | ppl = 37.26
🧮 [Epoch 0.17] Step   550: loss = 3.6108 | ppl = 37.00
🧮 [Epoch 0.18] Step   600: loss = 3.5218 | ppl = 33.85
🧮 [Epoch 0.20] Step   650: loss = 3.4461 | ppl = 31.38
🧮 [Epoch 0.22] Step   700: loss = 3.4318 | ppl = 30.93
🧮 [Epoch 0.23] Step   750: loss = 3.3791 | ppl = 29.34
🧮 [Epoch 0.25] Step   800: loss = 3.3703 | ppl = 29.09
🧮 [Epoch 0.26] Step   850: loss = 3.3865 | ppl = 29.56
🧮 [Epoch 0.28] Step   900: loss = 3.2442 | ppl = 25.64
🧮 [Epoch

In [33]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ✅ Chỉ định đúng checkpoint (tránh lỗi weight)
model_path = "./alpaca_finetuned/checkpoint-9753"

# ✅ Load tokenizer và model
tokenizer = AutoTokenizer.from_pretrained("./alpaca_finetuned")
model = AutoModelForCausalLM.from_pretrained(model_path)

# ✅ Resize phòng trường hợp token mới
model.resize_token_embeddings(len(tokenizer))

# ✅ Tạo pipeline chạy CPU
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device="cpu"   # 👈 Quan trọng
)

# ✅ Prompt test
prompt = """### Instruction:
Explain what Artificial Intelligence is.

### Input:
None

### Response:
"""

output = pipe(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)[0]["generated_text"]

print("\n=== OUTPUT ===")
print(output)


Device set to use cpu



=== OUTPUT ===
### Instruction:
Explain what Artificial Intelligence is.

### Input:
None

### Response:
AI is its keyworks, and translation. AI uses in automate data and the user service that automate data, providing for targets to accuracy, and decisions. AI can help users with other handwords, allowing accuracy, and more accuracy, and to their data, and analyze and data and acc
