In [1]:
# Cell 1: Cài đặt môi trường và import thư viện
!pip install transformers datasets peft bitsandbytes accelerate scikit-learn

import os
import torch
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.13.0->peft)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.13.0->peft)
  Downloading nvidia_cu

2025-06-24 15:40:15.809867: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750779616.037957      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750779616.105687      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Cell 2: Chuẩn bị dataset (load CSV, split train/val/test, tokenize)
def prepare_datasets(
    csv_files: list[str],
    tokenizer,
    train_frac: float = 0.7,
    val_frac: float = 0.15,
    max_length: int = 2048,
    random_seed: int = 42
) -> DatasetDict:
    df = pd.concat([pd.read_csv(p) for p in csv_files], ignore_index=True)
    df = df[df['code'].notna() & df['label'].isin([0,1])]

    # first split train vs temp
    train_df, temp_df = train_test_split(
        df,
        test_size=1-train_frac,
        random_state=random_seed,
        stratify=df['label']
    )
    # then split temp into val/test equally
    val_df, test_df = train_test_split(
        temp_df,
        test_size=val_frac/(1-train_frac),
        random_state=random_seed,
        stratify=temp_df['label']
    )

    def tokenize_fn(batch):
        toks = tokenizer(
            batch['code'],
            truncation=True,
            max_length=max_length,
            padding='max_length'
        )
        toks['labels'] = batch['label']
        return toks

    ds = DatasetDict({
        'train': Dataset.from_pandas(train_df[['code','label']].reset_index(drop=True)),
        'validation': Dataset.from_pandas(val_df[['code','label']].reset_index(drop=True)),
        'test':  Dataset.from_pandas(test_df[['code','label']].reset_index(drop=True)),
    })
    ds = ds.map(
        tokenize_fn,
        batched=True,
        remove_columns=['code','label']
    )
    ds.set_format(type='torch', columns=['input_ids','attention_mask','labels'])
    return ds

csv_files = [
    "/kaggle/input/data-cleaned/labeled_data_cleaned_0.csv",
    "/kaggle/input/data-cleaned/labeled_data_cleaned_1.csv",
]

In [3]:
# Cell 3: Load tokenizer và model classification (4-bit + LoRA)
model_id = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
output_dir = "/kaggle/working/qwen2.5_lora"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=quant_config,
    num_labels=2,
    device_map="auto"
)
base_model.config.pad_token_id = tokenizer.pad_token_id
model = prepare_model_for_kbit_training(base_model)
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id
torch.cuda.empty_cache()

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-Coder-0.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Cell 4: Tạo dataset đã tokenize và DataCollator với dynamic padding
datasets = prepare_datasets(
    csv_files=csv_files,
    tokenizer=tokenizer,
    max_length=2048,
    random_seed=42
)
print("Train:", len(datasets['train']), "Val:", len(datasets['validation']), "Test:", len(datasets['test']))

data_collator = DataCollatorWithPadding(tokenizer)
torch.cuda.empty_cache()

Map:   0%|          | 0/89025 [00:00<?, ? examples/s]

Map:   0%|          | 0/19077 [00:00<?, ? examples/s]

Map:   0%|          | 0/19078 [00:00<?, ? examples/s]

Train: 89025 Val: 19077 Test: 19078


In [5]:
# Cell 5: Hàm metrics và TrainingArguments với validation
from transformers import EarlyStoppingCallback

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=3,
    fp16=True,
    dataloader_num_workers=4,
    logging_steps=500,
    eval_strategy="epoch",
    save_strategy="no",
    report_to="none",
)

In [6]:
# Cell 6: Khởi tạo Trainer và fine‑tune với validation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("🚀 Bắt đầu huấn luyện...")
trainer.train(resume_from_checkpoint="/kaggle/input/checkpoint")
print("✅ Hoàn thành huấn luyện.")

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🚀 Bắt đầu huấn luyện...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
2,0.064,0.079568,0.972218,0.953417,0.903994,0.928048


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

✅ Hoàn thành huấn luyện.


In [7]:
# Cell 7: Đánh giá trên test set
eval_trainer = Trainer(
    model=trainer.model,
    args=training_args,
    eval_dataset=datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("🔍 Running final evaluation on test set...")
metrics = eval_trainer.evaluate()

print("✅ Test set evaluation results:")
for key, val in metrics.items():
    if isinstance(val, float):
        print(f"  {key}: {val:.4f}")
    else:
        print(f"  {key}: {val}")

  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🔍 Running final evaluation on test set...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

✅ Test set evaluation results:
  eval_loss: 0.0897
  eval_model_preparation_time: 0.0030
  eval_accuracy: 0.9683
  eval_precision: 0.9468
  eval_recall: 0.8900
  eval_f1: 0.9175
  eval_runtime: 6128.9670
  eval_samples_per_second: 3.1130
  eval_steps_per_second: 0.3890


In [9]:
# import os, torch

# ckpt_dir = "/kaggle/working/manual_checkpoint"
# os.makedirs(ckpt_dir, exist_ok=True)

# trainer.save_model(ckpt_dir)
# trainer.state.save_to_json(os.path.join(ckpt_dir, "trainer_state.json"))
# torch.save(trainer.optimizer.state_dict(), os.path.join(ckpt_dir, "optimizer.pt"))
# torch.save(trainer.lr_scheduler.state_dict(), os.path.join(ckpt_dir, "scheduler.pt"))

# print(f"✅ Lưu manual checkpoint xong ở {ckpt_dir}")

# !zip -r checkpoint.zip /kaggle/working/manual_checkpoint
torch.cuda.empty_cache()
trainer.save_model(output_dir)
print(f"✅ Đã lưu model tại {output_dir}")
!zip -r model_qwen2.5_v1.zip /kaggle/working/qwen2.5_lora


✅ Đã lưu model tại /kaggle/working/qwen2.5_lora
  adding: kaggle/working/qwen2.5_lora/ (stored 0%)
  adding: kaggle/working/qwen2.5_lora/training_args.bin (deflated 52%)
  adding: kaggle/working/qwen2.5_lora/adapter_config.json (deflated 53%)
  adding: kaggle/working/qwen2.5_lora/merges.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 57%)
  adding: kaggle/working/qwen2.5_lora/added_tokens.json (deflated 67%)
  adding: kaggle/working/qwen2.5_lora/README.md (deflated 66%)
  adding: kaggle/working/qwen2.5_lora/tokenizer.json (deflated 81%)
  adding: kaggle/working/qwen2.5_lora/tokenizer_config.json (deflated 83%)
  adding: kaggle/working/qwen2.5_lora/vocab.json (deflated 61%)
  adding: kaggle/working/qwen2.5_lora/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/qwen2.5_lora/special_tokens_map.json (deflated 63%)
