In [1]:
# Cell 1: Cài đặt môi trường và import thư viện
!pip install transformers datasets peft bitsandbytes accelerate scikit-learn

import os
import torch
import random
import re
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from transformers import PreTrainedModel, PreTrainedTokenizerBase
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.13.0->peft)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.13.0->peft)
  Downloading nvidia_cu

2025-06-21 04:12:22.167547: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750479142.348316      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750479142.424778      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Cell 2: Chuẩn bị dataset (load CSV, split train/validation, tokenize với max_length=2048)

def prepare_datasets(
    csv_files: list[str],
    tokenizer,
    train_frac: float = 0.7,
    val_frac: float = 0.15,
    max_length: int = 512,
    random_seed: int = 42
) -> DatasetDict:
    df = pd.concat([pd.read_csv(p) for p in csv_files], ignore_index=True)
    df = df[df['code'].notna() & df['label'].isin([0,1])]

    train_df, temp_df = train_test_split(
        df,
        test_size=1-train_frac,
        random_state=random_seed,
        stratify=df['label']
    )

    val_df, test_df = train_test_split(
        temp_df,
        test_size=val_frac/(1-train_frac),
        random_state=random_seed,
        stratify=temp_df['label']
    )

    def tokenize_fn(batch):
        toks = tokenizer(
            batch['code'],
            truncation=True,
            max_length=max_length,
            padding='max_length'
        )
        toks['labels'] = batch['label']
        return toks

    ds = DatasetDict({
        'train': Dataset.from_pandas(train_df[['code','label']].reset_index(drop=True)),
        'validation': Dataset.from_pandas(val_df[['code','label']].reset_index(drop=True)),
        'test':  Dataset.from_pandas(test_df[['code','label']].reset_index(drop=True)),
    })
    ds = ds.map(
        tokenize_fn,
        batched=True,
        remove_columns=['code','label']
    )
    ds.set_format(type='torch', columns=['input_ids','attention_mask','labels'])
    return ds

# Đường dẫn CSV:
csv_files = [
    "/kaggle/input/data-cleaned/labeled_data_cleaned_0.csv",
    "/kaggle/input/data-cleaned/labeled_data_cleaned_1.csv",
]

In [3]:
# Cell 3: Load tokenizer và model classification (LoRA)
model_id = "microsoft/codebert-base"
output_dir = "/kaggle/working/codebert_lora"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
    torch_dtype=torch.float16,
    device_map="auto"
)
base_model.config.pad_token_id = tokenizer.pad_token_id
model = prepare_model_for_kbit_training(base_model)
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id
torch.cuda.empty_cache()


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Cell 4: Tạo dataset đã tokenize và DataCollator
datasets = prepare_datasets(
    csv_files=csv_files,
    tokenizer=tokenizer,
    max_length=512,
    random_seed=42
)

print("Train:", len(datasets['train']), "Val:", len(datasets['validation']), "Test:", len(datasets['test']))

data_collator = DataCollatorWithPadding(tokenizer)
torch.cuda.empty_cache()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Map:   0%|          | 0/101744 [00:00<?, ? examples/s]

Map:   0%|          | 0/12717 [00:00<?, ? examples/s]

Map:   0%|          | 0/12719 [00:00<?, ? examples/s]

Train: 101744 Val: 12717 Test: 12719


In [5]:
# Cell 5: Hàm tính metrics và TrainingArguments
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    return {
        "accuracy": accuracy, 
        "precision": precision, 
        "recall": recall, 
        "f1": f1
    }

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=3,
    fp16=True,
    dataloader_num_workers=4,
    logging_steps=500,
    eval_strategy="epoch",
    report_to="none",
)

In [6]:
# Cell 6: Khởi tạo Trainer và fine‑tune với validation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("🚀 Bắt đầu huấn luyện...")
trainer.train(resume_from_checkpoint=False)
print("✅ Hoàn thành huấn luyện.")

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🚀 Bắt đầu huấn luyện...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3214,0.293514,0.878352,0.722781,0.626735,0.671341
2,0.2698,0.274155,0.885508,0.711393,0.710829,0.711111


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

✅ Hoàn thành huấn luyện.


In [7]:
# Cell 7: Đánh giá trên test set
eval_trainer = Trainer(
    model=trainer.model,
    args=training_args,
    eval_dataset=datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("🔍 Running final evaluation on test set...")
metrics = eval_trainer.evaluate()

print("✅ Test set evaluation results:")
for key, val in metrics.items():
    if isinstance(val, float):
        print(f"  {key}: {val:.4f}")
    else:
        print(f"  {key}: {val}")

  eval_trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🔍 Running final evaluation on test set...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.2725101709365845, 'eval_model_preparation_time': 0.0052, 'eval_accuracy': 0.887569777498231, 'eval_precision': 0.7181127548980408, 'eval_recall': 0.7124157080523602, 'eval_f1': 0.7152528872958982, 'eval_runtime': 200.2156, 'eval_samples_per_second': 63.527, 'eval_steps_per_second': 7.941}


In [8]:
torch.cuda.empty_cache()
trainer.save_model(output_dir)
print(f"✅ Đã lưu model tại {output_dir}")

✅ Đã lưu model tại /kaggle/working/codebert_lora
