In [1]:
### Cell 1: Cài đặt và import
!pip install transformers datasets peft bitsandbytes accelerate scikit-learn

import os
import gc
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments
)
from peft import get_peft_model, LoraConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.utils.class_weight import compute_class_weight

def free_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)

2025-07-10 09:00:01.527884: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752138001.729015      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752138001.784288      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
### Cell 2: Load CSV & Embeddings, Merge
csv_paths = {
    'train': '/kaggle/input/id-data/train.csv',
    'validation': '/kaggle/input/id-data/validation.csv',
    'test': '/kaggle/input/id-data/test.csv'
}
csv_dfs = {split: pd.read_csv(path)[['id','code','label']].dropna()
           for split, path in csv_paths.items()}

emb_pdg_paths = {
    'train': '/kaggle/input/emb-vector-pdg/train_emb.pt',
    'validation': '/kaggle/input/emb-vector-pdg/val_emb.pt',
    'test': '/kaggle/input/emb-vector-pdg/test_emb.pt'
}

merged = {}
for split in ['train','validation','test']:
    pdg_data = torch.load(emb_pdg_paths[split])
    df_pdg = pd.DataFrame({'id': pdg_data['ids'],  'pdg_emb': [e.cpu() for e in pdg_data['embeddings']]})
    df = csv_dfs[split].merge(df_pdg, on='id')
    merged[split] = df.reset_index(drop=True)
    print(f"{split}: {len(df)} samples")

train: 75909 samples
validation: 16189 samples
test: 16175 samples


In [3]:
### Cell 3: Tạo DatasetDict
raw = DatasetDict({
    split: Dataset.from_dict({
        'code': merged[split]['code'].tolist(),
        'pdg_emb': merged[split]['pdg_emb'].apply(lambda x: x.tolist()).tolist(),
        'label': merged[split]['label'].tolist()
    }) for split in merged
})

In [4]:
### Cell 4: Tokenize code and attach pdg_emb
model_ckpt = 'microsoft/codebert-base'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
max_length = 512

def preprocess(examples):
    toks = tokenizer(
        examples['code'], truncation=True,
        padding='max_length', max_length=max_length
    )
    toks['pdg_emb'] = examples['pdg_emb']
    toks['labels'] = examples['label']
    return toks

datasets = raw.map(preprocess, batched=True)
datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'pdg_emb', 'labels']
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Map:   0%|          | 0/75909 [00:00<?, ? examples/s]

Map:   0%|          | 0/16189 [00:00<?, ? examples/s]

Map:   0%|          | 0/16175 [00:00<?, ? examples/s]

In [15]:
### Cell 5: Định nghĩa model wrapper và gắn LoRA
class CodeBERTWithPDG(nn.Module):
    def __init__(self, base_model_name, pdg_dim, num_labels):
        super().__init__()
        # Load base encoder và cấu hình
        self.bert = AutoModel.from_pretrained(base_model_name)
        self.config = self.bert.config
        hidden_size = self.config.hidden_size
        # Head classification nối [CLS] với PDG embedding
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size + pdg_dim, num_labels)
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        pdg_emb=None,
        labels=None,
        **kwargs
    ):
        # Forward qua BERT, hỗ trợ input_ids hoặc inputs_embeds
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **{k: v for k, v in kwargs.items() if k in ['token_type_ids', 'position_ids']}
        )
        cls_rep = bert_outputs.last_hidden_state[:, 0, :]
        # Xử lý PDG embeddings: pdg_emb đã là Tensor shape [batch_size, pdg_dim]
        pdg_tensor = pdg_emb.to(cls_rep.device)
        # Concatenate và tính logits
        concat_vec = torch.cat([cls_rep, pdg_tensor], dim=1)
        logits = self.classifier(concat_vec)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
        return {'loss': loss, 'logits': logits}

# Khởi tạo model với LoRA
pdg_dim = merged['train']['pdg_emb'][0].shape[0]
num_labels = len(set(merged['train']['label']))
base_model = CodeBERTWithPDG('microsoft/codebert-base', pdg_dim, num_labels)
peft_config = LoraConfig(
    task_type='SEQ_CLS',
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['query', 'value']
)
model = get_peft_model(base_model, peft_config)
model.cuda()
free_memory()

In [20]:
### Cell 6: TrainingArguments và Trainer
training_args = TrainingArguments(
    output_dir='./codebert_lora',
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy='epoch',
    save_strategy='epoch',
    fp16=True,
    logging_steps=100,
    report_to='none'
)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='macro')
    return {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
### Cell 7: Huấn luyện và đánh giá
trainer.train()
trainer.save_model('./codebert_lora_checkpoint')

res = trainer.predict(datasets['test'])
print(res.metrics)
preds = res.predictions.argmax(-1)
print(classification_report(res.label_ids, preds, digits=4))

Epoch,Training Loss,Validation Loss


In [None]:
### Cell 8: Lưu model và tokenizer
model.save_pretrained('./codebert_lora_tuned')
tokenizer.save_pretrained('./codebert_lora_tuned')