In [11]:
### Cell 1: Cài đặt và import
!pip install transformers datasets peft bitsandbytes accelerate scikit-learn

import os
import gc
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from peft import get_peft_model, PrefixTuningConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.utils.class_weight import compute_class_weight

def free_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [12]:
### Cell 2: Load CSV & Embeddings, Merge
# CSV paths
csv_paths = {
    'train': '/kaggle/input/id-data/train.csv',
    'validation': '/kaggle/input/id-data/validation.csv',
    'test': '/kaggle/input/id-data/test.csv'
}
csv_dfs = {s: pd.read_csv(p)[['id','code','label']].dropna() for s,p in csv_paths.items()}

# Embedding paths
emb_paths = {
    'code': {
        'train': '/kaggle/input/emb-token/train_emb.pt',
        'validation': '/kaggle/input/emb-token/val_emb.pt',
        'test': '/kaggle/input/emb-token/test_emb.pt'
    },
    'pdg': {
        'train': '/kaggle/input/emb-pdg/train_emb.pt',
        'validation': '/kaggle/input/emb-pdg/val_emb.pt',
        'test': '/kaggle/input/emb-pdg/test_emb.pt'
    }
}

merged = {}
for split in ['train','validation','test']:
    code_data = torch.load(emb_paths['code'][split])
    pdg_data  = torch.load(emb_paths['pdg'][split])
    df_code = pd.DataFrame({'id': code_data['ids'], 'code_emb': [e.cpu() for e in code_data['embeddings']]})
    df_pdg  = pd.DataFrame({'id': pdg_data['ids'],  'pdg_emb':  [e.cpu() for e in pdg_data['embeddings']]})
    df = csv_dfs[split].merge(df_code, on='id').merge(df_pdg, on='id')
    merged[split] = df.reset_index(drop=True)
    print(f"{split}: {len(df)} samples")

train: 75909 samples
validation: 16189 samples
test: 16175 samples


In [13]:
### Cell 3: Tạo DatasetDict cho prefix-tuning
# Build DatasetDict with code text and PDG embeddings
raw = DatasetDict({
    split: Dataset.from_dict({
        'code': merged[split]['code'].tolist(),
        'pdg_emb': merged[split]['pdg_emb'].apply(lambda x: x.tolist()).tolist(),
        'label': merged[split]['label'].tolist()
    }) for split in merged
})

In [17]:
### Cell 4: Tokenize code and attach pdg_emb
model_ckpt = 'Qwen/Qwen2.5-Coder-0.5B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Ensure a pad token is defined for batch padding
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
max_length = 512

def preprocess(examples):
    toks = tokenizer(
        examples['code'], truncation=True,
        padding='max_length', max_length=max_length
    )
    toks['pdg_emb'] = examples['pdg_emb']
    toks['labels']  = examples['label']
    return toks

datasets = raw.map(preprocess, batched=True)
# set torch format
datasets.set_format(
    type='torch',
    columns=['input_ids','attention_mask','pdg_emb','labels']
)

Map:   0%|          | 0/75909 [00:00<?, ? examples/s]

Map:   0%|          | 0/16189 [00:00<?, ? examples/s]

Map:   0%|          | 0/16175 [00:00<?, ? examples/s]

In [18]:
### Cell 5: Thiết lập PEFT Prefix-Tuning trên Qwen2.5
# Set pad_token_id in model config
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(set(merged['train']['label'])),
    torch_dtype=torch.float16
)
base_model.config.pad_token_id = tokenizer.pad_token_id
base_model.cuda()

# Prefix tuning config
prefix_len = 8
peft_config = PrefixTuningConfig(
    task_type='SEQ_CLS',
    prefix_projection=True,
    num_virtual_tokens=prefix_len
)
# Wrap model
model = get_peft_model(base_model, peft_config)
torch.cuda.empty_cache()

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-Coder-0.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
### Cell 6: TrainingArguments và Trainer
training_args = TrainingArguments(
    output_dir='./qwen_prefix',
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy='epoch',
    save_strategy='epoch',
    fp16=True,
    logging_steps=100,
    report_to='none'
)

def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    acc = accuracy_score(p.label_ids, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='macro')
    return {'accuracy':acc,'precision':prec,'recall':rec,'f1':f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

free_memory()
trainer.train()
free_memory()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4085,0.369442,0.832973,0.773205,0.716476,0.737262
2,0.3225,0.30264,0.871703,0.837054,0.778286,0.801536
3,0.3096,0.287149,0.880351,0.849132,0.794067,0.816451


In [21]:
### Cell 7: Đánh giá trên test
res = trainer.predict(datasets['test'])
print(res.metrics)
preds = res.predictions.argmax(-1)
print(classification_report(res.label_ids, preds, digits=4))

{'test_loss': 0.2997478246688843, 'test_accuracy': 0.874435857805255, 'test_precision': 0.8383769905591381, 'test_recall': 0.7867517971317355, 'test_f1': 0.8078438605889944, 'test_runtime': 947.3624, 'test_samples_per_second': 17.074, 'test_steps_per_second': 2.134}
              precision    recall  f1-score   support

           0     0.8952    0.9482    0.9210     12479
           1     0.7815    0.6253    0.6947      3696

    accuracy                         0.8744     16175
   macro avg     0.8384    0.7868    0.8078     16175
weighted avg     0.8692    0.8744    0.8693     16175



In [22]:
### Cell 8: Lưu model và tokenizer
# Lưu state PEFT
model.save_pretrained('./qwen2_prefix_tuned')
# tokenizer
tokenizer.save_pretrained('./qwen2_prefix_tuned')
# Nếu muốn lưu cả base_model config và weights
base_model.save_pretrained('./qwen2_prefix_tuned_base')