In [2]:
# Cell 1: Cài đặt môi trường và import thư viện
# Cài đặt thư viện cần thiết
!pip install -q torch torchvision torchaudio
!pip install -q torch-geometric
!pip install -q transformers
!pip install -q scikit-learn tqdm pandas

# Import thư viện cơ bản
import os
import ast
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# PyTorch & Torch Geometric
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import RGCNConv, global_mean_pool

# Transformers - dùng CodeBERT
from transformers import AutoTokenizer, AutoModel

# Đánh giá mô hình
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report
)

# Cài đặt thiết bị
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Đang sử dụng thiết bị: {device}")

# Tránh cảnh báo không cần thiết từ tokenizer
os.environ["TOKENIZERS_PARALLELISM"] = "false"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m97.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# Cell 2: Chuẩn bị dataset (load CSV + trích xuất PDG)
import ast

# Đọc và làm sạch dữ liệu từ CSV
def prepare_datasets(csv_paths: dict):
    datasets = {}
    for split, path in csv_paths.items():
        df = pd.read_csv(path)
        df = df[df['code'].notna() & df['label'].isin([0, 1])].reset_index(drop=True)
        datasets[split] = df
        print(f"{split.upper()} set: {df.shape[0]} samples")
    return datasets

# Trích xuất PDG: trả về nodes, edge_index, edge_attr
def extract_pdg(code):
    try:
        tree = ast.parse(code)
        nodes, edges, var_deps = [], [], {}

        def visit(node, parent_idx=None):
            idx = len(nodes)
            node_str = ast.unparse(node).strip() if hasattr(ast, "unparse") else str(node)
            nodes.append(node_str)

            if parent_idx is not None:
                edges.append((parent_idx, idx, 'control'))

            if isinstance(node, ast.Assign):
                for target in node.targets:
                    if isinstance(target, ast.Name):
                        var_name = target.id
                        var_deps.setdefault(var_name, []).append(idx)
                        for sub in ast.walk(node.value):
                            if isinstance(sub, ast.Name) and sub.id in var_deps:
                                for dep_idx in var_deps[sub.id]:
                                    edges.append((dep_idx, idx, 'data'))
            elif isinstance(node, ast.Name):
                var_name = node.id
                if var_name in var_deps:
                    for dep_idx in var_deps[var_name]:
                        edges.append((dep_idx, idx, 'data'))

            for child in ast.iter_child_nodes(node):
                visit(child, idx)

        visit(tree)
        edge_index = torch.tensor([(u, v) for u, v, _ in edges], dtype=torch.long).t()
        edge_attr = torch.tensor([[1, 0] if t == 'control' else [0, 1] for _, _, t in edges], dtype=torch.float)
        return nodes, edge_index, edge_attr
    except Exception:
        return [], torch.empty((2, 0), dtype=torch.long), torch.empty((0, 2), dtype=torch.float)

# Tạo bộ dataset dạng list chứa (nodes, edge_index, edge_attr, label)
def build_raw_pdg_dataset(df, split_name):
    raw_dataset = []
    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"📦 Building PDG for {split_name}"):
        nodes, edge_index, edge_attr = extract_pdg(row['code'])
        if len(nodes) > 0 and edge_index.size(1) > 0:
            raw_dataset.append((nodes, edge_index, edge_attr, int(row['label'])))
    print(f"✅ {split_name} set: {len(raw_dataset)} valid samples (PDG extracted)")
    return raw_dataset

# Đường dẫn tới file CSV gốc
csv_files = {
    'train': "/kaggle/input/half1-1/train.csv",
    'val': "/kaggle/input/half1-1/val.csv",
    'test': "/kaggle/input/half1-1/test.csv"
}

# Load dữ liệu CSV & PDG
datasets = prepare_datasets(csv_files)
raw_dataset_dict = {split: build_raw_pdg_dataset(datasets[split], split) for split in ['train', 'val', 'test']}

TRAIN set: 22256 samples
VAL set: 4769 samples
TEST set: 4770 samples


📦 Building PDG for train:   0%|          | 0/22256 [00:00<?, ?it/s]

✅ train set: 18936 valid samples (PDG extracted)


📦 Building PDG for val:   0%|          | 0/4769 [00:00<?, ?it/s]

✅ val set: 4089 valid samples (PDG extracted)


📦 Building PDG for test:   0%|          | 0/4770 [00:00<?, ?it/s]

✅ test set: 4039 valid samples (PDG extracted)


In [4]:
# Cell 3: Tạo node embedding bằng CodeBERT & chuẩn bị Data cho GNN
from transformers import AutoTokenizer, AutoModel

# ✅ Load CodeBERT
model_id = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModel.from_pretrained(model_id).to(device).eval()

MAX_NODES = 100

# ✅ Hàm tạo embedding cho list node bằng CodeBERT
def create_node_embeddings(nodes, batch_size=64):
    all_embeddings = []
    for i in range(0, len(nodes), batch_size):
        batch_nodes = nodes[i:i + batch_size]
        inputs = tokenizer(batch_nodes, return_tensors="pt", padding=True, truncation=True, max_length=64)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).cpu()  # (B, 768)
        all_embeddings.append(embeddings)
    return torch.cat(all_embeddings, dim=0)

# ✅ Chuyển 1 mẫu PDG → Data
def process_sample(nodes, edge_index, edge_attr, label):
    if len(nodes) == 0 or edge_index.size(1) == 0:
        return None
    if len(nodes) > MAX_NODES:
        nodes = nodes[:MAX_NODES]
        mask = (edge_index[0] < MAX_NODES) & (edge_index[1] < MAX_NODES)
        edge_index = edge_index[:, mask]
        edge_attr = edge_attr[mask]
    x = create_node_embeddings(nodes)  # (num_nodes, 768)
    y = torch.tensor([label], dtype=torch.long)
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

# ✅ Tạo embedded data cho tất cả các tập
embedded_dataset_dict = {}
for split in ['train', 'val', 'test']:
    embedded_data = []
    for i, (nodes, edge_index, edge_attr, label) in enumerate(tqdm(raw_dataset_dict[split], desc=f"🔁 Embedding {split}")):
        data = process_sample(nodes, edge_index, edge_attr, label)
        if data is not None:
            embedded_data.append(data)
        if i % 100 == 0:
            torch.cuda.empty_cache()
    embedded_dataset_dict[split] = embedded_data

# ✅ Kiểm tra 1 mẫu
if len(embedded_dataset_dict['train']) > 0:
    sample = embedded_dataset_dict['train'][0]
    print("📌 Sample shape:")
    print("x:", sample.x.shape)
    print("edge_index:", sample.edge_index.shape)
    print("edge_attr:", sample.edge_attr.shape)
    print("label:", sample.y.item())

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

2025-07-10 07:23:51.010411: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752132231.195539      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752132231.246078      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

🔁 Embedding train:   0%|          | 0/18936 [00:00<?, ?it/s]

🔁 Embedding val:   0%|          | 0/4089 [00:00<?, ?it/s]

🔁 Embedding test:   0%|          | 0/4039 [00:00<?, ?it/s]

📌 Sample shape:
x: torch.Size([100, 768])
edge_index: torch.Size([2, 102])
edge_attr: torch.Size([102, 2])
label: 0


In [5]:
# Cell 4: Load DataLoader cho GNN từ biến RAM
from torch_geometric.loader import DataLoader

# Load từ embedded_dataset_dict
train_data = embedded_dataset_dict['train']
val_data = embedded_dataset_dict['val']
test_data = embedded_dataset_dict['test']

# Tạo DataLoader
batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

print(f"✅ Loaded: {len(train_data)} train | {len(val_data)} val | {len(test_data)} test")

✅ Loaded: 18936 train | 4089 val | 4039 test


In [16]:
from torch_geometric.nn import RGCNConv
from torch_geometric.nn.aggr import AttentionalAggregation as NewAttentionAgg
from torch.optim import AdamW
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    precision_recall_fscore_support
)
from torch.nn import LayerNorm, Dropout
import torch.nn.functional as F
import torch.nn as nn
from collections import Counter

# ✅ Focal Loss với gamma=1.5
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=1.5, weight=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, weight=self.weight, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean() if self.reduction == 'mean' else focal_loss.sum()

# ✅ RGCN với Attention + Dropout 0.6
class PDG_RGCN_Attention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_relations=2):
        super().__init__()
        self.conv1 = RGCNConv(input_dim, hidden_dim, num_relations)
        self.norm1 = LayerNorm(hidden_dim)

        self.conv2 = RGCNConv(hidden_dim, hidden_dim, num_relations)
        self.norm2 = LayerNorm(hidden_dim)

        self.dropout = Dropout(0.6)  # 🔼 tăng lên 0.6

        gate_nn = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1)
        )
        self.attention_pool = NewAttentionAgg(gate_nn, nn.Identity())

        self.fc1 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.classifier_dropout = Dropout(0.6)  # 🔼 tăng lên 0.6
        self.fc2 = nn.Linear(hidden_dim // 2, output_dim)

    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        edge_type = edge_attr.argmax(dim=1)

        x = F.relu(self.norm1(self.conv1(x, edge_index, edge_type)))
        x = self.dropout(x)

        x = F.relu(self.norm2(self.conv2(x, edge_index, edge_type)))
        x = self.dropout(x)

        x = self.attention_pool(x, batch)
        x = F.relu(self.fc1(x))
        x = self.classifier_dropout(x)
        return self.fc2(x)

# ✅ Thiết lập huấn luyện
labels = [data.y.item() for data in train_data]
label_counts = Counter(labels)
total = sum(label_counts.values())
class_weights = [total / label_counts[i] for i in range(2)]
weights_tensor = torch.tensor(class_weights).to(device)

criterion = FocalLoss(alpha=1.0, gamma=1.5, weight=weights_tensor)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

input_dim = train_data[0].x.shape[1]
model = PDG_RGCN_Attention(input_dim, hidden_dim=512, output_dim=2, num_relations=2).to(device)
optimizer = AdamW(model.parameters(), lr=3e-4, weight_decay=3e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.7)

# ✅ Hàm huấn luyện và đánh giá giữ nguyên
def train_epoch(model, loader):
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    preds, probs, labels = [], [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            out = model(batch)
            prob = F.softmax(out, dim=1)[:, 1].cpu()
            pred = out.argmax(dim=1).cpu()
            preds.extend(pred.tolist())
            probs.extend(prob.tolist())
            labels.extend(batch.y.cpu().tolist())

    if len(set(labels)) < 2:
        print("⚠️ Chỉ có 1 lớp trong dữ liệu đánh giá.")
        return accuracy_score(labels, preds), 0, 0, f1_score(labels, preds, zero_division=0), 0.5

    acc = accuracy_score(labels, preds)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    auc = roc_auc_score(labels, probs)
    return acc, p, r, f1, auc

# ✅ Huấn luyện với early stopping
best_f1 = 0
patience = 12
patience_counter = 0
SAVE_MODEL = True

for epoch in range(1, 101):
    loss = train_epoch(model, train_loader)
    scheduler.step()

    val_acc, val_p, val_r, val_f1, val_auc = evaluate(model, val_loader)
    print(f"📘 Epoch {epoch:02d} | Loss: {loss:.4f} | Acc: {val_acc:.4f} | P: {val_p:.4f} | R: {val_r:.4f} | F1: {val_f1:.4f} | AUC: {val_auc:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        if SAVE_MODEL:
            torch.save(model.state_dict(), "best_model.pt")
            print(f"✨ Model saved with F1-score: {best_f1:.4f}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("🛑 Early stopping triggered.")
            break


📘 Epoch 01 | Loss: 0.9207 | Acc: 0.5265 | P: 0.3030 | R: 0.8391 | F1: 0.4453 | AUC: 0.7309
✨ Model saved with F1-score: 0.4453
📘 Epoch 02 | Loss: 0.8424 | Acc: 0.6202 | P: 0.3456 | R: 0.7581 | F1: 0.4748 | AUC: 0.7496
✨ Model saved with F1-score: 0.4748
📘 Epoch 03 | Loss: 0.8221 | Acc: 0.6598 | P: 0.3699 | R: 0.7138 | F1: 0.4873 | AUC: 0.7601
✨ Model saved with F1-score: 0.4873
📘 Epoch 04 | Loss: 0.8048 | Acc: 0.5644 | P: 0.3274 | R: 0.8758 | F1: 0.4766 | AUC: 0.7735
📘 Epoch 05 | Loss: 0.7885 | Acc: 0.4984 | P: 0.3046 | R: 0.9471 | F1: 0.4610 | AUC: 0.7813
📘 Epoch 06 | Loss: 0.7841 | Acc: 0.6346 | P: 0.3639 | R: 0.8197 | F1: 0.5040 | AUC: 0.7825
✨ Model saved with F1-score: 0.5040
📘 Epoch 07 | Loss: 0.7715 | Acc: 0.6185 | P: 0.3549 | R: 0.8369 | F1: 0.4984 | AUC: 0.7841
📘 Epoch 08 | Loss: 0.7691 | Acc: 0.5647 | P: 0.3296 | R: 0.8920 | F1: 0.4814 | AUC: 0.7878
📘 Epoch 09 | Loss: 0.7662 | Acc: 0.5070 | P: 0.3081 | R: 0.9449 | F1: 0.4647 | AUC: 0.7927
📘 Epoch 10 | Loss: 0.7546 | Acc: 0.65

In [17]:
# Cell 6: Đánh giá mô hình GAT + Baseline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import warnings
import torch.nn.functional as F

# ✅ Load mô hình tốt nhất
model.load_state_dict(torch.load("best_model.pt"))
model.eval()

# Đánh giá
all_preds, all_probs, all_labels = [], [], []

with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        out = model(batch)
        prob = F.softmax(out, dim=1)
        pred = torch.argmax(prob, dim=1)

        all_preds.extend(pred.cpu().tolist())
        all_probs.extend(prob[:, 1].cpu().tolist())  # class = 1
        all_labels.extend(batch.y.cpu().tolist())

# 🎯 Kết quả trên tập test
print("📊 Đánh giá trên tập test:")
print(f"Accuracy : {accuracy_score(all_labels, all_preds):.4f}")
print(f"Precision: {precision_score(all_labels, all_preds):.4f}")
print(f"Recall   : {recall_score(all_labels, all_preds):.4f}")
print(f"F1-score : {f1_score(all_labels, all_preds):.4f}")
try:
    print(f"AUC-ROC  : {roc_auc_score(all_labels, all_probs):.4f}")
except:
    print("⚠️ Không thể tính AUC-ROC (dữ liệu chỉ chứa 1 class)")

print("\n📌 Báo cáo chi tiết:")
print(classification_report(all_labels, all_preds, digits=4))

# 🎯 Baseline: Logistic Regression
X_train = torch.stack([d.x.mean(dim=0) for d in train_data]).numpy()
y_train = [d.y.item() for d in train_data]
X_test = torch.stack([d.x.mean(dim=0) for d in test_data]).numpy()
y_test = [d.y.item() for d in test_data]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    baseline = LogisticRegression(max_iter=1000)
    baseline.fit(X_train_scaled, y_train)

baseline_preds = baseline.predict(X_test_scaled)
baseline_f1 = f1_score(y_test, baseline_preds)

print(f"\n📉 Baseline F1-score (Logistic Regression): {baseline_f1:.4f}")


📊 Đánh giá trên tập test:
Accuracy : 0.8074
Precision: 0.5630
Recall   : 0.7184
F1-score : 0.6313
AUC-ROC  : 0.8741

📌 Báo cáo chi tiết:
              precision    recall  f1-score   support

           0     0.9086    0.8339    0.8696      3112
           1     0.5630    0.7184    0.6313       927

    accuracy                         0.8074      4039
   macro avg     0.7358    0.7762    0.7505      4039
weighted avg     0.8293    0.8074    0.8149      4039


📉 Baseline F1-score (Logistic Regression): 0.4671
