In [12]:
!pip install -q pydot==2.0.0
!pip install -q node2vec
!pip install -q torch_geometric

In [13]:
!python --version

Python 3.11.13


In [14]:
# %%
import os
import gc
import json
import torch
import networkx as nx
from node2vec import Node2Vec
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, T5EncoderModel
import warnings

warnings.filterwarnings('ignore')

In [15]:
# %%
def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and, when CUDA is available, every CUDA device). When CUDA is
    available cuDNN is also switched to deterministic, non-benchmarking mode.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    # Imported locally because the notebook's import cell does not provide it.
    import random

    random.seed(seed_value)
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # influences Python processes launched after this call (e.g. worker
    # processes), not the hash seed of the current kernel.
    os.environ["PYTHONHASHSEED"] = str(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        # Give up cuDNN autotuning in exchange for repeatable kernel selection.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


seed_everything(42)

In [16]:
# %% [markdown]
# # Helper Models

# %%
class CodeT5Classifier(nn.Module):
    """T5 encoder followed by dropout and a linear classification head.

    The sequence representation is the encoder's last hidden state at
    position 0 (the first token of every sequence).

    Args:
        model_name: Identifier or path passed to ``T5EncoderModel.from_pretrained``.
        n_classes: Number of output logits produced by the linear head.
        dropout_prob: Dropout probability applied before the linear head.
    """

    def __init__(self, model_name, n_classes, dropout_prob=0.3):
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=dropout_prob)
        self.fc = nn.Linear(self.encoder.config.d_model, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        # Start the bias at zero. `nn.init.normal_(bias, 0)` would only set the
        # mean and leave std at its default of 1.0, producing a bias far larger
        # than the 0.02-std weights.
        nn.init.zeros_(self.fc.bias)

    def _first_token_state(self, input_ids, attention_mask):
        """Run the encoder and return its last hidden state at position 0."""
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        return outputs.last_hidden_state[:, 0]  # embedding of the first token

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token sequences.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of the linear head applied to the (dropped-out)
            first-token hidden state.
        """
        pooled_output = self._first_token_state(input_ids, attention_mask)
        x = self.drop(pooled_output)
        x = self.fc(x)
        return x

    def get_embeddings(self, input_ids, attention_mask):
        """Return the first-token hidden state without dropout or gradients.

        Note that this switches the module to evaluation mode and leaves it
        there; call ``train()`` again before resuming training.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The encoder's last hidden state at position 0 for every sequence.
        """
        self.eval()
        with torch.no_grad():
            # Dropout is deliberately skipped so the embeddings stay stable.
            embedding = self._first_token_state(input_ids, attention_mask)
        return embedding
# %% [markdown]
# # Main Vulnerability Classifier Pipeline

# %%
class VulnerabilityClassifier:
    """Turn graph rows into CodeT5 and Node2Vec embeddings and save them to disk.

    For every successfully processed row the following lists receive exactly
    one entry each, so equal positions always describe the same graph:
    ``aggregated_codet5_embeddings``, ``n2v_node_embeddings``, ``adj_matrices``,
    ``labels`` and ``splits``. Index values of rows that raised are collected
    in ``ignore``.

    Args:
        codelm_path: Path to the state dict loaded into ``CodeT5Classifier``.
        n2v_dim: Dimensionality of the Node2Vec node embeddings.
        device: Device string used when CUDA is available; otherwise the CPU
            is used.
    """

    def __init__(self, codelm_path, n2v_dim=200, device='cuda:0'):
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        self.codet5_model = CodeT5Classifier(model_name="Salesforce/codet5-base", n_classes=2)
        self.codet5_model.load_state_dict(torch.load(codelm_path, map_location=self.device))
        self.codet5_model.to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
        self.n2v_dim = n2v_dim

        self.n2v_node_embeddings = []
        self.adj_matrices = []
        self.aggregated_codet5_embeddings = []
        self.labels = []
        self.splits = []
        self.ignore = []

    def _make_graph(self, nodes, edge_matrix):
        """Build a directed graph and the per-node code strings for one sample.

        Args:
            nodes: Node collection; ``dict(nodes)`` must map to values that
                carry a ``'label'`` entry.
            edge_matrix: Pair ``(sources, targets)`` whose members expose
                ``.tolist()``. Presumably a 2 x E integer array - TODO confirm
                against the dataset.

        Returns:
            ``(G, code_sequence)``: the ``nx.DiGraph`` with nodes
            ``0..len(nodes)-1`` and the stringified node labels.
        """
        G = nx.DiGraph()
        G.add_nodes_from(range(len(nodes)))
        sources, targets = edge_matrix
        edges = list(zip(sources.tolist(), targets.tolist()))
        G.add_edges_from(edges)
        code_sequence = [str(v['label']) for v in dict(nodes).values()]
        return G, code_sequence

    def _embed_row(self, row):
        """Compute every artefact for one row without touching instance state.

        Keeping this free of side effects lets the caller append all results
        together, so a failure half-way through a row cannot leave the output
        lists with different lengths.

        Returns:
            ``(codet5_embedding, n2v_embedding, adj_tensor, label, split)``
            with all tensors on the CPU.
        """
        nodes = row['nodes']
        adj_matrix = row['adj_matrix']
        G, code_sequence = self._make_graph(nodes, adj_matrix)

        # CodeT5 embedding of every node's code string (one sequence per node).
        inputs = self.tokenizer(code_sequence, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            codet5_graph_embedding = self.codet5_model.get_embeddings(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"]
            )
        # Move the result off the accelerator straight away.
        codet5_graph_embedding = codet5_graph_embedding.detach().cpu()

        # Structural embedding of every node via Node2Vec.
        node2vec = Node2Vec(G, dimensions=self.n2v_dim, p=1, q=2, walk_length=100,
                            num_walks=10, workers=4, quiet=True, seed=42)
        w2v_model = node2vec.fit(window=10, min_count=1)
        node_embeddings = np.array([w2v_model.wv[str(node)] for node in G.nodes()])
        node_embeddings_tensor = torch.tensor(node_embeddings, dtype=torch.float32)

        adj_tensor = torch.tensor(adj_matrix, dtype=torch.long)
        return codet5_graph_embedding, node_embeddings_tensor, adj_tensor, int(row['label']), row['split']

    def process_data(self, df: pd.DataFrame):
        """Embed every row of ``df`` and write the collected results to disk.

        Reads the columns ``nodes``, ``adj_matrix``, ``label`` and ``split``.
        Rows that raise are reported, recorded in ``self.ignore`` and skipped
        without contributing to any output list.

        Writes ``AlUNed.json`` (labels and partitions) to the current working
        directory and three ``.pt`` files to ``/kaggle/working``.

        Args:
            df: Input frame with one graph per row.
        """
        print("Starting data processing to generate embeddings...")
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing Graphs"):
            try:
                codet5_emb, n2v_emb, adj_tensor, label, split = self._embed_row(row)
            except Exception as e:
                # Derive the node count from the failing row itself so that a
                # value left over from an earlier row is never reported.
                try:
                    node_count = len(row['nodes'])
                except Exception:
                    node_count = 'N/A'
                print(f"[ERROR] Could not process row: {e}")
                print(f"Check the number of nodes of the graph: {node_count}")
                self.ignore.append(idx)
            else:
                # Append only after the whole row succeeded to keep the lists aligned.
                self.aggregated_codet5_embeddings.append(codet5_emb)
                self.n2v_node_embeddings.append(n2v_emb)
                self.adj_matrices.append(adj_tensor)
                self.labels.append(label)
                self.splits.append(split)
            finally:
                # Release per-row garbage and cached accelerator memory.
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()  # make sure all pending work has finished

        with open('AlUNed.json', 'w') as f:
            json.dump({'labels': self.labels, 'partition': self.splits}, f)

        torch.save(self.adj_matrices, '/kaggle/working/adj_matrices.pt')
        torch.save(self.n2v_node_embeddings, '/kaggle/working/n2v_node_embeddings.pt')
        torch.save(self.aggregated_codet5_embeddings, '/kaggle/working/aggregated_codet5_embeddings.pt')
        print("Finished processing data.")

    def run(self, df: pd.DataFrame):
        """Entry point: process ``df`` and persist the generated embeddings."""
        self.process_data(df)

In [17]:
def balancing(full_data, ratios, default_ratio=10, random_state=42):
    """Cap the number of benign rows per split relative to the vulnerable rows.

    For every value of the ``split`` column all rows with ``label == 1`` are
    kept, and at most ``ratio`` times as many rows with ``label == 0`` are
    sampled at random. Rows with any other label are dropped.

    Args:
        full_data: DataFrame with at least the columns ``split`` and ``label``.
        ratios: Mapping from split name to benign-per-vulnerable ratio. May be
            ``None`` or empty, in which case ``default_ratio`` is used everywhere.
        default_ratio: Ratio used for splits missing from ``ratios``.
        random_state: Seed for the benign sampling.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index; an empty frame with the
        same columns when ``full_data`` contains no groups.
    """
    ratios = ratios or {}
    balanced_parts = []

    for split_name, split_df in full_data.groupby("split"):
        ratio = ratios.get(split_name, default_ratio)

        vuln_df = split_df[split_df["label"] == 1]
        benign_df = split_df[split_df["label"] == 0]

        n_vuln = len(vuln_df)
        # DataFrame.sample only accepts whole numbers, so a fractional ratio
        # is truncated to an integer row count.
        n_benign_keep = min(len(benign_df), int(n_vuln * ratio))

        benign_sampled = benign_df.sample(n=n_benign_keep, random_state=random_state)

        balanced_split = pd.concat([vuln_df, benign_sampled])
        balanced_parts.append(balanced_split)

        print(f"{split_name}: ratio={ratio}, vuln={n_vuln}, benign_kept={n_benign_keep}, total={len(balanced_split)}")

    if not balanced_parts:
        # pd.concat raises on an empty list; return an empty frame instead.
        return full_data.iloc[0:0].reset_index(drop=True)

    balanced_df = pd.concat(balanced_parts).reset_index(drop=True)
    return balanced_df

def downsample_dataset(df: pd.DataFrame, target_frac: float = 0.2, random_state: int = 42) -> pd.DataFrame:
    """Downsample ``df`` per (split, label) group, spreading picks across CWE ids.

    Each (split, label) group gets a budget of ``int(len(group) * target_frac)``
    rows. The budget is divided evenly between the distinct ``cwe_id`` values
    of the group (at least one row per id), and every id contributes at most
    the rows it actually has. Unused budget of small ids is not redistributed,
    so the result can be smaller than ``target_frac`` of the input.

    Rows with a missing ``cwe_id`` are not selected when the group also
    contains ids. When a group has no ``cwe_id`` at all, a plain random sample
    of the group's budget is taken instead.

    Args:
        df: Input data; must contain the columns ``split``, ``label`` and
            ``cwe_id``.
        target_frac: Fraction of each group used as its sampling budget
            (intended range ``0 < target_frac <= 1``).
        random_state: Seed for the random sampling, for reproducibility.

    Returns:
        The sampled rows with a fresh ``0..n-1`` index; an empty frame with
        the same columns when no row is selected.
    """
    samples = []

    # Step 1: walk over every (split, label) group once.
    for _, subset in df.groupby(["split", "label"]):
        target_count = int(len(subset) * target_frac)
        if target_count == 0:
            continue  # nothing to draw from this group

        # Step 2: count rows per cwe_id (missing ids are not counted).
        cwe_counts = subset["cwe_id"].value_counts()
        n_cwe = len(cwe_counts)
        if n_cwe == 0:
            # No CWE information at all: dividing by n_cwe would fail, so
            # fall back to a uniform sample of the group's budget.
            samples.append(subset.sample(n=min(len(subset), target_count), random_state=random_state))
            continue
        per_cwe = max(target_count // n_cwe, 1)

        # Step 3: draw the same number of rows from every cwe_id.
        for cwe_id in cwe_counts.index:
            cwe_subset = subset[subset["cwe_id"] == cwe_id]
            n = min(len(cwe_subset), per_cwe)
            samples.append(cwe_subset.sample(n=n, random_state=random_state))

    if not samples:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    df_downsampled = pd.concat(samples).reset_index(drop=True)
    return df_downsampled

In [18]:
# Checkpoint file for the CodeT5 classifier; presumably the state dict that
# VulnerabilityClassifier(codelm_path=...) loads - confirm before enabling
# the pipeline cell.
CODELM_PATH = '/kaggle/input/primevul-codet5-ft/pytorch/default/1/codet5_classifier.pth'
# Pickled DataFrame with one graph per row. The helpers in this notebook read
# the columns split, label, cwe_id, nodes and adj_matrix from it.
DATASET_PATH = '/kaggle/input/covulpecker-dataset/primevul_graphs.pkl'
# NOTE(security): unpickling can execute arbitrary code; only load this file
# from a source you trust.
full_data = pd.read_pickle(DATASET_PATH)

In [19]:
# ratios = {"train": 2, "test": 3, "valid": 3}
# balanced_data = balancing(full_data, ratios, default_ratio=10)

In [23]:
# Downsample with a 38% budget per (split, label) group; random_state keeps
# its default of 42, so the selection is repeatable for the same input.
df_down = downsample_dataset(full_data, target_frac=0.38)

In [24]:
# Persist the downsampled frame so later runs can reuse it without resampling.
df_down.to_pickle("/kaggle/working/downsample_primevul.pkl")

In [22]:
# classifier_pipeline = VulnerabilityClassifier(codelm_path=CODELM_PATH)
# classifier_pipeline.run(df_down)