In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama/CodeLlama-34b-Instruct-hf",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 05:02:21 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 05:02:21 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 05:02:23,729	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 7/7 [03:23<00:00, 29.11s/it]


codellama/CodeLlama-34b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'smart_contract_train.csv',
    'val': 'smart_contract_val.csv',
    'test': 'smart_contract_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world that **only** answer/respond in term of "Vulnerable Code" or "Safe Code" """
]

PROMPTS = [
    """You need to analyze the given function and classify it as "Vulnerable Code" or "Safe Code" based on potential security risks. Only answer the label dont give any explanation.
    
    This is the function we need to audit:
    ```solidity
    {code}
    ```
    
Answer: """
]

def make_conversational_vote(examples, prompt):
    code = examples['code']
    selected_system_prompt = random.choice(SYSTEM_PROMPT)
    vulnerable = "Vulnerable Code" if examples['vulnerable'] == 1 else "Safe Code"
    conversation = [
        {"role": "system", "content": selected_system_prompt},
        {"role": "user", "content": prompt.format(code=code)}
    ]
    return {"conversations": conversation, "label": vulnerable, "vuln_code": code}

# Generate 5 datasets using different prompts
datasets = []
for i, prompt in enumerate(PROMPTS):
    new_dataset = dataset.map(lambda ex: make_conversational_vote(ex, prompt))
    new_dataset = new_dataset.remove_columns(["project_id", "code", "code_analysis", "vulnerable"])
    datasets.append(new_dataset)
    
test_dataset = datasets[0]['test']

In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations']):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                             temperature = 0.6, min_p = 0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)
    
    # Extract the label using regex
    match = re.search(pattern, decoded_output, re.DOTALL)
    extracted_label = match.group(1).strip() if match else "Unknown"

    # Append the extracted label
    y_pred.append(extracted_label)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/278 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 278/278 [11:05<00:00,  2.39s/it]

Avg GPU util: 85.09%
Std GPU util: 9.99%
Peak GPU mem: 19.5986 GB





In [5]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 19.207 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_zs_codellama34b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_zs_codellama34b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama/CodeLlama-13b-Instruct-hf",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 05:29:16 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 05:29:17 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 05:29:18,957	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:57<00:00, 19.28s/it]


codellama/CodeLlama-13b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'smart_contract_train.csv',
    'val': 'smart_contract_val.csv',
    'test': 'smart_contract_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world that **only** answer/respond in term of "Vulnerable Code" or "Safe Code" """
]

PROMPTS = [
    """You need to analyze the given function and classify it as "Vulnerable Code" or "Safe Code" based on potential security risks. Only answer the label dont give any explanation.
    
    This is the function we need to audit:
    ```solidity
    {code}
    ```
    
Answer: """
]

def make_conversational_vote(examples, prompt):
    code = examples['code']
    selected_system_prompt = random.choice(SYSTEM_PROMPT)
    vulnerable = "Vulnerable Code" if examples['vulnerable'] == 1 else "Safe Code"
    conversation = [
        {"role": "system", "content": selected_system_prompt},
        {"role": "user", "content": prompt.format(code=code)}
    ]
    return {"conversations": conversation, "label": vulnerable, "vuln_code": code}

# Generate 5 datasets using different prompts
datasets = []
for i, prompt in enumerate(PROMPTS):
    new_dataset = dataset.map(lambda ex: make_conversational_vote(ex, prompt))
    new_dataset = new_dataset.remove_columns(["project_id", "code", "code_analysis", "vulnerable"])
    datasets.append(new_dataset)
    
test_dataset = datasets[0]['test']

In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations']):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                             temperature = 0.6, min_p = 0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)
    
    # Extract the label using regex
    match = re.search(pattern, decoded_output, re.DOTALL)
    extracted_label = match.group(1).strip() if match else "Unknown"

    # Append the extracted label
    y_pred.append(extracted_label)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/278 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 278/278 [03:08<00:00,  1.47it/s]

Avg GPU util: 66.42%
Std GPU util: 22.95%
Peak GPU mem: 11.0928 GB





In [5]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 12.332 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_zs_codellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_zs_codellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama_detector",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 05:34:04 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 05:34:04 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 05:34:06,728	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:39<00:00, 13.15s/it]
Unsloth 2025.5.3 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'smart_contract_train.csv',
    'val': 'smart_contract_val.csv',
    'test': 'smart_contract_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

In [3]:
import random

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world.""",
]

PROMPTS = [
    """You need to analyze the given function and classify it as "Vulnerable Code" or "Safe Code" based on potential security risks.
    
    This is the function we need to audit:
    ```solidity
    {code}
    ```
    
Answer: """,
]

def make_conversational(examples):
    code = examples['code']
    vulnerable = "Vulnerable Code" if examples['vulnerable'] == 1 else "Safe Code"
    selected_system_prompt = random.choice(SYSTEM_PROMPT)
    selected_user_prompt = random.choice(PROMPTS) 
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(code=code)},
                    {"role": "assistant", "content": vulnerable}]
    return { "conversations" : conversation, }

dataset = dataset.map(make_conversational)
dataset = dataset.remove_columns(["project_id","code", "code_analysis", "vulnerable"])

In [4]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True,)

In [5]:
test_dataset = dataset['test']
test_dataset = test_dataset.map(lambda row: {'label':row['conversations'][-1]['content']})
test_dataset = test_dataset.map(lambda row: {'conversations':row['conversations'][:-1]})

In [6]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = test_dataset['conversations'][1]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<s> [INST] <<SYS>>\nYou are the smartest AI solidity smart contract security auditor in the world.\n<</SYS>>\n\nYou need to analyze the given function and classify it as "Vulnerable Code" or "Safe Code" based on potential security risks.\n    \n    This is the function we need to audit:\n    ```solidity\n    function _getAdmin(uint256 adminEpoch, uint256 index) internal view returns (address) {\n        return getAddress(_getAdminKey(adminEpoch, index));\n    }\n\nfunction _isAdmin(uint256 adminEpoch, address account) internal view returns (bool) {\n        return getBool(_getIsAdminKey(adminEpoch, account));\n    }\n\nfunction _setAdminEpoch(uint256 adminEpoch) internal {\n        _setUint(KEY_ADMIN_EPOCH, adminEpoch);\n    }\n    ```\n    \nAnswer:  [/INST] Safe Code </s>']

In [8]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations']):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                             temperature = 0.6, min_p = 0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)
    
    # Extract the label using regex
    match = re.search(pattern, decoded_output, re.DOTALL)
    extracted_label = match.group(1).strip() if match else "Unknown"

    # Append the extracted label
    y_pred.append(extracted_label)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

100%|██████████| 278/278 [03:15<00:00,  1.42it/s]

Avg GPU util: 62.41%
Std GPU util: 24.79%
Peak GPU mem: 12.1436 GB





In [10]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 12.332 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_ftcodellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_ftcodellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 32768
dtype = None 
load_in_4bit = False 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "ours_detector",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    full_finetuning = True
)
FastLanguageModel.for_inference(model)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 05:40:59 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 05:40:59 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 05:41:01,823	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024, padding_idx=151654)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_atten

In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'smart_contract_train.csv',
    'val': 'smart_contract_val.csv',
    'test': 'smart_contract_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world.""",
    """You are the greatest AI assistant smart contract security auditor in the world.""",
    """You are the best solidity smart contract security auditor in the world""",
    """You are the greatest AI assistant solidity security researcher in the world""",
    """You are the best AI solidity smart contract security auditor in the world."""
]

PROMPTS = [
    """You need to analyze the given function and classify it as "Vulnerable Code" or "Safe Code" based on potential security risks.
    
    This is the function we need to audit:
    ```solidity
    {code}
    ```
    
Answer: """,
    
    """Analyze the given function and determine whether it is "Vulnerable Code" or "Safe Code" based on security risks.
    
    Function to audit:
    ```solidity
    {code}
    ```
    
Answer: """,

    """Examine the Solidity function below and assess if it is "Vulnerable Code" or "Safe Code."
    
    Solidity Function:
    ```solidity
    {code}
    ```
    
Answer: """,

    """Review the Solidity function and classify it as "Vulnerable Code" or "Safe Code" by checking for security issues.
    
    Solidity Function:
    ```solidity
    {code}
    ```
    
Answer: """,

    """Audit the given Solidity function to determine if it should be categorized as "Vulnerable Code" or "Safe Code."
    
    Solidity Code:
    ```solidity
    {code}
    ```
    
Answer: """,
]

def make_conversational_vote(examples, system_prompt, prompt):
    code = examples['code']
    vulnerable = "Vulnerable Code" if examples['vulnerable'] == 1 else "Safe Code"
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt.format(code=code)}
    ]
    return {"conversations": conversation, "label": vulnerable}

# Generate 5 datasets using different prompts
datasets = []
for i in range(len(PROMPTS)):
    new_dataset = dataset.map(lambda ex: make_conversational_vote(ex, SYSTEM_PROMPT[i], PROMPTS[i]))
    new_dataset = new_dataset.remove_columns(["project_id", "code", "code_analysis", "vulnerable"])
    datasets.append(new_dataset)

In [5]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

import time
from tqdm import tqdm
import re

# Regex pattern for extracting labels
pattern = r"<\|im_start\|>assistant\n<think>\n\n</think>\n\n(.*?)<\|im_end\|>"

y_preds = [[] for _ in range(5)]
row_times = []  # Store total time per row (inference + voting)
final_predictions = []

# Iterate through rows with normal tqdm
for row_idx, messages_set in enumerate(tqdm(zip(*[d['test']['conversations'] for d in datasets]), desc="Processing rows", total=dataset['test'].num_rows)):
    total_start = time.time()  # Start timer for this row

    preds_for_row = []

    # Inference for each dataset
    for i, messages in enumerate(messages_set):
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            enable_thinking=False,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=8,
            use_cache=True,
            temperature=0.1,
            min_p=0.1
        )
        decoded_output = tokenizer.batch_decode(outputs)[0]
        match = re.search(pattern, decoded_output, re.DOTALL)
        extracted_label = match.group(1).strip() if match else "Unknown"
        y_preds[i].append(extracted_label)
        preds_for_row.append(extracted_label)

    # Majority voting
    vote_count = {"Vulnerable Code": 0, "Safe Code": 0}
    for pred in preds_for_row:
        if pred in vote_count:
            vote_count[pred] += 1
    final_prediction = max(vote_count, key=vote_count.get)
    final_predictions.append(final_prediction)

    total_end = time.time()  # Stop timer for this row
    row_times.append(total_end - total_start)

running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

Processing rows: 100%|██████████| 278/278 [05:02<00:00,  1.09s/it]

Avg GPU util: 26.65%
Std GPU util: 3.81%
Peak GPU mem: 3.7881 GB





In [7]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 3.398 GB.


In [None]:
import pickle

with open('_resource_data/gpu_utils_ours_detector.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_ours_detector.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time, threading, subprocess, re
from tqdm import tqdm
import pandas as pd

model_name = "./codebert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Running on: {device}")

data_files = {'test': 'smart_contract_test.csv'}
datasets = load_dataset('csv', data_files=data_files)
datasets = datasets.rename_column('vulnerable', 'labels')
datasets = datasets.map(lambda example: {'labels': int(example['labels'])})

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        try:
            out = subprocess.check_output([
                "nvidia-smi",
                "--query-gpu=utilization.gpu,memory.used",
                "--format=csv,noheader,nounits"
            ])
            u, m = map(int, out.decode().strip().split(','))
            gpu_utils.append(u)
            gpu_mem.append(m / 1024)  # MB -> GB
        except Exception as e:
            print(f"nvidia-smi polling error: {e}")
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

total_start_time = time.time()

label_map = {0: "Not Vulnerable", 1: "Vulnerable"}
predictions = []
row_times = []

model.eval()

with torch.no_grad():
    for example in tqdm(datasets['test'], desc="Running inference"):
        row_start = time.time()  # Start timer for this row
        
        code_snippet = example['code']  # Adjust column name if needed
        inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=-1).item()
        
        predictions.append({
            "code": code_snippet,
            "true_label": label_map[example['labels']],
            "predicted_label": label_map[pred]
        })
        
        row_end = time.time()
        row_times.append(row_end - row_start)

total_end_time = time.time()
total_duration = total_end_time - total_start_time

running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Running on: cuda


Running inference: 100%|██████████| 278/278 [00:03<00:00, 71.12it/s]


Avg GPU util: 43.79%
Std GPU util: 24.55%
Peak GPU mem: 0.9111 GB


In [3]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 0.543 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_codebert.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_codebert.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time, threading, subprocess, re
from tqdm import tqdm
import pandas as pd

model_name = "./graphcodebert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Running on: {device}")

data_files = {'test': 'smart_contract_test.csv'}
datasets = load_dataset('csv', data_files=data_files)
datasets = datasets.rename_column('vulnerable', 'labels')
datasets = datasets.map(lambda example: {'labels': int(example['labels'])})

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        try:
            out = subprocess.check_output([
                "nvidia-smi",
                "--query-gpu=utilization.gpu,memory.used",
                "--format=csv,noheader,nounits"
            ])
            u, m = map(int, out.decode().strip().split(','))
            gpu_utils.append(u)
            gpu_mem.append(m / 1024)  # MB -> GB
        except Exception as e:
            print(f"nvidia-smi polling error: {e}")
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

total_start_time = time.time()

label_map = {0: "Not Vulnerable", 1: "Vulnerable"}
predictions = []
row_times = []

model.eval()

with torch.no_grad():
    for example in tqdm(datasets['test'], desc="Running inference"):
        row_start = time.time()  # Start timer for this row
        
        code_snippet = example['code']  # Adjust column name if needed
        inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=-1).item()
        
        predictions.append({
            "code": code_snippet,
            "true_label": label_map[example['labels']],
            "predicted_label": label_map[pred]
        })
        
        row_end = time.time()
        row_times.append(row_end - row_start)

total_end_time = time.time()
total_duration = total_end_time - total_start_time

running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Running on: cuda


Running inference: 100%|██████████| 278/278 [00:03<00:00, 71.67it/s]


Avg GPU util: 43.53%
Std GPU util: 24.85%
Peak GPU mem: 0.9111 GB


In [3]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 0.543 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_graphcodebert.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_graphcodebert.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time, threading, subprocess, re
from tqdm import tqdm
import pandas as pd

model_name = "./unixcoder"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Running on: {device}")

data_files = {'test': 'smart_contract_test.csv'}
datasets = load_dataset('csv', data_files=data_files)
datasets = datasets.rename_column('vulnerable', 'labels')
datasets = datasets.map(lambda example: {'labels': int(example['labels'])})

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        try:
            out = subprocess.check_output([
                "nvidia-smi",
                "--query-gpu=utilization.gpu,memory.used",
                "--format=csv,noheader,nounits"
            ])
            u, m = map(int, out.decode().strip().split(','))
            gpu_utils.append(u)
            gpu_mem.append(m / 1024)  # MB -> GB
        except Exception as e:
            print(f"nvidia-smi polling error: {e}")
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

total_start_time = time.time()

label_map = {0: "Not Vulnerable", 1: "Vulnerable"}
predictions = []
row_times = []

model.eval()

with torch.no_grad():
    for example in tqdm(datasets['test'], desc="Running inference"):
        row_start = time.time()  # Start timer for this row
        
        code_snippet = example['code']  # Adjust column name if needed
        inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=-1).item()
        
        predictions.append({
            "code": code_snippet,
            "true_label": label_map[example['labels']],
            "predicted_label": label_map[pred]
        })
        
        row_end = time.time()
        row_times.append(row_end - row_start)

total_end_time = time.time()
total_duration = total_end_time - total_start_time

running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Running on: cuda


Running inference: 100%|██████████| 278/278 [00:03<00:00, 72.29it/s]


Avg GPU util: 33.95%
Std GPU util: 21.73%
Peak GPU mem: 0.9150 GB


In [3]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 0.547 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_unixcoder.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_unixcoder.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time, threading, subprocess, re
from tqdm import tqdm
import pandas as pd

model_name = "./codet5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Running on: {device}")

data_files = {'test': 'smart_contract_test.csv'}
datasets = load_dataset('csv', data_files=data_files)
datasets = datasets.rename_column('vulnerable', 'labels')
datasets = datasets.map(lambda example: {'labels': int(example['labels'])})

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        try:
            out = subprocess.check_output([
                "nvidia-smi",
                "--query-gpu=utilization.gpu,memory.used",
                "--format=csv,noheader,nounits"
            ])
            u, m = map(int, out.decode().strip().split(','))
            gpu_utils.append(u)
            gpu_mem.append(m / 1024)  # MB -> GB
        except Exception as e:
            print(f"nvidia-smi polling error: {e}")
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

total_start_time = time.time()

label_map = {0: "Not Vulnerable", 1: "Vulnerable"}
predictions = []
row_times = []

model.eval()

with torch.no_grad():
    for example in tqdm(datasets['test'], desc="Running inference"):
        row_start = time.time()  # Start timer for this row
        
        code_snippet = example['code']  # Adjust column name if needed
        inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=-1).item()
        
        predictions.append({
            "code": code_snippet,
            "true_label": label_map[example['labels']],
            "predicted_label": label_map[pred]
        })
        
        row_end = time.time()
        row_times.append(row_end - row_start)

total_end_time = time.time()
total_duration = total_end_time - total_start_time

running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Running on: cuda


Running inference:   0%|          | 0/278 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Running inference: 100%|██████████| 278/278 [00:11<00:00, 25.04it/s]


Avg GPU util: 39.60%
Std GPU util: 21.00%
Peak GPU mem: 1.5576 GB


In [3]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 1.189 GB.


In [None]:
import pickle

with open('_resource_data/gpu_utils_codet5.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_codet5.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [1]:
import pickle
import pandas as pd
import numpy as np

models = [
    "zs_codellama34b",
    "zs_codellama13b",
    "ftcodellama13b",
    "ours_detector",
    "codebert",
    "graphcodebert",
    "unixcoder",
    "codet5"
]

data_dir = "_resource_data"
summary_data = {
    "model": [],
    "avg_gpu_util": [],
    "std_gpu_util": [],
    "max_gpu_mem": []
}

for model in models:
    
    with open(f"{data_dir}/gpu_utils_{model}.pkl", "rb") as f:
        gpu_utils = pickle.load(f)
    
    with open(f"{data_dir}/gpu_mem_{model}.pkl", "rb") as f:
        gpu_mem = pickle.load(f)
    
    avg_gpu_util = np.mean(gpu_utils)
    std_gpu_util = np.std(gpu_utils, ddof=1)
    max_gpu_mem = np.max(gpu_mem)
    
    summary_data["model"].append(model)
    summary_data["avg_gpu_util"].append(float(f"{avg_gpu_util:.2f}"))
    summary_data["std_gpu_util"].append(float(f"{std_gpu_util:.2f}"))
    summary_data["max_gpu_mem"].append(float(f"{max_gpu_mem:.4f}"))

df_summary = pd.DataFrame(summary_data)
df_summary.sort_values(by="model", inplace=True)
df_summary.reset_index(drop=True, inplace=True)

df_summary

Unnamed: 0,model,avg_gpu_util,std_gpu_util,max_gpu_mem
0,codebert,43.79,24.55,0.9111
1,codet5,39.6,21.0,1.5576
2,ftcodellama13b,62.41,24.79,12.1436
3,graphcodebert,43.53,24.85,0.9111
4,ours_detector,26.65,3.81,3.7881
5,unixcoder,33.95,21.73,0.915
6,zs_codellama13b,66.42,22.95,11.0928
7,zs_codellama34b,85.09,9.99,19.5986


In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama/CodeLlama-34b-Instruct-hf",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 06:01:00 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 06:01:00 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 06:01:02,506	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 7/7 [03:20<00:00, 28.69s/it]


codellama/CodeLlama-34b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [None]:
from datasets import load_dataset

data_files = {
    'train': 'severity_data_train.csv',
    'val': 'severity_data_val.csv',
    'test': 'severity_data_test.csv'
}

dataset = load_dataset('csv', data_files=data_files)

import random
from datasets import concatenate_datasets, DatasetDict

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world that only answer in one word between "low", "medium" or "high".""",
]

PROMPTS = [
    """You need to analyze the given vulnerability explanation and classify the severity of it as "low", "medium" or "high" based on the impact of that vulnerability. Dont give any additional explanations just give the label (low, medium or high).
    
    This is the vulnerability explanation we need to analyze:
    {explanation}
    
Answer: """,
]

def make_conversational_vote(examples, system_prompt, prompt):
    explanation = examples['vuln_explanation']
    severity = examples['severity']
    selected_system_prompt = system_prompt
    selected_user_prompt = prompt
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(explanation=explanation)}]
    
    return { "conversations" : conversation, 'label': severity}

datasets = []
for i in range(len(PROMPTS)):
    new_dataset = dataset.map(lambda ex: make_conversational_vote(ex, SYSTEM_PROMPT[i], PROMPTS[i]))
    new_dataset = new_dataset.remove_columns(["file_name", "vuln_title", "vuln_recommendation", "vuln_code"])
    datasets.append(new_dataset)
    
test_dataset = datasets[0]['test']

In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations']):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                             temperature = 0.6, min_p = 0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)
    
    # Extract the label using regex
    match = re.search(pattern, decoded_output, re.DOTALL)
    extracted_label = match.group(1).strip() if match else "Unknown"

    # Append the extracted label
    y_pred.append(extracted_label)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/143 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 143/143 [02:12<00:00,  1.08it/s]

Avg GPU util: 90.22%
Std GPU util: 17.20%
Peak GPU mem: 19.5791 GB





In [5]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 19.188 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_sev_zs_codellama34b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_sev_zs_codellama34b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama/CodeLlama-13b-Instruct-hf",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 06:11:20 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 06:11:20 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 06:11:22,730	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [01:16<00:00, 25.56s/it]


codellama/CodeLlama-13b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [None]:
from datasets import load_dataset

data_files = {
    'train': 'severity_data_train.csv',
    'val': 'severity_data_val.csv',
    'test': 'severity_data_test.csv'
}

dataset = load_dataset('csv', data_files=data_files)

import random
from datasets import concatenate_datasets, DatasetDict

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world that only answer in one word between "low", "medium" or "high".""",
]

PROMPTS = [
    """You need to analyze the given vulnerability explanation and classify the severity of it as "low", "medium" or "high" based on the impact of that vulnerability. Dont give any additional explanations just give the label (low, medium or high).
    
    This is the vulnerability explanation we need to analyze:
    {explanation}
    
Answer: """,
]

def make_conversational_vote(examples, system_prompt, prompt):
    explanation = examples['vuln_explanation']
    severity = examples['severity']
    selected_system_prompt = system_prompt
    selected_user_prompt = prompt
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(explanation=explanation)}]
    
    return { "conversations" : conversation, 'label': severity}

datasets = []
for i in range(len(PROMPTS)):
    new_dataset = dataset.map(lambda ex: make_conversational_vote(ex, SYSTEM_PROMPT[i], PROMPTS[i]))
    new_dataset = new_dataset.remove_columns(["file_name", "vuln_title", "vuln_recommendation", "vuln_code"])
    datasets.append(new_dataset)
    
test_dataset = datasets[0]['test']

In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations']):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                             temperature = 0.6, min_p = 0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)
    
    # Extract the label using regex
    match = re.search(pattern, decoded_output, re.DOTALL)
    extracted_label = match.group(1).strip() if match else "Unknown"

    # Append the extracted label
    y_pred.append(extracted_label)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/143 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 143/143 [00:58<00:00,  2.47it/s]

Avg GPU util: 77.57%
Std GPU util: 23.04%
Peak GPU mem: 10.2900 GB





In [5]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 12.332 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_sev_zs_codellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_sev_zs_codellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "severity_codellama",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 08:46:30 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 08:46:30 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 08:46:32,841	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:25<00:00,  8.53s/it]
Unsloth 2025.5.3 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'severity_data_train.csv',
    'val': 'severity_data_val.csv',
    'test': 'severity_data_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random
from datasets import concatenate_datasets, DatasetDict

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world that only answer in one word between "low", "medium" or "high".""",
]

PROMPTS = [
    """You need to analyze the given vulnerability explanation and classify the severity of it as "low", "medium" or "high" based on the impact of that vulnerability. Dont give any additional explanations just give the label (low, medium or high).
    
    This is the vulnerability explanation we need to analyze:
    {explanation}
    
Answer: """,
]

def make_conversational_vote(examples, system_prompt, prompt):
    explanation = examples['vuln_explanation']
    severity = examples['severity']
    selected_system_prompt = system_prompt
    selected_user_prompt = prompt
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(explanation=explanation)}]
    
    return { "conversations" : conversation, 'label': severity}

# Generate 5 datasets using different prompts
datasets = []
for i in range(len(PROMPTS)):
    new_dataset = dataset.map(lambda ex: make_conversational_vote(ex, SYSTEM_PROMPT[i], PROMPTS[i]))
    new_dataset = new_dataset.remove_columns(["file_name", "vuln_title", "vuln_explanation", "vuln_recommendation", "vuln_code"])
    datasets.append(new_dataset)
    
test_dataset = datasets[0]['test']

In [4]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations']):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                             temperature = 0.6, min_p = 0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)
    
    # Extract the label using regex
    match = re.search(pattern, decoded_output, re.DOTALL)
    extracted_label = match.group(1).strip() if match else "Unknown"

    # Append the extracted label
    y_pred.append(extracted_label)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

In [None]:
import pickle
    
with open('_resource_data/gpu_utils_sev_ftcodellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_sev_ftcodellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 32768
dtype = None 
load_in_4bit = False 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "ours_severity",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    full_finetuning = True
)
FastLanguageModel.for_inference(model)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 06:18:12 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 06:18:12 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 06:18:14,784	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024, padding_idx=151654)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_atten

In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'severity_data_train.csv',
    'val': 'severity_data_val.csv',
    'test': 'severity_data_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random
from datasets import concatenate_datasets, DatasetDict

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world.""",
    """You are the greatest AI assistant smart contract security auditor in the world.""",
    """You are the best solidity smart contract security auditor in the world""",
    """You are the greatest AI assistant solidity security researcher in the world""",
    """You are the best AI solidity smart contract security auditor in the world."""
]

PROMPTS = [
    """You need to analyze the given vulnerability explanation and classify the severity of it as "low", "medium" or "high" based on the impact of that vulnerability.
    
    This is the vulnerability explanation we need to analyze:
    {explanation}
    
Answer: """,
    
    """Analyze the given vulnerability explanation and determine whether the severity is "low", "medium" or "high" based on the impact if the vulnerability exploited.
    
    Vulnerability explanation to analyze:
    {explanation}
    
Answer: """,

    """Examine the given vulnerability explanation below and assess the severity is it "low", "medium" or "high".
    
    Vulnerability explanation to examine:
    {explanation}
    
Answer: """,

    """Review the vulnerability explanation and classify the severity as "low", "medium" or "high" by checking if the exploitation is occured then how is it will impact the system.
    
    Vulnerability explanation:
    {explanation}
    
Answer: """,

    """Audit the given vulnerability explanation to determine if the severity should be categorized as "low", "medium" or "high".
    
    Vulnerability explanation:
    {explanation}
    
Answer: """,
]

def make_conversational_vote(examples, system_prompt, prompt):
    explanation = examples['vuln_explanation']
    severity = examples['severity']
    selected_system_prompt = system_prompt
    selected_user_prompt = prompt
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(explanation=explanation)}]
    
    return { "conversations" : conversation, 'label': severity}

# Generate 5 datasets using different prompts
datasets = []
for i in range(len(PROMPTS)):
    new_dataset = dataset.map(lambda ex: make_conversational_vote(ex, SYSTEM_PROMPT[i], PROMPTS[i]))
    new_dataset = new_dataset.remove_columns(["file_name", "vuln_title", "vuln_explanation", "vuln_recommendation", "vuln_code"])
    datasets.append(new_dataset)

Map: 100%|██████████| 662/662 [00:00<00:00, 5176.79 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 4839.41 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4799.01 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 6137.45 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 4887.62 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4840.96 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 6171.47 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 4986.40 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4948.44 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 6220.59 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 5017.15 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4884.13 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 6236.00 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 5000.93 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4942.69 examples/s]


In [6]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

import time
from tqdm import tqdm
import re

# Regex pattern for extracting labels
pattern = r"<\|im_start\|>assistant\n<think>\n\n</think>\n\n(.*?)<\|im_end\|>"

y_preds = [[] for _ in range(5)]
row_times = []  # Store total time per row (inference + voting)
final_predictions = []

# Iterate through rows with normal tqdm
for row_idx, messages_set in enumerate(tqdm(zip(*[d['test']['conversations'] for d in datasets]), desc="Processing rows", total=dataset['test'].num_rows)):
    total_start = time.time()  # Start timer for this row

    preds_for_row = []

    # Inference for each dataset
    for i, messages in enumerate(messages_set):
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            enable_thinking=False,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=8,
            use_cache=True,
            temperature=0.1,
            min_p=0.1
        )
        decoded_output = tokenizer.batch_decode(outputs)[0]
        match = re.search(pattern, decoded_output, re.DOTALL)
        extracted_label = match.group(1).strip() if match else "Unknown"
        y_preds[i].append(extracted_label)
        preds_for_row.append(extracted_label)

    # Majority voting
    vote_count = {"low": 0, "medium": 0, "high": 0}
    for pred in preds_for_row:
        if pred in vote_count:
            vote_count[pred] += 1
    final_prediction = max(vote_count, key=vote_count.get)
    final_predictions.append(final_prediction)

    total_end = time.time()  # Stop timer for this row
    row_times.append(total_end - total_start)

running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

Processing rows: 100%|██████████| 143/143 [01:33<00:00,  1.53it/s]

Avg GPU util: 25.27%
Std GPU util: 3.82%
Peak GPU mem: 3.6592 GB





In [8]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 3.27 GB.


In [None]:
import pickle

with open('_resource_data/gpu_utils_ours_severity.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_ours_severity.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time, threading, subprocess, re
from tqdm import tqdm
import pandas as pd

model_name = "./severity_codebert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

print(f"Running on: {device}")
data_files = {'test': 'severity_data_test.csv'}
datasets = load_dataset('csv', data_files=data_files)
severity_mapping = {"low": 0, "medium": 1, "high": 2}
datasets = datasets.map(lambda example: {'labels': severity_mapping[example['severity']]})

gpu_utils, gpu_mem = [], []

running = True

def poll():
    while running:
        try:
            out = subprocess.check_output([
                "nvidia-smi",
                "--query-gpu=utilization.gpu,memory.used",
                "--format=csv,noheader,nounits"
            ])
            u, m = map(int, out.decode().strip().split(','))
            gpu_utils.append(u)
            gpu_mem.append(m / 1024)  # MB -> GB
        except Exception as e:
            print(f"nvidia-smi polling error: {e}")
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)

thread.start()
total_start_time = time.time()
label_map = {0: "low", 1: "medium", 2: "high"}
predictions = []
row_times = []

model.eval()

with torch.no_grad():
    for example in tqdm(datasets['test'], desc="Running inference"):
        row_start = time.time()  # Start timer for this row

        vuln_exp = example['vuln_explanation']  # Adjust column name if needed
        inputs = tokenizer(vuln_exp, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=-1).item()

        predictions.append({
            "vuln_explanation": vuln_exp,
            "true_label": label_map[example['labels']],
            "predicted_label": label_map[pred]
        })

        row_end = time.time()

        row_times.append(row_end - row_start)

total_end_time = time.time()
total_duration = total_end_time - total_start_time

running = False

thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Running on: cuda


Running inference: 100%|██████████| 143/143 [00:02<00:00, 56.83it/s]

Avg GPU util: 28.08%
Std GPU util: 25.08%
Peak GPU mem: 0.9111 GB





In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 0.543 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_sev_codebert.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_sev_codebert.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time, threading, subprocess, re
from tqdm import tqdm
import pandas as pd

model_name = "severity_graphcodebert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

print(f"Running on: {device}")
data_files = {'test': 'severity_data_test.csv'}
datasets = load_dataset('csv', data_files=data_files)
severity_mapping = {"low": 0, "medium": 1, "high": 2}
datasets = datasets.map(lambda example: {'labels': severity_mapping[example['severity']]})

gpu_utils, gpu_mem = [], []

running = True

def poll():
    while running:
        try:
            out = subprocess.check_output([
                "nvidia-smi",
                "--query-gpu=utilization.gpu,memory.used",
                "--format=csv,noheader,nounits"
            ])
            u, m = map(int, out.decode().strip().split(','))
            gpu_utils.append(u)
            gpu_mem.append(m / 1024)  # MB -> GB
        except Exception as e:
            print(f"nvidia-smi polling error: {e}")
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)

thread.start()
total_start_time = time.time()
label_map = {0: "low", 1: "medium", 2: "high"}
predictions = []
row_times = []

model.eval()

with torch.no_grad():
    for example in tqdm(datasets['test'], desc="Running inference"):
        row_start = time.time()  # Start timer for this row

        vuln_exp = example['vuln_explanation']  # Adjust column name if needed
        inputs = tokenizer(vuln_exp, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=-1).item()

        predictions.append({
            "vuln_explanation": vuln_exp,
            "true_label": label_map[example['labels']],
            "predicted_label": label_map[pred]
        })

        row_end = time.time()

        row_times.append(row_end - row_start)

total_end_time = time.time()
total_duration = total_end_time - total_start_time

running = False

thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Running on: cuda


Running inference: 100%|██████████| 143/143 [00:02<00:00, 59.72it/s]


Avg GPU util: 28.67%
Std GPU util: 25.03%
Peak GPU mem: 0.9111 GB


In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 0.543 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_sev_graphcodebert.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_sev_graphcodebert.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time, threading, subprocess, re
from tqdm import tqdm
import pandas as pd

model_name = "./severity_unixcoder"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

print(f"Running on: {device}")
data_files = {'test': 'severity_data_test.csv'}
datasets = load_dataset('csv', data_files=data_files)
severity_mapping = {"low": 0, "medium": 1, "high": 2}
datasets = datasets.map(lambda example: {'labels': severity_mapping[example['severity']]})

gpu_utils, gpu_mem = [], []

running = True

def poll():
    while running:
        try:
            out = subprocess.check_output([
                "nvidia-smi",
                "--query-gpu=utilization.gpu,memory.used",
                "--format=csv,noheader,nounits"
            ])
            u, m = map(int, out.decode().strip().split(','))
            gpu_utils.append(u)
            gpu_mem.append(m / 1024)  # MB -> GB
        except Exception as e:
            print(f"nvidia-smi polling error: {e}")
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)

thread.start()
total_start_time = time.time()
label_map = {0: "low", 1: "medium", 2: "high"}
predictions = []
row_times = []

model.eval()

with torch.no_grad():
    for example in tqdm(datasets['test'], desc="Running inference"):
        row_start = time.time()  # Start timer for this row

        vuln_exp = example['vuln_explanation']  # Adjust column name if needed
        inputs = tokenizer(vuln_exp, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=-1).item()

        predictions.append({
            "vuln_explanation": vuln_exp,
            "true_label": label_map[example['labels']],
            "predicted_label": label_map[pred]
        })

        row_end = time.time()

        row_times.append(row_end - row_start)

total_end_time = time.time()
total_duration = total_end_time - total_start_time

running = False

thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Running on: cuda


Running inference: 100%|██████████| 143/143 [00:02<00:00, 55.60it/s]

Avg GPU util: 25.00%
Std GPU util: 24.08%
Peak GPU mem: 0.9150 GB





In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 0.547 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_sev_unixcoder.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_sev_unixcoder.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time, threading, subprocess, re
from tqdm import tqdm
import pandas as pd

model_name = "./severity_codet5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

print(f"Running on: {device}")
data_files = {'test': 'severity_data_test.csv'}
datasets = load_dataset('csv', data_files=data_files)
severity_mapping = {"low": 0, "medium": 1, "high": 2}
datasets = datasets.map(lambda example: {'labels': severity_mapping[example['severity']]})

gpu_utils, gpu_mem = [], []

running = True

def poll():
    while running:
        try:
            out = subprocess.check_output([
                "nvidia-smi",
                "--query-gpu=utilization.gpu,memory.used",
                "--format=csv,noheader,nounits"
            ])
            u, m = map(int, out.decode().strip().split(','))
            gpu_utils.append(u)
            gpu_mem.append(m / 1024)  # MB -> GB
        except Exception as e:
            print(f"nvidia-smi polling error: {e}")
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)

thread.start()
total_start_time = time.time()
label_map = {0: "low", 1: "medium", 2: "high"}
predictions = []
row_times = []

model.eval()

with torch.no_grad():
    for example in tqdm(datasets['test'], desc="Running inference"):
        row_start = time.time()  # Start timer for this row

        vuln_exp = example['vuln_explanation']  # Adjust column name if needed
        inputs = tokenizer(vuln_exp, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=-1).item()

        predictions.append({
            "vuln_explanation": vuln_exp,
            "true_label": label_map[example['labels']],
            "predicted_label": label_map[pred]
        })

        row_end = time.time()

        row_times.append(row_end - row_start)

total_end_time = time.time()
total_duration = total_end_time - total_start_time

running = False

thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  from .autonotebook import tqdm as notebook_tqdm


Running on: cuda


Running inference:   0%|          | 0/143 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Running inference: 100%|██████████| 143/143 [00:06<00:00, 22.09it/s]


Avg GPU util: 36.26%
Std GPU util: 19.75%
Peak GPU mem: 1.5479 GB


In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 1.18 GB.


In [None]:
import pickle

with open('_resource_data/times_sev_codet5.pkl', 'wb') as file:
    pickle.dump(row_times, file)
    
with open('_resource_data/gpu_utils_sev_codet5.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_sev_codet5.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [3]:
import pickle
import pandas as pd
import numpy as np

models = [
    "sev_zs_codellama34b",
    "sev_zs_codellama13b",
    "sev_ftcodellama13b",
    "ours_severity",
    "sev_codebert",
    "sev_graphcodebert",
    "sev_unixcoder",
    "sev_codet5"
]

data_dir = "_resource_data"
summary_data = {
    "model": [],
    "avg_gpu_util": [],
    "std_gpu_util": [],
    "max_gpu_mem": []
}

for model in models:    
    with open(f"{data_dir}/gpu_utils_{model}.pkl", "rb") as f:
        gpu_utils = pickle.load(f)
    
    with open(f"{data_dir}/gpu_mem_{model}.pkl", "rb") as f:
        gpu_mem = pickle.load(f)
    
    avg_gpu_util = np.mean(gpu_utils)
    std_gpu_util = np.std(gpu_utils, ddof=1)
    max_gpu_mem = np.max(gpu_mem)
    
    summary_data["model"].append(model)
    summary_data["avg_gpu_util"].append(float(f"{avg_gpu_util:.2f}"))
    summary_data["std_gpu_util"].append(float(f"{std_gpu_util:.2f}"))
    summary_data["max_gpu_mem"].append(float(f"{max_gpu_mem:.4f}"))

df_summary = pd.DataFrame(summary_data)
df_summary.sort_values(by="model", inplace=True)
df_summary.reset_index(drop=True, inplace=True)

df_summary

Unnamed: 0,model,avg_gpu_util,std_gpu_util,max_gpu_mem
0,ours_severity,25.27,3.82,3.6592
1,sev_codebert,28.08,25.08,0.9111
2,sev_codet5,36.26,19.75,1.5479
3,sev_ftcodellama13b,70.44,25.86,11.3604
4,sev_graphcodebert,28.67,25.03,0.9111
5,sev_unixcoder,25.0,24.08,0.915
6,sev_zs_codellama13b,77.57,23.04,10.29
7,sev_zs_codellama34b,90.22,17.2,19.5791


In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama/CodeLlama-34b-Instruct-hf",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 06:34:24 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 06:34:24 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 06:34:26,089	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 7/7 [03:24<00:00, 29.16s/it]


codellama/CodeLlama-34b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [None]:
from datasets import load_dataset

data_files = {
    'train': 'vuln_data_train.csv',
    'val': 'vuln_data_val.csv',
    'test': 'vuln_data_test.csv'
}

dataset = load_dataset('csv', data_files=data_files)

import random

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world.""",
]

PROMPTS = [
    """The given Solidity function is vulnerable. Review the code and analyze its security flaws. Just give short explanation why these function is vulnerable.
    
    This is the function we need to audit:
    ```solidity
    {code}
    ```
    
Vulnerability : """,
]


def make_conversational(examples):
    code = examples['vuln_code']
    severity = examples['severity']
    desc = examples['vuln_explanation']
    assistant_prompt = desc
    selected_system_prompt = random.choice(SYSTEM_PROMPT)
    selected_user_prompt = random.choice(PROMPTS) 
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(code=code)},
                    {"role": "assistant", "content": assistant_prompt}]
    return { "conversations" : conversation, }

dataset = dataset.map(make_conversational)
dataset = dataset.remove_columns(["vuln_title", "vuln_explanation", "severity", "vuln_recommendation"])

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True,)

test_dataset = dataset['test']
test_dataset = test_dataset.map(lambda row: {'answer':row['conversations'][-1]})
test_dataset = test_dataset.map(lambda row: {'conversations':row['conversations'][:-1]})

Generating train split: 662 examples [00:00, 8316.22 examples/s]
Generating val split: 142 examples [00:00, 14993.23 examples/s]
Generating test split: 143 examples [00:00, 14962.09 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 5467.34 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 4900.90 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4847.10 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 6338.34 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 6096.18 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 6033.03 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 5000.21 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4900.29 examples/s]


In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations']):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids=inputs, max_new_tokens=2048, use_cache=True,
                             temperature=0.1, min_p=0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)

    y_pred.append(decoded_output)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/143 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 143/143 [56:00<00:00, 23.50s/it] 

Avg GPU util: 83.69%
Std GPU util: 6.04%
Peak GPU mem: 19.8115 GB





In [5]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 19.42 GB.


In [None]:
import pickle

with open('_resource_data/times_exp_zs_codellama34b.pkl', 'wb') as file:
    pickle.dump(row_times, file)
    
with open('_resource_data/gpu_utils_exp_zs_codellama34b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_exp_zs_codellama34b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama/CodeLlama-13b-Instruct-hf",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 07:39:15 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 07:39:15 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 07:39:17,174	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [01:17<00:00, 25.67s/it]


codellama/CodeLlama-13b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [None]:
from datasets import load_dataset

data_files = {
    'train': 'vuln_data_train.csv',
    'val': 'vuln_data_val.csv',
    'test': 'vuln_data_test.csv'
}

dataset = load_dataset('csv', data_files=data_files)

import random

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world.""",
]

PROMPTS = [
    """The given Solidity function is vulnerable. Review the code and analyze its security flaws. Just give short explanation why these function is vulnerable.
    
    This is the function we need to audit:
    ```solidity
    {code}
    ```
    
Vulnerability : """,
]


def make_conversational(examples):
    code = examples['vuln_code']
    severity = examples['severity']
    desc = examples['vuln_explanation']
    assistant_prompt = desc
    selected_system_prompt = random.choice(SYSTEM_PROMPT)
    selected_user_prompt = random.choice(PROMPTS) 
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(code=code)},
                    {"role": "assistant", "content": assistant_prompt}]
    return { "conversations" : conversation, }

dataset = dataset.map(make_conversational)
dataset = dataset.remove_columns(["vuln_title", "vuln_explanation", "severity", "vuln_recommendation"])

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True,)

test_dataset = dataset['test']
test_dataset = test_dataset.map(lambda row: {'answer':row['conversations'][-1]})
test_dataset = test_dataset.map(lambda row: {'conversations':row['conversations'][:-1]})

Map: 100%|██████████| 662/662 [00:00<00:00, 5292.69 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 4903.52 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4953.47 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 6797.27 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 5915.74 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 6005.90 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 5205.30 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4997.55 examples/s]


In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations']):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids=inputs, max_new_tokens=2048, use_cache=True,
                             temperature=0.1, min_p=0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)

    y_pred.append(decoded_output)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/143 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 143/143 [58:10<00:00, 24.41s/it] 

Avg GPU util: 49.77%
Std GPU util: 5.97%
Peak GPU mem: 12.3545 GB





In [5]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 12.332 GB.


In [None]:
import pickle

with open('_resource_data/times_exp_zs_codellama13b.pkl', 'wb') as file:
    pickle.dump(row_times, file)
    
with open('_resource_data/gpu_utils_exp_zs_codellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_exp_zs_codellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama13b_vuln_exp",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-16 08:49:53 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-16 08:49:53 [__init__.py:239] Automatically detected platform cuda.


2025-07-16 08:49:55,936	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:26<00:00,  8.75s/it]
Unsloth 2025.5.3 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'vuln_data_train.csv',
    'val': 'vuln_data_val.csv',
    'test': 'vuln_data_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world.""",
]

PROMPTS = [
    """The given Solidity function is vulnerable. Review the code and analyze its security flaws.
    
    This is the function we need to audit:
    ```solidity
    {code}
    ```
    
Vulnerability Explanation: """,
]

def make_conversational(examples):
    code = examples['vuln_code']
    desc = examples['vuln_explanation']
    assistant_prompt = desc
    selected_system_prompt = random.choice(SYSTEM_PROMPT)
    selected_user_prompt = random.choice(PROMPTS) 
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(code=code)},
                    {"role": "assistant", "content": assistant_prompt}]
    return { "conversations" : conversation, }

dataset = dataset.map(make_conversational)
dataset = dataset.remove_columns(["vuln_title", "vuln_explanation", "severity", "vuln_recommendation"])

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True,)

test_dataset = dataset['test']
test_dataset = test_dataset.map(lambda row: {'answer':row['conversations'][-1]})
test_dataset = test_dataset.map(lambda row: {'conversations':row['conversations'][:-1]})

In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations']):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids=inputs, max_new_tokens=2048, use_cache=True,
                             temperature=0.1, min_p=0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)

    y_pred.append(decoded_output)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/143 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 143/143 [3:33:33<00:00, 89.60s/it]   

Avg GPU util: 39.49%
Std GPU util: 3.91%
Peak GPU mem: 12.7861 GB





In [5]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 12.395 GB.


In [None]:
import pickle

with open('_resource_data/times_exp_ftcodellama13b.pkl', 'wb') as file:
    pickle.dump(row_times, file)
    
with open('_resource_data/gpu_utils_exp_ftcodellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_exp_ftcodellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
from unsloth import FastLanguageModel

max_seq_length = 32768 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "ours_explanator",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-17 03:37:47 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-17 03:37:47 [__init__.py:239] Automatically detected platform cuda.


2025-07-17 03:37:49,191	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.5.3 patched 36 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 2560, padding_idx=151654)
        (layers): ModuleList(
          (0-1): 2 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.2, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_p

In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'vuln_data_train.csv',
    'val': 'vuln_data_val.csv',
    'test': 'vuln_data_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random
from datasets import concatenate_datasets, DatasetDict

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world""",
    """You are the greatest AI assistant smart contract security auditor in the world""",
    """You are the best solidity smart contract security auditor in the world""",
    """You are the greatest AI assistant solidity security researcher in the world""",
    """You are the best AI solidity smart contract security auditor in the world"""
]

PROMPTS = [
    """The given Solidity function is vulnerable. Review the code and analyze its security flaws.
    
    This is the function we need to audit:
    ```solidity
    {code}
    ```
    
Vulnerability : """,

    """The following Solidity function contains security vulnerabilities. Examine the function and explain the specific weaknesses that make it insecure.
    
    Function for review:
    ```solidity
    {code}
    ```
    
Vulnerability : """,

    """The provided Solidity function has security vulnerabilities. Identify and explain the security issues present in the code.
    
    Code for analysis:
    ```solidity
    {code}
    ```
    
Vulnerability : """,

    """The Solidity function below is vulnerable. Describe the security flaws and their potential risks.
    
    Function under review:
    ```solidity
    {code}
    ```
    
Vulnerability : """,

    """Analyze the following Solidity function. It has security vulnerabilities that need to be explained.
    
    Function to audit:
    ```solidity
    {code}
    ```
    
Vulnerability : """,
]

def make_conversational_vote(examples, system_prompt, prompt):
    code = examples['vuln_code']
    desc = examples['vuln_explanation']
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt.format(code=code)}
    ]
    return {"conversations": conversation, "label": desc}

# Generate 5 datasets using different prompts
datasets = []
for i in range(len(PROMPTS)):
    new_dataset = dataset.map(lambda ex: make_conversational_vote(ex, SYSTEM_PROMPT[i], PROMPTS[i]))
    new_dataset = new_dataset.remove_columns(["vuln_title", "vuln_explanation", "severity", "vuln_recommendation"])
    datasets.append(new_dataset)

In [None]:
import time, threading, subprocess
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

gpu_utils, gpu_mem = [], []
running = True

def poll_gpu():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().strip().split(','))
        gpu_utils.append(u)
        gpu_mem.append(m / 1024)
        time.sleep(0.2)

monitor_thread = threading.Thread(target=poll_gpu, daemon=True)
monitor_thread.start()

template = """We have several vulnerability analysis for the smart contract security audit.
Your task is to produce a vulnerability explanation that selects and integrates only the most accurate, logical, reasonable, coherent and credible vulnerabilities from the existing analyses.
    
Make sure to make only one vulnerability explanation in your final answer.
Also make sure your final answer only contain the vulnerability explanation, without any additional commentary.

###Smart Contract Under Review  
```solidity
{code}
````

###Analysis 1
{output_0}

###Analysis 2
{output_1}

###Analysis 3
{output_2}

###Analysis 4
{output_3}

###Analysis 5:
{output_4}

Final Answer:
"""

generation_kwargs = dict(
    do_sample=True,
    temperature=0.6,
    top_k=20,
    top_p=0.95,
    min_p=0,
    repetition_penalty=1.2,
    max_new_tokens=8192,
    use_cache=True
)

proposal_times, aggregation_times, total_times = [], [], []
results = []

# num_rows = len(datasets[0]['test'])
num_rows = 50

for row_idx in tqdm(range(num_rows), desc="End-to-End Pipeline"):
    t_total_start = time.time()

    proposal_outputs = []
    t0 = time.time()

    for dataset in datasets:
        example = dataset['test'][row_idx]
        proposal_input = tokenizer.apply_chat_template(
            example['conversations'],
            tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to("cuda")
        proposal_output = model.generate(
            input_ids=proposal_input,
            **generation_kwargs
        )
        decoded_output = tokenizer.decode(
            proposal_output[0], skip_special_tokens=True
        )
        proposal_outputs.append(decoded_output)

    dt_prop = time.time() - t0
    proposal_times.append(dt_prop)

    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

    t1 = time.time()
    aggregation_prompt = template.format(
        code=example["vuln_code"],
        output_0=proposal_outputs[0],
        output_1=proposal_outputs[1],
        output_2=proposal_outputs[2],
        output_3=proposal_outputs[3],
        output_4=proposal_outputs[4],
    )
    aggregation_messages = [
        {"role": "system", "content": "You are an expert in smart-contract security audits."},
        {"role": "user", "content": aggregation_prompt},
    ]
    aggregation_input = tokenizer.apply_chat_template(
        aggregation_messages,
        tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    with model.disable_adapter():
        aggregation_output = model.generate(
            input_ids=aggregation_input,
            **generation_kwargs
        )

    aggregation_decoded = tokenizer.decode(
        aggregation_output[0], skip_special_tokens=True
    )
    dt_aggr = time.time() - t1
    aggregation_times.append(dt_aggr)

    total_times.append(time.time() - t_total_start)
    results.append({
        "vuln_code": example["vuln_code"], 
        "proposal_0": proposal_outputs[0],
        "proposal_1": proposal_outputs[1],
        "proposal_2": proposal_outputs[2],
        "proposal_3": proposal_outputs[3],
        "proposal_4": proposal_outputs[4],
        "aggregated": aggregation_decoded,
        "time_proposal": dt_prop,
        "time_aggregation": dt_aggr,
        "time_total": total_times[-1]
    })

running = False
monitor_thread.join()

df = pd.DataFrame(results)
# df.to_csv("_resource_data/exp_result.csv", index=False)

print("\n=== Benchmark Summary ===")
print(f"Total examples: {len(df)}")
print("\nGPU Usage:")
print(f"Avg GPU Utilization: {np.mean(gpu_utils):.2f}% ±{np.std(gpu_utils, ddof=1):.2f}%")
print(f"Peak VRAM Usage: {np.max(gpu_mem):.4f} GB")

In [5]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 5.502 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_exp_ours.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_exp_ours.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
import pickle
import pandas as pd
import numpy as np

models = [
    "exp_zs_codellama34b",
    "exp_zs_codellama13b",
    "exp_ftcodellama13b",
    "exp_ours",
]

data_dir = "_resource_data"
summary_data = {
    "model": [],
    "avg_gpu_util": [],
    "std_gpu_util": [],
    "max_gpu_mem": []
}

sampling_interval = 0.2 

for model in models:
    with open(f"{data_dir}/gpu_utils_{model}.pkl", "rb") as f:
        gpu_utils = pickle.load(f)
    with open(f"{data_dir}/gpu_mem_{model}.pkl", "rb") as f:
        gpu_mem = pickle.load(f)
    
    if model != "exp_ours":
        with open(f"{data_dir}/times_{model}.pkl", "rb") as f:
            row_times = pickle.load(f)

        total_time = sum(row_times[:50])
        
        num_samples = int(total_time / sampling_interval)

        gpu_utils = gpu_utils[:num_samples]
        gpu_mem = gpu_mem[:num_samples]

        row_times = row_times[:50]

    else:
        print(f"[{model}] Using full GPU monitoring data")

    avg_gpu_util = np.mean(gpu_utils)
    std_gpu_util = np.std(gpu_utils, ddof=1)
    max_gpu_mem = np.max(gpu_mem)
    
    summary_data["model"].append(model)
    summary_data["avg_gpu_util"].append(float(f"{avg_gpu_util:.2f}"))
    summary_data["std_gpu_util"].append(float(f"{std_gpu_util:.2f}"))
    summary_data["max_gpu_mem"].append(float(f"{max_gpu_mem:.4f}"))

df_summary = pd.DataFrame(summary_data)
df_summary.sort_values(by="model", inplace=True)
df_summary.reset_index(drop=True, inplace=True)

df_summary

[exp_zs_codellama34b] Total time for 50 rows: 1170.71s
[exp_zs_codellama34b] Cropping GPU data to 5853 samples (~1170.60s)
[exp_zs_codellama13b] Total time for 50 rows: 1207.19s
[exp_zs_codellama13b] Cropping GPU data to 6035 samples (~1207.00s)
[exp_ftcodellama13b] Total time for 50 rows: 3846.81s
[exp_ftcodellama13b] Cropping GPU data to 19234 samples (~3846.80s)
[exp_ours] Using full GPU monitoring data


Unnamed: 0,model,avg_gpu_util,std_gpu_util,max_gpu_mem
0,exp_ftcodellama13b,38.91,4.11,11.1221
1,exp_ours,36.76,11.65,9.5107
2,exp_zs_codellama13b,49.2,6.03,10.5303
3,exp_zs_codellama34b,82.9,6.65,19.5928


In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama/CodeLlama-34b-Instruct-hf",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-17 09:29:21 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-17 09:29:21 [__init__.py:239] Automatically detected platform cuda.


2025-07-17 09:29:23,101	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 7/7 [03:20<00:00, 28.59s/it]


codellama/CodeLlama-34b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'vuln_data_train.csv',
    'val': 'vuln_data_val.csv',
    'test': 'vuln_data_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random
from datasets import concatenate_datasets, DatasetDict

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world""",
]

PROMPTS = [
    """The given Solidity function is vulnerable and there was an explanation about the vulnerability. Give your patch recommendation based on that vulnerability. Just straight to the recommendation without any additional information.
    
    This is the vulnerable functions:
    ```solidity
    {code}
    ```
    
This is the vulnerability explanation: 
{explanation}

Recommendation:
""",
]

def make_conversational(examples):
    code = examples['vuln_code']
    desc = examples['vuln_explanation']
    recom = examples['vuln_recommendation']
    selected_system_prompt = random.choice(SYSTEM_PROMPT)
    selected_user_prompt = random.choice(PROMPTS) 
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(code=code, explanation=desc)}]
    return { "conversations" : conversation, "label": recom}

dataset = dataset.map(make_conversational)

test_dataset = dataset['test']

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True,)

Map: 100%|██████████| 662/662 [00:00<00:00, 5114.81 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 4548.44 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4660.56 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 7799.41 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 6437.78 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 6523.95 examples/s]


In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations'][:50]):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids=inputs, max_new_tokens=2048, use_cache=True,
                             temperature=0.1, min_p=0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)

    y_pred.append(decoded_output)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/50 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 50/50 [31:37<00:00, 37.95s/it]


Avg GPU util: 88.55%
Std GPU util: 7.01%
Peak GPU mem: 19.7354 GB


In [6]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 19.344 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_rec_zs_codellama34b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_rec_zs_codellama34b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama/CodeLlama-13b-Instruct-hf",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-17 10:14:28 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-17 10:14:28 [__init__.py:239] Automatically detected platform cuda.


2025-07-17 10:14:30,517	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [01:16<00:00, 25.59s/it]


codellama/CodeLlama-13b-Instruct-hf does not have a padding token! Will use pad_token = <unk>.


In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'vuln_data_train.csv',
    'val': 'vuln_data_val.csv',
    'test': 'vuln_data_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random
from datasets import concatenate_datasets, DatasetDict

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world""",
]

PROMPTS = [
    """The given Solidity function is vulnerable and there was an explanation about the vulnerability. Give your patch recommendation based on that vulnerability. Just straight to the recommendation without any additional information.
    
    This is the vulnerable functions:
    ```solidity
    {code}
    ```
    
This is the vulnerability explanation: 
{explanation}

Recommendation:
""",
]

def make_conversational(examples):
    code = examples['vuln_code']
    desc = examples['vuln_explanation']
    recom = examples['vuln_recommendation']
    selected_system_prompt = random.choice(SYSTEM_PROMPT)
    selected_user_prompt = random.choice(PROMPTS) 
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(code=code, explanation=desc)}]
    return { "conversations" : conversation, "label": recom}

dataset = dataset.map(make_conversational)

test_dataset = dataset['test']

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True,)

Map: 100%|██████████| 662/662 [00:00<00:00, 7217.89 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 5764.92 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 6156.32 examples/s]


In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations'][:50]):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids=inputs, max_new_tokens=2048, use_cache=True,
                             temperature=0.1, min_p=0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)

    y_pred.append(decoded_output)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/50 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 50/50 [23:50<00:00, 28.62s/it]

Avg GPU util: 49.27%
Std GPU util: 6.66%
Peak GPU mem: 12.4092 GB





In [5]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 12.332 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_rec_zs_codellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_rec_zs_codellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "codellama13b_recommendation",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-17 10:55:47 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 07-17 10:55:47 [__init__.py:239] Automatically detected platform cuda.


2025-07-17 10:55:49,796	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:26<00:00,  8.69s/it]
Unsloth 2025.5.3 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'vuln_data_train.csv',
    'val': 'vuln_data_val.csv',
    'test': 'vuln_data_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random
from datasets import concatenate_datasets, DatasetDict

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world""",
]

PROMPTS = [
    """The given Solidity function is vulnerable and there was an explanation about the vulnerability. Give your patch recommendation based on that vulnerability. Just straight to the recommendation without any additional information.
    
    This is the vulnerable functions:
    ```solidity
    {code}
    ```
    
This is the vulnerability explanation: 
{explanation}

Recommendation:
""",
]

def make_conversational(examples):
    code = examples['vuln_code']
    desc = examples['vuln_explanation']
    recom = examples['vuln_recommendation']
    selected_system_prompt = random.choice(SYSTEM_PROMPT)
    selected_user_prompt = random.choice(PROMPTS) 
    conversation = [{"role": "system", "content": selected_system_prompt}, 
                    {"role": "user", "content": selected_user_prompt.format(code=code, explanation=desc)}]
    return { "conversations" : conversation, "label": recom}

dataset = dataset.map(make_conversational)

test_dataset = dataset['test']

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True,)

Map: 100%|██████████| 662/662 [00:00<00:00, 5464.62 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 4690.29 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4742.10 examples/s]
Map: 100%|██████████| 662/662 [00:00<00:00, 6752.14 examples/s]
Map: 100%|██████████| 142/142 [00:00<00:00, 6048.51 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 6040.56 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4929.81 examples/s]
Map: 100%|██████████| 143/143 [00:00<00:00, 4785.27 examples/s]


In [3]:
import time, threading, subprocess
from tqdm import tqdm

gpu_utils, gpu_mem = [], []
running = True

def poll():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().split(','))
        gpu_utils.append(u); gpu_mem.append(m / 1024)
        time.sleep(0.2)

thread = threading.Thread(target=poll, daemon=True)
thread.start()

from unsloth.chat_templates import get_chat_template
from tqdm import tqdm
import re
import time

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Define regex pattern to extract the assistant's response
pattern = r"\[/INST](.*?)</s>"

y_pred = []
row_times = []

for messages in tqdm(test_dataset['conversations'][:50]):
    # Start timer
    start_time = time.time()
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(input_ids=inputs, max_new_tokens=2048, use_cache=True,
                             temperature=0.1, min_p=0.1)
    decoded_output = tokenizer.batch_decode(outputs)[0]
    
    # Stop timer
    end_time = time.time()
    elapsed_time = end_time - start_time  # in seconds
    row_times.append(elapsed_time)

    y_pred.append(decoded_output)
    
running = False
thread.join()

import numpy as np
std_util = np.std(gpu_utils, ddof=1) 

print(f"Avg GPU util: {sum(gpu_utils)/len(gpu_utils):.2f}%")
print(f"Std GPU util: {std_util:.2f}%")
print(f"Peak GPU mem: {max(gpu_mem):.4f} GB")

  0%|          | 0/50 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 50/50 [1:02:36<00:00, 75.12s/it] 

Avg GPU util: 39.75%
Std GPU util: 4.06%
Peak GPU mem: 11.1221 GB





In [6]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 12.332 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_rec_ftcodellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_rec_ftcodellama13b.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [None]:
from unsloth import FastLanguageModel

max_seq_length = 32768 
dtype = None 
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "ours_recom",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!




INFO 07-17 15:22:25 [importing.py:53] Triton module has been replaced with a placeholder.


INFO 07-17 15:22:25 [__init__.py:239] Automatically detected platform cuda.


2025-07-17 15:22:27,471	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


==((====))==  Unsloth 2025.5.3: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.749 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.5.3 patched 36 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 2560, padding_idx=151654)
        (layers): ModuleList(
          (0-1): 2 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.2, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_p

In [None]:
from datasets import load_dataset

# Define the paths to your dataset files
data_files = {
    'train': 'vuln_data_train.csv',
    'val': 'vuln_data_val.csv',
    'test': 'vuln_data_test.csv'
}

# Load the dataset
dataset = load_dataset('csv', data_files=data_files)

import random
from datasets import concatenate_datasets, DatasetDict

random.seed(42)

SYSTEM_PROMPT = [
    """You are the smartest AI solidity smart contract security auditor in the world""",
    """You are the greatest AI assistant smart contract security auditor in the world""",
    """You are the best solidity smart contract security auditor in the world""",
    """You are the greatest AI assistant solidity security researcher in the world""",
    """You are the best AI solidity smart contract security auditor in the world"""
]

PROMPTS = [
    """The given Solidity function is vulnerable and there was an explanation about the vulnerability. Give your patch recommendation based on that vulnerability.
    
    This is the vulnerable functions:
    ```solidity
    {code}
    ```
    
This is the vulnerability explanation: 
{explanation}

Recommendation:
""",

    """The following Solidity function contains security vulnerabilities. Based on vulnerability explanation give the recommendation to patch the security issues.
    
    Functions that has vulnerability:
    ```solidity
    {code}
    ```
    
Vulnerability explanation of the function: 
{explanation}

Recommendation:
""",

    """The provided Solidity function has security vulnerabilities. Your task is to give recommendation to close the security issue.
    
    Vulnerable code:
    ```solidity
    {code}
    ```
    
Vulnerability explanation: 
{explanation}

Recommendation:
""",

    """The Solidity function below is vulnerable. Give your best recommendation to close that security issue based on vulnerability explanation.
    
    Functions that vulnerable:
    ```solidity
    {code}
    ```
    
Vulnerability explanation: 
{explanation}

Recommendation:
""",

    """Analyze the following Solidity function. It has security vulnerabilities that need to be patched. Give your best recommendation.
    
    Vulnerable functions:
    ```solidity
    {code}
    ```
    
The vulnerability explanation: 
{explanation}

Recommendation:
""",
]

def make_conversational_vote(examples, system_prompt, prompt):
    code = examples['vuln_code']
    desc = examples['vuln_explanation']
    recom = examples['vuln_recommendation']
    conversation = [{"role": "system", "content": system_prompt}, 
                    {"role": "user", "content": prompt.format(code=code, explanation=desc)}]
    return {"conversations": conversation, "label": recom}

# Generate 5 datasets using different prompts
datasets = []
for i in range(len(PROMPTS)):
    new_dataset = dataset.map(lambda ex: make_conversational_vote(ex, SYSTEM_PROMPT[i], PROMPTS[i]))
    new_dataset = new_dataset.remove_columns(["vuln_title", "severity"])
    datasets.append(new_dataset)

In [None]:
import time, threading, subprocess
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

gpu_utils, gpu_mem = [], []
running = True

def poll_gpu():
    while running:
        out = subprocess.check_output([
            "nvidia-smi",
            "--query-gpu=utilization.gpu,memory.used",
            "--format=csv,noheader,nounits"
        ])
        u, m = map(int, out.decode().strip().split(','))
        gpu_utils.append(u)
        gpu_mem.append(m / 1024)
        time.sleep(0.2)

monitor_thread = threading.Thread(target=poll_gpu, daemon=True)
monitor_thread.start()

template = """We have several vulnerability mitigation/recommendation for the smart contract security audit.
Your task is to produce a recommendation/mitigation steps that selects and integrates only the most accurate, logical, reasonable, coherent and credible recommendation from the existing analyses.
Make sure your final answer only contain the final recommendation/mitigation answer, without any additional commentary.

###Vulnerability Explanation
{explanation}

###Smart Contract Code
{code}

###Analysis 1
{output_0}

###Analysis 2
{output_1}

###Analysis 3
{output_2}

###Analysis 4
{output_3}

###Analysis 5:
{output_4}

Final Answer:
"""

generation_kwargs = dict(
    do_sample=True,
    temperature=0.6,
    top_k=20,
    top_p=0.95,
    min_p=0,
    repetition_penalty=1.2,
    max_new_tokens=8192,
    use_cache=True
)

proposal_times, aggregation_times, total_times = [], [], []
results = []

# num_rows = len(datasets[0]['test'])
num_rows = 50

for row_idx in tqdm(range(num_rows), desc="End-to-End Pipeline"):
    t_total_start = time.time()

    proposal_outputs = []
    t0 = time.time()

    for dataset in datasets:
        example = dataset['test'][row_idx]
        proposal_input = tokenizer.apply_chat_template(
            example['conversations'],
            tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to("cuda")
        proposal_output = model.generate(
            input_ids=proposal_input,
            **generation_kwargs
        )
        decoded_output = tokenizer.decode(
            proposal_output[0], skip_special_tokens=True
        )
        proposal_outputs.append(decoded_output)

    dt_prop = time.time() - t0
    proposal_times.append(dt_prop)

    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

    t1 = time.time()
    aggregation_prompt = template.format(
        code=example["vuln_code"],
        explanation=example["vuln_explanation"],
        output_0=proposal_outputs[0],
        output_1=proposal_outputs[1],
        output_2=proposal_outputs[2],
        output_3=proposal_outputs[3],
        output_4=proposal_outputs[4],
    )
    aggregation_messages = [
        {"role": "system", "content": "You are an expert in smart-contract security audits."},
        {"role": "user", "content": aggregation_prompt},
    ]
    aggregation_input = tokenizer.apply_chat_template(
        aggregation_messages,
        tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    with model.disable_adapter():
        aggregation_output = model.generate(
            input_ids=aggregation_input,
            **generation_kwargs
        )

    aggregation_decoded = tokenizer.decode(
        aggregation_output[0], skip_special_tokens=True
    )
    dt_aggr = time.time() - t1
    aggregation_times.append(dt_aggr)

    total_times.append(time.time() - t_total_start)
    results.append({
        "vuln_code": example["vuln_code"], 
        "vuln_explanation": example["vuln_explanation"], 
        "proposal_0": proposal_outputs[0],
        "proposal_1": proposal_outputs[1],
        "proposal_2": proposal_outputs[2],
        "proposal_3": proposal_outputs[3],
        "proposal_4": proposal_outputs[4],
        "aggregated": aggregation_decoded,
        "time_proposal": dt_prop,
        "time_aggregation": dt_aggr,
        "time_total": total_times[-1]
    })

running = False
monitor_thread.join()

df = pd.DataFrame(results)
# df.to_csv("_resource_data/recom_result.csv", index=False)

print("\n=== Benchmark Summary ===")
print("\nGPU Usage:")
print(f"Avg GPU Utilization: {np.mean(gpu_utils):.2f}% ±{np.std(gpu_utils, ddof=1):.2f}%")
print(f"Peak VRAM Usage: {np.max(gpu_mem):.4f} GB")


End-to-End Pipeline:   0%|                                                                             | 0/50 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



End-to-End Pipeline:   2%|█▎                                                                | 1/50 [04:41<3:49:39, 281.22s/it]


End-to-End Pipeline:   4%|██▋                                                               | 2/50 [09:04<3:36:20, 270.42s/it]


End-to-End Pipeline:   6%|███▉                                                              | 3/50 [12:25<3:07:14, 239.03s/it]


End-to-End Pipeline:   8%|█████▎                                                            | 4/50 [17:07<3:16:13, 255.95s/it]


End-to-End Pipeline:  10%|██████▌                                                           | 5/50 [21:25<3:12:23, 256.53s/it]


End-to-End Pipeline:  12%|███████▉                                                          | 6/50 [24:51<2:55:30, 239.33s/it]


End-to-End Pipeline:  14%|█████████▏                                                        | 7/50 [29:12<2:56:47, 246.69s/it]


End-to-End Pipeline:  16%|██████████▌                                                       | 8/50 [34:16<3:05:16, 264.68s/it]


End-to-End Pipeline:  18%|███████████▉                                                      | 9/50 [38:42<3:01:08, 265.09s/it]


End-to-End Pipeline:  20%|█████████████                                                    | 10/50 [43:11<2:57:34, 266.36s/it]


End-to-End Pipeline:  22%|██████████████▎                                                  | 11/50 [47:41<2:53:54, 267.56s/it]


End-to-End Pipeline:  24%|███████████████▌                                                 | 12/50 [51:43<2:44:32, 259.81s/it]


End-to-End Pipeline:  26%|████████████████▉                                                | 13/50 [55:17<2:31:36, 245.84s/it]


End-to-End Pipeline:  28%|██████████████████▏                                              | 14/50 [58:57<2:22:52, 238.12s/it]


End-to-End Pipeline:  30%|██████████████████▉                                            | 15/50 [1:02:33<2:14:57, 231.34s/it]


End-to-End Pipeline:  32%|████████████████████▏                                          | 16/50 [1:06:06<2:07:58, 225.83s/it]


End-to-End Pipeline:  34%|█████████████████████▍                                         | 17/50 [1:08:31<1:50:46, 201.42s/it]


End-to-End Pipeline:  36%|██████████████████████▋                                        | 18/50 [1:12:25<1:52:39, 211.23s/it]


End-to-End Pipeline:  38%|███████████████████████▉                                       | 19/50 [1:15:55<1:49:04, 211.11s/it]


End-to-End Pipeline:  40%|█████████████████████████▏                                     | 20/50 [1:19:49<1:48:52, 217.75s/it]


End-to-End Pipeline:  42%|██████████████████████████▍                                    | 21/50 [1:24:04<1:50:38, 228.92s/it]


End-to-End Pipeline:  44%|███████████████████████████▋                                   | 22/50 [1:26:31<1:35:27, 204.54s/it]


End-to-End Pipeline:  46%|████████████████████████████▉                                  | 23/50 [1:29:27<1:28:07, 195.85s/it]


End-to-End Pipeline:  48%|██████████████████████████████▏                                | 24/50 [1:32:17<1:21:27, 187.99s/it]


End-to-End Pipeline:  50%|███████████████████████████████▌                               | 25/50 [1:36:06<1:23:28, 200.32s/it]


End-to-End Pipeline:  52%|████████████████████████████████▊                              | 26/50 [1:39:17<1:19:05, 197.73s/it]


End-to-End Pipeline:  54%|██████████████████████████████████                             | 27/50 [1:43:24<1:21:25, 212.43s/it]


End-to-End Pipeline:  56%|███████████████████████████████████▎                           | 28/50 [1:47:43<1:23:01, 226.44s/it]


End-to-End Pipeline:  58%|████████████████████████████████████▌                          | 29/50 [1:51:23<1:18:35, 224.57s/it]


End-to-End Pipeline:  60%|█████████████████████████████████████▊                         | 30/50 [1:55:18<1:15:49, 227.45s/it]


End-to-End Pipeline:  62%|███████████████████████████████████████                        | 31/50 [1:59:50<1:16:15, 240.84s/it]


End-to-End Pipeline:  64%|████████████████████████████████████████▎                      | 32/50 [2:05:52<1:23:11, 277.29s/it]


End-to-End Pipeline:  66%|█████████████████████████████████████████▌                     | 33/50 [2:11:49<1:25:21, 301.29s/it]


End-to-End Pipeline:  68%|██████████████████████████████████████████▊                    | 34/50 [2:16:01<1:16:23, 286.48s/it]


End-to-End Pipeline:  70%|████████████████████████████████████████████                   | 35/50 [2:20:51<1:11:50, 287.39s/it]


End-to-End Pipeline:  72%|█████████████████████████████████████████████▎                 | 36/50 [2:25:34<1:06:45, 286.13s/it]


End-to-End Pipeline:  74%|████████████████████████████████████████████████                 | 37/50 [2:29:44<59:38, 275.24s/it]


End-to-End Pipeline:  76%|█████████████████████████████████████████████████▍               | 38/50 [2:33:24<51:45, 258.76s/it]


End-to-End Pipeline:  78%|██████████████████████████████████████████████████▋              | 39/50 [2:37:56<48:11, 262.85s/it]


End-to-End Pipeline:  80%|████████████████████████████████████████████████████             | 40/50 [2:41:00<39:51, 239.14s/it]


End-to-End Pipeline:  82%|█████████████████████████████████████████████████████▎           | 41/50 [2:44:57<35:44, 238.28s/it]


End-to-End Pipeline:  84%|██████████████████████████████████████████████████████▌          | 42/50 [2:49:42<33:39, 252.49s/it]


End-to-End Pipeline:  86%|███████████████████████████████████████████████████████▉         | 43/50 [2:54:46<31:15, 267.97s/it]


End-to-End Pipeline:  88%|█████████████████████████████████████████████████████████▏       | 44/50 [2:58:50<26:04, 260.67s/it]


End-to-End Pipeline:  90%|██████████████████████████████████████████████████████████▌      | 45/50 [3:03:38<22:25, 269.05s/it]


End-to-End Pipeline:  92%|███████████████████████████████████████████████████████████▊     | 46/50 [3:06:35<16:04, 241.22s/it]


End-to-End Pipeline:  94%|█████████████████████████████████████████████████████████████    | 47/50 [3:10:14<11:43, 234.56s/it]


End-to-End Pipeline:  96%|██████████████████████████████████████████████████████████████▍  | 48/50 [3:13:02<07:09, 214.60s/it]


End-to-End Pipeline:  98%|███████████████████████████████████████████████████████████████▋ | 49/50 [3:18:30<04:08, 248.76s/it]


End-to-End Pipeline: 100%|█████████████████████████████████████████████████████████████████| 50/50 [3:22:17<00:00, 242.22s/it]


End-to-End Pipeline: 100%|█████████████████████████████████████████████████████████████████| 50/50 [3:22:17<00:00, 242.75s/it]





=== Benchmark Summary ===

GPU Usage:
Avg GPU Utilization: 36.27% ±12.16%
Peak VRAM Usage: 9.2979 GB


In [None]:
import torch

used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"Peak reserved memory = {used_memory} GB.")

Peak reserved memory = 5.736 GB.


In [None]:
import pickle
    
with open('_resource_data/gpu_utils_rec_ours.pkl', 'wb') as file:
    pickle.dump(gpu_utils, file)
    
with open('_resource_data/gpu_mem_rec_ours.pkl', 'wb') as file:
    pickle.dump(gpu_mem, file)

In [5]:
import pickle
import pandas as pd
import numpy as np

models = [
    "rec_zs_codellama34b",
    "rec_zs_codellama13b",
    "rec_ftcodellama13b",
    "rec_ours",
]

data_dir = "_resource_data"
summary_data = {
    "model": [],
    "avg_gpu_util": [],
    "std_gpu_util": [],
    "max_gpu_mem": []
}

sampling_interval = 0.2  # seconds between GPU samples

for model in models:
    with open(f"{data_dir}/gpu_utils_{model}.pkl", "rb") as f:
        gpu_utils = pickle.load(f)
    with open(f"{data_dir}/gpu_mem_{model}.pkl", "rb") as f:
        gpu_mem = pickle.load(f)
    
    avg_gpu_util = np.mean(gpu_utils)
    std_gpu_util = np.std(gpu_utils, ddof=1)
    max_gpu_mem = np.max(gpu_mem)
    
    summary_data["model"].append(model)
    summary_data["avg_gpu_util"].append(float(f"{avg_gpu_util:.2f}"))
    summary_data["std_gpu_util"].append(float(f"{std_gpu_util:.2f}"))
    summary_data["max_gpu_mem"].append(float(f"{max_gpu_mem:.4f}"))

df_summary = pd.DataFrame(summary_data)
df_summary.sort_values(by="model", inplace=True)
df_summary.reset_index(drop=True, inplace=True)

df_summary

Unnamed: 0,model,avg_gpu_util,std_gpu_util,max_gpu_mem
0,rec_ftcodellama13b,39.75,4.06,11.1221
1,rec_ours,36.27,12.16,9.2979
2,rec_zs_codellama13b,49.27,6.66,12.4092
3,rec_zs_codellama34b,88.55,7.01,19.7354
