In [1]:
import torch
import gc
import pandas as pd
import pickle
import json

In [2]:
gc.collect()

torch.cuda.empty_cache()

## Merge method_change table into df

In [3]:
with open('./dataset/test_pickles_bak/df.pkl', 'rb') as file:
    df = pickle.load(file)
    df = df.dropna()
    df = df[(df['token_count'] < 12000)]
    # df = df[:30000]

In [4]:
df

Unnamed: 0,file_change_id,programming_language,cwe_id,cwe_name,non_vuln_code,vuln_code,cwe_description,cve_description,token_count,diff_added,diff_deleted,vuln_code_num_lines,non_vuln_code_num_lines,class,method_name
0,41461181100456,C,CWE-252,Unchecked Return Value,/* Map in a shared object's segments from the ...,/* Map in a shared object's segments from the ...,The product does not check the return value fr...,"[{'lang': 'en', 'value': 'manual/search.texi i...",6592.0,"[(152, const char *const start = name;), (15...","[(158, /* $ORIGIN is not expanded for SU...",1449,1467,CWE-754,_dl_dst_count
1,41461181100456,C,CWE-252,Unchecked Return Value,/* Map in a shared object's segments from the ...,/* Map in a shared object's segments from the ...,The product does not check the return value fr...,"[{'lang': 'en', 'value': 'manual/search.texi i...",6592.0,"[(152, const char *const start = name;), (15...","[(158, /* $ORIGIN is not expanded for SU...",1449,1467,CWE-754,expand_dynamic_string_token
2,41461181100456,C,CWE-252,Unchecked Return Value,/* Map in a shared object's segments from the ...,/* Map in a shared object's segments from the ...,The product does not check the return value fr...,"[{'lang': 'en', 'value': 'manual/search.texi i...",6592.0,"[(152, const char *const start = name;), (15...","[(158, /* $ORIGIN is not expanded for SU...",1449,1467,CWE-754,_dl_dst_substitute
3,41461181100456,C,CWE-252,Unchecked Return Value,/* Map in a shared object's segments from the ...,/* Map in a shared object's segments from the ...,The product does not check the return value fr...,"[{'lang': 'en', 'value': 'manual/search.texi i...",6592.0,"[(152, const char *const start = name;), (15...","[(158, /* $ORIGIN is not expanded for SU...",1449,1467,CWE-754,_dl_dst_count
4,41461181100456,C,CWE-252,Unchecked Return Value,/* Map in a shared object's segments from the ...,/* Map in a shared object's segments from the ...,The product does not check the return value fr...,"[{'lang': 'en', 'value': 'manual/search.texi i...",6592.0,"[(152, const char *const start = name;), (15...","[(158, /* $ORIGIN is not expanded for SU...",1449,1467,CWE-754,expand_dynamic_string_token
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193645,233098165980480,Python,CWE-79,Improper Neutralization of Input During Web Pa...,"from flask import blueprints, request, jsonify...","from flask import blueprints, request, jsonify...",The product does not neutralize or incorrectly...,"[{'lang': 'en', 'value': 'A stored Cross-Site ...",371.0,"[(2, from werkzeug.utils import secure_filenam...","[(21, manager.create_project(project_name)...",54,63,CWE-74,create_project
193646,168198219170456,Python,CWE-73,External Control of File Name or Path,import json\nimport os\nfrom datetime import d...,import json\nimport os\nfrom datetime import d...,The product allows user input to control or in...,"[{'lang': 'en', 'value': 'A local file read vu...",1284.0,"[(177, )]","[(177, ), (178, def get_project_files(self...",201,177,CWE-20,get_project_files
193647,168198219170456,Python,CWE-22,Improper Limitation of a Pathname to a Restric...,import json\nimport os\nfrom datetime import d...,import json\nimport os\nfrom datetime import d...,The product uses external input to construct a...,"[{'lang': 'en', 'value': 'A directory traversa...",1284.0,"[(177, )]","[(177, ), (178, def get_project_files(self...",201,177,CWE-668,get_project_files
193648,168198219170456,Python,CWE-346,Origin Validation Error,import json\nimport os\nfrom datetime import d...,import json\nimport os\nfrom datetime import d...,The product does not properly verify that the ...,"[{'lang': 'en', 'value': 'A CORS misconfigurat...",1284.0,"[(177, )]","[(177, ), (178, def get_project_files(self...",201,177,CWE-346,get_project_files


In [5]:
def extract_cve_description(cve_list):
    if not cve_list:
        return "No CVE description available."
    
    for cve in cve_list:
        if cve.get("lang") == "en":
            return cve.get("value", "No CVE description available.")
    
    return cve_list[0].get("value", "No CVE description available.")

def format_diff_deleted(diff_deleted):
    if not diff_deleted:
        return "No specific deleted lines available."
    return "\n".join([f"{line[1]}" for line in diff_deleted])

def generate_prompts(df, output_file="prompts.txt"):
    prompts = []
    for _, row in df.iterrows():
        prompt = f"""
### Instruction:
You are a security expert tasked with identifying vulnerabilities in a given code. Carefully analyze the code using CWE (Common Weakness Enumeration) descriptions and determine if it contains any vulnerabilities step by step.

### Input:
Here is a code snippet that may contain a security vulnerability:

{row['vuln_code']}


### Response:
Name of function or method that could be exploited: {row['method_name']}
The part of the code that could be exploited: 
{format_diff_deleted(row['diff_deleted'])}

The identified vulnerability corresponds to {row['cwe_id']}, which is known as "{row['cwe_name']}." This type of vulnerability is described as follows: {row['cwe_description']}

The reason this code is classified under {row['cwe_id']} is that {extract_cve_description(row['cve_description'])}
        """
        prompts.append(prompt)
    return prompts

prompts = generate_prompts(df)

## Unsloth

In [6]:
from unsloth import FastLanguageModel
max_seq_length = 18000  # 任意の値を選択可能。RoPEスケーリングは内部で自動的にサポート
dtype = torch.bfloat16  # 自動検出の場合はNone。Tesla T4、V100の場合はFloat16、Ampere以降の場合はBfloat16
load_in_4bit = True  # メモリ使用量を削減するために4ビット量子化を使用。Falseも可能

# 4倍高速なダウンロードとOOMの回避のためにサポートされている4ビット事前量子化モデル
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # 新しいMistral v3は2倍高速
    "unsloth/codellama-7b-bnb-4bit",
    "unsloth/phi-4-bnb-4bit",  # Llama-3 15兆トークンモデルは2倍高速
    "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
]  # その他のモデルについては、https://huggingface.co/unsloth をご覧ください

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=fourbit_models[0],
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # meta-llama/Llama-2-7b-hfのようなゲート付きモデルを使用する場合は、これを使用
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: GRID A100-10C. Max memory: 9.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"], 
    use_rslora=True,
    use_gradient_checkpointing="unsloth"
)

Unsloth 2025.2.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
from datasets import Dataset

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(prompts):
    return { "text": [prompt + EOS_TOKEN for prompt in prompts] }

# Format the dataset
formatted_data = formatting_prompts_func(prompts)

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(formatted_data)

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=4,
    packing=False,  # 短いシーケンスの場合、トレーニングを5倍高速化できます
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=5,
        max_steps=60,
        # num_train_epochs = 1,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_torch",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=1234,
        output_dir="outputs",
    ),
)


Converting train dataset to ChatML (num_proc=4):   0%|          | 0/46871 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=4):   0%|          | 0/46871 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=4):   0%|          | 0/46871 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=4):   0%|          | 0/46871 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 46,871 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 422.00 MiB. GPU 0 has a total capacity of 10.00 GiB of which 306.50 MiB is free. Including non-PyTorch memory, this process has 8.80 GiB memory in use. Of the allocated memory 7.96 GiB is allocated by PyTorch, and 351.01 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)