In [1]:
import torch
import gc
import pandas as pd
import pickle
import json

In [10]:

gc.collect()

torch.cuda.empty_cache()

## Merge method_change table into df

In [2]:
with open('./dataset/test_pickles/df.pkl', 'rb') as file:
    df = pickle.load(file)
    df = df.dropna()
    df = df[(df['token_count'] < 20000)]

In [3]:
len(df)

51496

In [4]:
def extract_cve_description(cve_list):
    if not cve_list:
        return "No CVE description available."
    
    for cve in cve_list:
        if cve.get("lang") == "en":
            return cve.get("value", "No CVE description available.")
    
    return cve_list[0].get("value", "No CVE description available.")

def format_diff_deleted(diff_deleted):
    if not diff_deleted:
        return "No specific deleted lines available."
    return "\n".join([f"{line[1]}" for line in diff_deleted])

def generate_prompts(df, output_file="prompts.txt"):
    prompts = []
    for _, row in df.iterrows():
        prompt = f"""
### Instruction:
You are a security expert tasked with identifying vulnerabilities in a given code. Carefully analyze the code using CWE (Common Weakness Enumeration) descriptions and determine if it contains any vulnerabilities step by step.

### Input:
Here is a code snippet that may contain a security vulnerability:

{row['vuln_code']}


### Response:
Name of function or method that could be exploited: {row['method_name']}
The part of the code that could be exploited: 
{format_diff_deleted(row['diff_deleted'])}

The identified vulnerability corresponds to {row['cwe_id']}, which is known as "{row['cwe_name']}." This type of vulnerability is described as follows: {row['cwe_description']}

The reason this code is classified under {row['cwe_id']} is that {extract_cve_description(row['cve_description'])}
        """
        prompts.append(prompt)
    return prompts

prompts = generate_prompts(df)

## Unsloth

In [5]:
from unsloth import FastLanguageModel
max_seq_length = 25000  # 任意の値を選択可能。RoPEスケーリングは内部で自動的にサポート
dtype = torch.bfloat16  # 自動検出の場合はNone。Tesla T4、V100の場合はFloat16、Ampere以降の場合はBfloat16
load_in_4bit = True  # メモリ使用量を削減するために4ビット量子化を使用。Falseも可能

# 4倍高速なダウンロードとOOMの回避のためにサポートされている4ビット事前量子化モデル
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # 新しいMistral v3は2倍高速
    "unsloth/codellama-7b-bnb-4bit",
    "unsloth/phi-4-bnb-4bit",  # Llama-3 15兆トークンモデルは2倍高速
    "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
]  # その他のモデルについては、https://huggingface.co/unsloth をご覧ください

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=fourbit_models[0],
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # meta-llama/Llama-2-7b-hfのようなゲート付きモデルを使用する場合は、これを使用
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"], 
    use_rslora=True,
    use_gradient_checkpointing="unsloth"
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3060 Ti. Max memory: 7.772 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.2.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
from datasets import Dataset

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(prompts):
    return { "text": [prompt + EOS_TOKEN for prompt in prompts] }

# Format the dataset
formatted_data = formatting_prompts_func(prompts)

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(formatted_data)

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=1,
    packing=False,  # 短いシーケンスの場合、トレーニングを5倍高速化できます
    args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        # num_train_epochs = 1,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=1234,
        output_dir="outputs",
    ),
)


Applying chat template to train dataset: 100%|██████████| 51496/51496 [00:01<00:00, 28693.06 examples/s]
Tokenizing train dataset: 100%|██████████| 51496/51496 [08:09<00:00, 105.16 examples/s] 
Tokenizing train dataset: 100%|██████████| 51496/51496 [03:29<00:00, 245.85 examples/s]


In [11]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,496 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


OutOfMemoryError: CUDA out of memory. Tried to allocate 784.00 MiB. GPU 0 has a total capacity of 7.77 GiB of which 411.31 MiB is free. Including non-PyTorch memory, this process has 7.30 GiB memory in use. Of the allocated memory 7.12 GiB is allocated by PyTorch, and 33.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)