In [1]:
import torch
import gc
import pandas as pd
import pickle
import numpy as np
import json
import tiktoken

In [3]:
max_token = 6000

# NOTE: unpickling runs whatever code the file embeds, so this path must only
# ever point at a dataset produced by a trusted pipeline.
with open('./dataset/df_method_final.pkl', 'rb') as file:
    df = pickle.load(file)

# Drop rows with any missing value, then keep only the samples whose
# 'token_count' does not exceed max_token.
df = (
    df
    .dropna()
    .loc[lambda frame: frame['token_count'] <= max_token]
)

In [4]:
# Sanity check: largest token count left after filtering (should not exceed max_token).
df['token_count'].max()

5998

In [5]:
def extract_cve_description(cve_list):
    """Pick one description text out of a list of CVE description entries.

    Args:
        cve_list: Sequence of dict-like entries that are queried for the
            keys "lang" and "value". May be empty or None.

    Returns:
        The "value" of the first entry whose "lang" equals "en"; when no
        entry is English, the "value" of the first entry. The placeholder
        "No CVE description available." is returned when the list is
        empty/None or the chosen entry has no "value" key.
    """
    fallback = "No CVE description available."
    if not cve_list:
        return fallback

    # Prefer the first English entry; entries are dict-like so None is a safe sentinel.
    chosen = next((entry for entry in cve_list if entry.get("lang") == "en"), None)
    if chosen is None:
        chosen = cve_list[0]
    return chosen.get("value", fallback)

def format_diff_deleted(diff_deleted):
    """Render the deleted lines of a diff as one newline-separated string.

    Args:
        diff_deleted: Sequence of indexable entries; only element [1] of
            each entry is used. May be empty or None.

    Returns:
        Element [1] of every entry, formatted as text and joined with
        "\\n", or "No specific deleted lines available." when the input is
        empty/None.
    """
    if diff_deleted:
        rendered = []
        for entry in diff_deleted:
            # format() with no spec is exactly what an f-string placeholder does.
            rendered.append(format(entry[1]))
        return "\n".join(rendered)
    return "No specific deleted lines available."

def generate_prompts(df, model):
    """Build one vulnerable and one non-vulnerable prompt per row of ``df``.

    Args:
        df: DataFrame whose rows provide the columns 'vuln_code',
            'cwe_id', 'cwe_name', 'cwe_description', 'cve_description',
            'vuln_part' and 'non_vuln_code'.
        model: Accepted for the caller's convenience; it is not referenced
            when building the prompts.

    Returns:
        A list of strings: the prompts for every row's vulnerable code (in
        row order) followed by the prompts for every row's non-vulnerable
        code (in row order).
    """
    # Instruction header shared by both prompt variants. Trailing spaces are
    # written inside the literals so whitespace-stripping editors cannot
    # silently alter the prompt text.
    preamble = (
        "You are a highly skilled code analysis assistant specialized in identifying security vulnerabilities in software code.\n"
        "USER: Your task is to analyze the provided code snippet, identify potential security weaknesses, and determine if the code is vulnerable based on the Common Weakness Enumeration (CWE) framework. Carefully analyze the code step by step, and if vulnerabilities are found, provide the following:\n"
        "The name of the function or code segment that is vulnerable.\n"
        "The specific part of the code that could be vulnerable.\n"
        "The corresponding CWE ID and title.\n"
        "A brief explanation of the vulnerability and its potential impact.\n"
        "If no vulnerabilities are found, state that the code appears secure with respect to the CWE framework. \n"
        "Here is the code:\n"
    )
    # Fixed assistant answer appended to every non-vulnerable sample.
    clean_verdict = (
        "ASSISTANT: Upon careful analysis of the given code, I found that each function performs its intended task without introducing any vulnerabilities.\n"
        "As a result, no vulnerabilities were identified, and no CWE classification is necessary. "
    )

    def vulnerable_prompt(sample):
        # Header, code, CWE classification, CVE description, then the exploitable part.
        return (
            preamble
            + f"{sample['vuln_code']}\n"
            + f"ASSISTANT: Let's analyse the code. This code is classified under {sample['cwe_id']}, which is known as \"{sample['cwe_name']}\". This type of vulnerability is described as follows: {sample['cwe_description']}.\n"
            + f"{extract_cve_description(sample['cve_description'])}\n"
            + "Here are the parts of the code that could be exploited: \n"
            + f"{sample['vuln_part']}"
        )

    def clean_prompt(sample):
        # Header, code, then the fixed "no vulnerabilities" answer.
        return preamble + f"{sample['non_vuln_code']}\n" + clean_verdict

    # All vulnerable prompts come first, then all non-vulnerable prompts.
    vulnerable = [vulnerable_prompt(sample) for _, sample in df.iterrows()]
    clean = [clean_prompt(sample) for _, sample in df.iterrows()]
    return vulnerable + clean

# NOTE(review): `models` is not defined in any cell shown here — presumably a
# dict keyed by model name from an earlier cell; confirm it exists on a fresh kernel.
# generate_prompts never reads `model`, so each file receives the same prompt list.
for model in models.keys():
    prompts = generate_prompts(df, model)
    prompt_path = f'./dataset/{model}_prompts.pkl'
    # One pickle per model name, written next to the source dataset.
    with open(prompt_path, 'wb') as f:
        pickle.dump(prompts, f)

In [6]:
# Spot-check: print one generated prompt (index 100) in full.
print(prompts[100])

You are a highly skilled code analysis assistant specialized in identifying security vulnerabilities in software code.
USER: Your task is to analyze the provided code snippet, identify potential security weaknesses, and determine if the code is vulnerable based on the Common Weakness Enumeration (CWE) framework. Carefully analyze the code step by step, and if vulnerabilities are found, provide the following:
The name of the function or code segment that is vulnerable.
The specific part of the code that could be vulnerable.
The corresponding CWE ID and title.
A brief explanation of the vulnerability and its potential impact.
If no vulnerabilities are found, state that the code appears secure with respect to the CWE framework. 
Here is the code:
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H

#include <asm/page.h>
#include <asm/e820.h>

#include <asm/pgtable_types.h>

/*
 * Macro to mark a page protection value as UC-
 */
#define pgprot_noncached(prot)					\
	((boot_cpu_data.x86 >

In [7]:
# Load tiktoken's "cl100k_base" encoding; recalculate_token_count uses it to tokenize text.
encoding = tiktoken.get_encoding("cl100k_base")

def recalculate_token_count(text):
    """Count the tokens the module-level ``encoding`` produces for ``text``.

    Args:
        text: String to tokenize. Falsy values (empty string, None) are
            not passed to the encoder.

    Returns:
        The number of tokens returned by ``encoding.encode(text)``, or 0
        for falsy input.
    """
    if not text:
        return 0
    token_ids = encoding.encode(text)
    return len(token_ids)

# Largest prompt size, in tokens, across all generated prompts.
# The built-in max() tokenizes each prompt exactly once (the previous manual
# scan encoded prompts[0] twice) and leaves no scratch variable behind;
# default=0 keeps the cell from raising when `prompts` is empty.
big = max((recalculate_token_count(prompt) for prompt in prompts), default=0)

In [8]:
# Total number of prompts in `prompts` (generate_prompts emits two per DataFrame row).
len(prompts)

13002