In [3]:
import json
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import subprocess
import shutil
import openai
import json
import time
import hashlib
import pickle
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    retry_if_exception_type
)  # for exponential backoff

import utils.DataUtils as DataUtils
import utils.GPTUtils as GPTUtils
import importlib
from IPython.display import display, HTML

# importlib.reload(GPTUtils)


# I. Settings

## 1. Constants and configs

In [4]:
# Constants
CONFIG_DATA_ORIGINAL = 'data_original'
CONFIG_DATA_WITHOUT_COMMENTS = 'data_without_comments'
CONFIG_DATA_WITHOUT_COMMENTS_SINGLE_BEFORE_CHANGE = 'data_without_comments_single_before_change'
CONFIG_DATA_WITH_COMMENTS_SINGLE = 'data_with_comments_single'
CONFIG_DATA_WITH_COMMENTS_SINGLE_BEFORE_CHANGE = 'data_with_comments_single_before_change'

NO_BALANCING_MODE = 0
UNDERSAMPLING_MODE = 1
OVERSAMPLING_MODE = 2

NUM_EXPERIMENTS = 10
VAL_TEST_RATIO = 0.4
languages = ['kotlin', 'rust', 'swift']

configurations = {
    CONFIG_DATA_WITH_COMMENTS_SINGLE_BEFORE_CHANGE: {
        'path': "data_with_comments_single_before_change",
        'title': "With Comments + Single Func + before change"
    },    
    CONFIG_DATA_WITHOUT_COMMENTS_SINGLE_BEFORE_CHANGE: {
        'path': "data_single",
        'title': "Without Comments + Single Func + before change"
    }
}

In [29]:
# setting OpenAI API Key
openai.api_key = ""

In [5]:
def display_dataframe_with_scroll(df):
    # Render dataframe as HTML table
    df_html = df.to_html()

    # Add scrollbar to table
    table_html = f"""
    <div style="max-height:400px;overflow-y:auto;overflow-x:auto;">
        {df_html}
    </div>
    """
    
    # Display the table
    display(HTML(table_html))


## 2. Prompt Template

In [6]:
# Line-level template
line_level_template = """List the flawed lines with the format:
Line number {line}: `{code}`"""

#1. Testing template
testing_template = """{new_source_code}

Please analyze the following {language} source code and determine if it is vulnerable. Answering in the format:
1. Analysis: Vulnerable/Non-Vulnerable"""

# 2. System content template
system_content_template = "You are an expert {language} programmer"




# 3. Few shot learning template
few_shot_prompt_template = """Here are some examples of {language} source code, each followed by an analysis of its vulnerability status.
{code_demonstrations}

Now, consider the following {language} source code:

""" + testing_template

code_demonstration_template = """### Example {i}
### Start target code
{func_before}
### End of target code

### Answer
1. Analysis: {analysis}
### End of Answer

### End of Example {i}
"""

few_shot_learning_template = {
    'prompt_template': few_shot_prompt_template,
    'code_demonstration_template': code_demonstration_template
}




# 4. Fine-tuning template
# fine_tuning_user_content_template = """{code_demonstrations}
# Please analyze the following {language} source code and determine if it is vulnerable. Answering in the format:
# 1. Analyse: Vulnerable/Non-Vulnerable"""

fine_tuning_assistant_content_template = """1. Analysis: {analysis}"""
#2. Explanation: {explanation}"""

fine_tuning_template = {
    'user_content_template': testing_template,
    'assistant_content_template': fine_tuning_assistant_content_template
}




# Create template
template = {
    "testing_template":testing_template,
    "system_content_template":system_content_template,
    "few_shot_learning_template":few_shot_learning_template,
    "fine_tuning_template":fine_tuning_template,
    "line_level_template":line_level_template
}

# II. Experiments

## 1. Read data

#### 1.1. Create iterations (only the first time)

In [None]:
def load_data(lang, details):
    """Load data for a given language and configuration."""
    print(f"Language: {lang.upper()} | Configuration: {details['title']}")
    data_path = f"data/{lang}/{details['path']}.csv"
    data = pd.read_csv(data_path)
    DataUtils.check_target_distribution(data, 'target')
    print()
    return data

def get_target_distribution(data, target_column="target"):
    """Get target distribution as a dictionary."""
    distribution = data[target_column].value_counts().to_dict()
    return distribution
        
def generate_all_experiments(languages, data_dicts, number_of_experiments=NUM_EXPERIMENTS):
    """Generate experiments for all languages and return the consolidated results."""
    data_experiments = {}
    distribution_data = []  # List to store distributions
    
    for lang in languages:
        for config_name, dataset in data_dicts[lang].items():
            
            for experiment in range(number_of_experiments):
                if experiment == 3: # Skip case when f1-score is 0
                    random_state = 42
                else:
                    random_state = experiment
                    
                results = DataUtils.process_language_data_with_experiment_number(
                    language=lang,
                    data=dataset,
                    experiment_id=experiment,
                    random_state=random_state,
                    configuration_name=config_name,
                    val_test_ratio=VAL_TEST_RATIO
                )

                for name in ['train', 'val', 'test']:
                    dist = get_target_distribution(results[name])
                    dist["dataset"] = name
                    dist["language"] = lang
                    dist["configuration"] = config_name
                    dist["experiment"] = experiment
                    distribution_data.append(dist)
                
                key = f"{lang}_{config_name}_{experiment}"
                data_experiments[key] = results
            
    # Convert distribution data to a DataFrame and print
    df_distributions = pd.DataFrame(distribution_data)
    display_dataframe_with_scroll(df_distributions)
    
    return data_experiments

# Load data
data_dicts = {}
for lang in languages:
    data_dicts[lang] = {
        config_name: load_data(lang, details) for config_name, details in configurations.items()
    }

# Process Experiments
data_experiments = generate_all_experiments(languages, data_dicts, number_of_experiments=10)

#### 1.2. Load data for 10 iterations (No need to create iterations again)

In [9]:
configurations = {
    CONFIG_DATA_WITH_COMMENTS_SINGLE_BEFORE_CHANGE: {
        'path': "data_with_comments_single_before_change",
        'title': "With Comments + Single Func + before change"
    },    
    CONFIG_DATA_WITHOUT_COMMENTS_SINGLE_BEFORE_CHANGE: {
        'path': "data_single",
        'title': "Without Comments + Single Func + before change"
    }
}

def load_data_experiments(languages, configurations, number_of_experiments=10):
    data_experiments = {}
    
    for lang in languages:
        
        for config_name in configurations.keys():
            
            for experiment in range(number_of_experiments):
                results = DataUtils.process_language_data_with_experiment_number(
                    language=lang,
                    configuration_name=config_name,
                    experiment_id=experiment,
                    data=None,
                    random_state=None,
                    read_only=True
                )
                key = f"{lang}_{config_name}_{experiment}"
                data_experiments[key] = results
    
    return data_experiments

data_experiments = load_data_experiments(languages, configurations)


## 2. Execution

### 2.1. GPT fine-tuning

Operations:
1. Prepare Data
2. Upload File
3. Create Fine-Tuning Job
4. Poll for Job Status
5. Evaluate Model

Helper functions

In [23]:
def get_suffix_name(balance_mode):
    
    if balance_mode == NO_BALANCING_MODE:
        suffix_name = "_no_balancing"
    elif balance_mode == UNDERSAMPLING_MODE:
        suffix_name = "_undersampling"
    else:
        suffix_name = "_oversampling"

    return suffix_name

def prepare_data(lang, configuration_name, experiment_key, balance_mode, suffix_name, index):
    if balance_mode == NO_BALANCING_MODE:
        train = data_experiments[experiment_key]['train']
    else:
        train = data_experiments[experiment_key][f'train{suffix_name}']
    
    #train['chain-of-thought'] = train.apply(add_chain_of_thought_for_dataset, axis=1)

    conversations = DataUtils.create_fine_tuning_dataset(lang, train, template)
    
    # Write to a JSON file
    fine_tuning_file = f'data/{lang}/{configuration_name}/{index}/fine_tuning_data{suffix_name}.json'
    DataUtils.write_fine_tuning_json(fine_tuning_file, conversations)
    
    return fine_tuning_file

def create_fine_tuning_step(fine_tuning_file):
    fine_tuning_data = GPTUtils.upload_fine_tuning_file(fine_tuning_file)
    
    # Wait before create job, if exception when create job, retry
    retry_count = 0
    MAX_RETRIES = 1000
    time.sleep(30)
    while retry_count < MAX_RETRIES:
        try:
            job = GPTUtils.create_fine_tuning_job(training_file=fine_tuning_data['id'], model="gpt-3.5-turbo")
            return job
        except Exception as e:
            print(f"Error while creating job: {e}. Retrying...")
            retry_count += 1
            time.sleep(60)

    print("Max retries reached. Skipping this experiment.")
    return None

def poll_for_job_status(job):
    time.sleep(180) # wait for 180 seconds
    while True:
        job_result = GPTUtils.retrieve_job(job['id'])
        if job_result['status'] == "succeeded":
            return job_result
        elif job_result['status'] in ["failed", "error"]:
            print(f"Job failed with status: {job_result['status']}")
            return None
        elif job_result['status'] == "running":
            print("Job is still running")
        time.sleep(180)

def evaluate_model(lang, test, job_result, template, suffix_name, configuration_name, index):
    GPTUtils.evaluate_gpt(
        language=lang,
        dataset=test,
        save_prompt=True,
        model=job_result['fine_tuned_model'],
        prompt_template=template,
        file_output=f"result/{lang}/{configuration_name}/{index}/gpt_fine_tune{suffix_name}.csv")



Main function

In [33]:
def fine_tuning(lang, configuration_name, balance_mode, start_experiment=0, number_of_experiments=10):
    print(f"Language: {lang.upper()}")
    print(f"Config mode: {configuration_name}")
    print(f"Balance mode: {balance_mode}")
    suffix_name = get_suffix_name(balance_mode)
    print()
    
    for index in range(start_experiment, number_of_experiments):
        print(f"Experiment: {index}")
        
        # Step 1: Data preparation
        experiment_key = f"{lang}_{configuration_name}_{index}"
        fine_tuning_file = prepare_data(lang, configuration_name, experiment_key, balance_mode, suffix_name, index)
        
        # Step 2: Create Fine-Tuning Step
        job = create_fine_tuning_step(fine_tuning_file)
        if not job:
            break
        
        # Step 3: Poll for Job Status
        job_result = poll_for_job_status(job)
        if not job_result:
            break
        
        # Step 4: Evaluate Model
        evaluate_model(lang, data_experiments[experiment_key]['test'], job_result, template, suffix_name, configuration_name, index)
        
        print("---------")


In [191]:
fine_tuning(start_experiment=3, 
            number_of_experiments=4,
            lang="rust", 
            balance_mode=OVERSAMPLING_MODE, 
            configuration_name=CONFIG_DATA_WITH_COMMENTS_SINGLE_BEFORE_CHANGE)

Language: RUST
Config mode: data_with_comments_single_before_change
Balance mode: 2

Experiment: 3
Number of records in fine-tuning json: 1162
Conversations written to data/rust/data_with_comments_single_before_change/3/fine_tuning_data_oversampling.json
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Job is still running
Model: ft:gpt-3.5-turbo-0613:personal::8GTZqIhj
Number of test records: 207
target
0    194
1     13
Name: count, dtype: int64


Predicting: 100%|██████████| 207/207 [02:25<00:00,  1.43record/s]

GPT F1 Score: 0.5833333333333334
GPT Precision: 0.6363636363636364
GPT Recall: 0.5384615384615384
GPT Accuracy: 95.16908212560386%
GPT MCC: 0.5600750902889385
---------





### 2.2. GPT few-shot

In [25]:
def few_shot_learning(config_name, lang, number_of_experiments=10, start_experiment=0, max_vul=9, max_non_vul=1, model='gpt-3.5-turbo'):
    print(lang.upper())
    for index in range(start_experiment, number_of_experiments):
        print(f"\tExperiment: {index}")
        experiment_key = f"{lang}_{config_name}_{index}"
        train = data_experiments[experiment_key]['train']
        test = data_experiments[experiment_key]['test']

        # Filter non-vul
        non_vul = train[train['target'] == 0]

        # Filter vul
        vul = train[train['target'] == 1]
        
        print(f"\t\tTraining dataset {len(train)} (Nonvul, Vul): {len(non_vul)}, {len(vul)}")
        
        # Add examples
        non_vul = non_vul.head(max_non_vul)
        vul = vul.head(max_vul)
        
        examples = vul.apply(lambda row: row.to_dict(), axis=1).tolist() + non_vul.apply(lambda row: row.to_dict(), axis=1).tolist()
        
        GPTUtils.evaluate_gpt(lang, 
                    model=model, 
                    dataset=test, 
                    examples=examples,
                    prompt_template=template, 
                    file_output=f"result/{lang}/{config_name}/{index}/gpt_few_shot.csv")
        print()
        

In [34]:
few_shot_learning(start_experiment=0, 
                  number_of_experiments=1,
                  config_name=CONFIG_DATA_WITH_COMMENTS_SINGLE_BEFORE_CHANGE, lang="kotlin", model="gpt-3.5-turbo")

KOTLIN
	Experiment: 0
		Training dataset 54 (Nonvul, Vul): 44, 10
Model: gpt-3.5-turbo
Number of test records: 18
target
0    15
1     3
Name: count, dtype: int64


Predicting: 100%|██████████| 18/18 [00:18<00:00,  1.01s/record]

GPT F1 Score: 0.5
GPT Precision: 1.0
GPT Recall: 0.3333333333333333
GPT Accuracy: 88.88888888888889%
GPT MCC: 0.5423261445466404






## 3. Result

In [8]:
def get_true_positives(filename, prediction_column, actual_column):
    df = pd.read_csv(filename)
    true_positives_df = df[
        (df[actual_column] == 1) & 
        (df[prediction_column].isin([1, '1', True, 'true']))
    ]
    return set(true_positives_df['processed_func'].tolist())


def get_skip_id(include_ids):
    include_ids = set(include_ids)
    
    # Define the range of all possible experiment IDs (0 through 9 in this case)
    all_ids = set(range(10))

    # Calculate the IDs to skip
    skip_ids = all_ids - include_ids

    return sorted(list(skip_ids))

def get_include_ids(skip_ids):
    if skip_ids is None:
        return list(range(10))
    
    skip_ids = set(skip_ids)
    
    # Define the range of all possible experiment IDs (0 through 9 in this case)
    all_ids = set(range(10))

    # Calculate the IDs to skip
    include_ids = all_ids - skip_ids

    return sorted(list(include_ids))

def calculate_total_lines(filepath):
    df = pd.read_csv(filepath)
    
    total_vulnerable_lines = int(df['num_flaw_lines'].sum())
    total_lines = int(df['num_lines'].sum())
    
    return total_lines, total_vulnerable_lines

In [10]:
models = ["gpt_fine_tune_oversampling", "gpt_fine_tune_undersampling", "gpt_few_shot", "linevul_oversampling", "linevul_undersampling"]
experiments = range(0, 10)  # 10 experiments


overlap_lang_experiment = {}
overlap_ids = {}

total_lines_dict = {}
total_vulnerable_lines_dict = {}


for index, lang in enumerate([
    'kotlin/data_with_comments_single_before_change', 
    'kotlin/data_without_comments_single_before_change',
    'swift/data_with_comments_single_before_change', 
    'swift/data_without_comments_single_before_change',
    'rust/data_with_comments_single_before_change', 
    'rust/data_without_comments_single_before_change'], start=1):
    #print(f"{index}. {lang.upper()}")
    
    overlap_ids[lang] = []
    
    for experiment in experiments:
        
        # if experiment in skip_id:
        #     continue
        
        # List to store true positives of all models for the current experiment
        experiment_true_positives = []
        
        for model in models:
            filepath = f"result/{lang}/{experiment}/{model}.csv"
            #print(f"Model: {model}")
            if 'linevul' in model:
                prediction_column = 'y_preds'
                actual_column = 'y_trues'
                
                # Get total lines, total vulnerable lines
                total_lines, total_vulnerable_lines = calculate_total_lines(filepath)
                
                total_lines_dict[f"{lang}_{experiment}"] = total_lines  # Store the total lines for this experiment
                total_vulnerable_lines_dict[f"{lang}_{experiment}"] = total_vulnerable_lines  # Store the total vulnerable lines for this experiment
            else:
                prediction_column = 'GPT Prediction'
                actual_column = 'target'
            
            true_positives = get_true_positives(filepath,prediction_column,actual_column)
            # for func in true_positives:
            #     print(func.split('\n')[0])
            # print("-----")
            experiment_true_positives.append(true_positives)
        
        # Find the intersection (overlap) of all models for the current experiment
        overlap = set.intersection(*experiment_true_positives)
        
        if overlap:  # Check if the overlap is not empty
            overlap_lang_experiment[f"{lang}_{experiment}"] = overlap
            
            overlap_ids[lang].append(experiment)
            
            # print(f"Overlapping true positives for {lang} Experiment {experiment}:")
            # for func in overlap:
            #     print(func)
            # print("\n" + "="*50 + "\n")  # Separation line
            
            
#Now overlap_lang_experiment dictionary has the overlaps, printing non-empty overlaps
# for key, value in overlap_lang_experiment.items():
#     if value:  # If the overlap set is not empty
#         print(f"Non-empty overlap in {key}: {len(value)}")
#         for func in value:
#             print(func.split("\n")[0])
            
#         print("------------")

# for index, lang in enumerate([
#     'kotlin/data_with_comments_single_before_change', 
#     'kotlin/data_without_comments_single_before_change',
#     'swift/data_with_comments_single_before_change', 
#     'swift/data_without_comments_single_before_change',
#     'rust/data_with_comments_single_before_change', 
#     'rust/data_without_comments_single_before_change'], start=1):
#     for experiment in experiments:
#         print(total_lines_dict[f"{lang}_{experiment}"], total_vulnerable_lines_dict[f"{lang}_{experiment}"])

In [29]:
func_skip_ids = {}

for index, lang in enumerate([
    'kotlin/data_with_comments_single_before_change', 
    'kotlin/data_without_comments_single_before_change',
    'swift/data_with_comments_single_before_change', 
    'swift/data_without_comments_single_before_change',
    'rust/data_with_comments_single_before_change', 
    'rust/data_without_comments_single_before_change'], start=1):
    func_skip_ids[lang] = []

func_skip_ids['kotlin/data_with_comments_single_before_change'] = [3,4,6,7,8,9]
func_skip_ids['kotlin/data_without_comments_single_before_change'] = [0,3,7,6,4,8]
func_skip_ids['swift/data_with_comments_single_before_change'] = [0,1,7,6,8]
func_skip_ids['swift/data_without_comments_single_before_change'] = [0,1,4,7,8]
func_skip_ids['rust/data_with_comments_single_before_change'] = [0,3,6,8]
func_skip_ids['rust/data_without_comments_single_before_change'] = [0,6,8]

#### 3.1. Line level metrics

In [30]:
line_level_df = []

for index, lang in enumerate([
    'kotlin/data_with_comments_single_before_change', 
    'kotlin/data_without_comments_single_before_change',
    'swift/data_with_comments_single_before_change', 
    'swift/data_without_comments_single_before_change',
    'rust/data_with_comments_single_before_change', 
    'rust/data_without_comments_single_before_change'], start=1):
    print(f"{index}. {lang.upper()}")
    
    # Skip id (overlap)
    #skip_id = get_skip_id(overlap_ids[lang])
    #print(f"Include experiments: {overlap_ids[lang]}")

    # Skip id (f1-score = 0)
    skip_id = func_skip_ids[lang]
    include_experiments = get_include_ids(skip_id)
    print(f"Skip ids: {skip_id}")
    print(f"Include experiments: {include_experiments}")
    
    for model_index, model in enumerate(["gpt_fine_tune_undersampling", "gpt_fine_tune_oversampling", "gpt_few_shot", "linevul_undersampling", "linevul_oversampling", "linevul_no_balancing"], start=1):
        print(f"{index}.{model_index}. {model.upper()}")
        
        top1, top3, top5, top10, ifa, effort_20_recall, recall_1_percent_loc = GPTUtils.avg_experiments(file_name=f"{model}.csv", 
                                 language=lang, 
                                 print_data=False, 
                                 skip_id=skip_id, 
                                 #overlap_lang_experiment=overlap_lang_experiment,
                                 overlap_lang_experiment=None,
                                 total_lines_dict=total_lines_dict,
                                 total_vulnerable_lines_dict=total_vulnerable_lines_dict,
                                 line_level =True,
                                 func_level = False
                                 )
        
        # Append a new record to our list of data
        line_level_df.append({
            "lang": lang,
            "model": model,
            "include_experiments": include_experiments,
            "Top-1-accuracy": top1,
            "Top-3-accuracy": top3,
            "Top-5-accuracy": top5,
            "Top-10-accuracy": top10,
            "IFA": ifa,
            "Effort@20% Recall": effort_20_recall,
            "Recall@1% LOC": recall_1_percent_loc
        })
        
# Convert the list of data into a pandas DataFrame
line_level_df = pd.DataFrame(line_level_df)

display(line_level_df)


1. KOTLIN/DATA_WITH_COMMENTS_SINGLE_BEFORE_CHANGE
Skip ids: [3, 4, 6, 7, 8, 9]
Include experiments: [0, 1, 2, 5]
1.1. GPT_FINE_TUNE_UNDERSAMPLING
Number of iterations 4

	Average top 1 accuracy: 87.5%
	Average top 3 accuracy: 87.5%
	Average top 5 accuracy: 87.5%
	Average top 10 accuracy: 87.5%
	Average ifa: 0.12
	Average effort at 20 recall: 0.0134
	Average recall at 1% loc: 0.1648
1.2. GPT_FINE_TUNE_OVERSAMPLING
Number of iterations 4

	Average top 1 accuracy: 87.5%
	Average top 3 accuracy: 87.5%
	Average top 5 accuracy: 87.5%
	Average top 10 accuracy: 87.5%
	Average ifa: 0.12
	Average effort at 20 recall: 0.0098
	Average recall at 1% loc: 0.1143
1.3. GPT_FEW_SHOT
Number of iterations 4

	Average top 1 accuracy: 100.0%
	Average top 3 accuracy: 100.0%
	Average top 5 accuracy: 100.0%
	Average top 10 accuracy: 100.0%
	Average ifa: 0.0
	Average effort at 20 recall: 0.0081
	Average recall at 1% loc: 0.137
1.4. LINEVUL_UNDERSAMPLING
Number of iterations 4

	Average top 1 accuracy: 100.0%
	A

Unnamed: 0,lang,model,include_experiments,Top-1-accuracy,Top-3-accuracy,Top-5-accuracy,Top-10-accuracy,IFA,Effort@20% Recall,Recall@1% LOC
0,kotlin/data_with_comments_single_before_change,gpt_fine_tune_undersampling,"[0, 1, 2, 5]",0.875,0.875,0.875,0.875,0.125,0.013376,0.164794
1,kotlin/data_with_comments_single_before_change,gpt_fine_tune_oversampling,"[0, 1, 2, 5]",0.875,0.875,0.875,0.875,0.125,0.009753,0.114288
2,kotlin/data_with_comments_single_before_change,gpt_few_shot,"[0, 1, 2, 5]",1.0,1.0,1.0,1.0,0.0,0.008063,0.137016
3,kotlin/data_with_comments_single_before_change,linevul_undersampling,"[0, 1, 2, 5]",1.0,1.0,1.0,1.0,0.0,0.013376,0.204948
4,kotlin/data_with_comments_single_before_change,linevul_oversampling,"[0, 1, 2, 5]",1.0,1.0,1.0,1.0,0.0,0.013376,0.160395
5,kotlin/data_with_comments_single_before_change,linevul_no_balancing,"[0, 1, 2, 5]",0.875,0.875,0.875,0.875,1.75,0.013376,0.110792
6,kotlin/data_without_comments_single_before_change,gpt_fine_tune_undersampling,"[1, 2, 5, 9]",0.625,0.625,0.625,0.625,0.375,0.013133,0.129079
7,kotlin/data_without_comments_single_before_change,gpt_fine_tune_oversampling,"[1, 2, 5, 9]",0.875,0.875,0.875,0.875,0.125,0.013133,0.114288
8,kotlin/data_without_comments_single_before_change,gpt_few_shot,"[1, 2, 5, 9]",1.0,1.0,1.0,1.0,0.0,0.00782,0.114288
9,kotlin/data_without_comments_single_before_change,linevul_undersampling,"[1, 2, 5, 9]",0.875,0.875,0.875,0.875,1.375,0.023268,0.133519


#### 3.2. Func level metrics

In [31]:
func_level_df = []

for index, lang in enumerate([
    'kotlin/data_with_comments_single_before_change', 
    'kotlin/data_without_comments_single_before_change',
    'swift/data_with_comments_single_before_change', 
    'swift/data_without_comments_single_before_change',
    'rust/data_with_comments_single_before_change', 
    'rust/data_without_comments_single_before_change'], start=1):
    print(f"{index}. {lang.upper()}")

    skip_id = func_skip_ids[lang]
    # include_experiments = get_include_ids(skip_id)
    # print(f"Include experiments: {include_experiments}")
    
    for model_index, model in enumerate(["gpt_fine_tune_undersampling", "gpt_fine_tune_oversampling", "gpt_few_shot", "linevul_undersampling", "linevul_oversampling", "linevul_no_balancing"], start=1):
        print(f"{index}.{model_index}. {model.upper()}")
        
        mcc, f1, precision, recall, accuracy = GPTUtils.avg_experiments(file_name=f"{model}.csv", 
                                 language=lang, 
                                 print_data=False,
                                 skip_id=[],
                                 line_level=False,
                                 func_level=True
                                 )
        
        # Append a new record to our list of data
        func_level_df.append({
            "lang": lang,
            "model": model,
            "include_experiments": include_experiments,
            "mcc": mcc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "accuracy": accuracy
        })
        
# Convert the list of data into a pandas DataFrame
func_level_df = pd.DataFrame(func_level_df)

display(func_level_df)

1. KOTLIN/DATA_WITH_COMMENTS_SINGLE_BEFORE_CHANGE
1.1. GPT_FINE_TUNE_UNDERSAMPLING
Zero F1: result/kotlin/data_with_comments_single_before_change/3/gpt_fine_tune_undersampling.csv
Number of iterations 10
List zero f1-score experiments: [3]

	Average Mcc: 0.21
	Average F1 Score: 0.36
	Average Precision: 0.32
	Average Recall: 0.54
	Average Accuracy: 66.67%
1.2. GPT_FINE_TUNE_OVERSAMPLING
Zero F1: result/kotlin/data_with_comments_single_before_change/7/gpt_fine_tune_oversampling.csv
Zero F1: result/kotlin/data_with_comments_single_before_change/9/gpt_fine_tune_oversampling.csv
Number of iterations 10
List zero f1-score experiments: [7, 9]

	Average Mcc: 0.19
	Average F1 Score: 0.32
	Average Precision: 0.32
	Average Recall: 0.32
	Average Accuracy: 76.67%
1.3. GPT_FEW_SHOT
Number of iterations 10

	Average Mcc: 0.34
	Average F1 Score: 0.46
	Average Precision: 0.5
	Average Recall: 0.5
	Average Accuracy: 76.67%
1.4. LINEVUL_UNDERSAMPLING
Zero F1: result/kotlin/data_with_comments_single_before

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Zero F1: result/kotlin/data_with_comments_single_before_change/4/linevul_no_balancing.csv
Zero F1: result/kotlin/data_with_comments_single_before_change/6/linevul_no_balancing.csv
Zero F1: result/kotlin/data_with_comments_single_before_change/7/linevul_no_balancing.csv
Zero F1: result/kotlin/data_with_comments_single_before_change/8/linevul_no_balancing.csv
Number of iterations 10
List zero f1-score experiments: [4, 6, 7, 8]

	Average Mcc: 0.12
	Average F1 Score: 0.23
	Average Precision: 0.24
	Average Recall: 0.23
	Average Accuracy: 76.67%
2. KOTLIN/DATA_WITHOUT_COMMENTS_SINGLE_BEFORE_CHANGE
2.1. GPT_FINE_TUNE_UNDERSAMPLING
Number of iterations 10

	Average Mcc: 0.22
	Average F1 Score: 0.39
	Average Precision: 0.33
	Average Recall: 0.56
	Average Accuracy: 65.56%
2.2. GPT_FINE_TUNE_OVERSAMPLING
Zero F1: result/kotlin/data_without_comments_single_before_change/7/gpt_fine_tune_oversampling.csv
Number of iterations 10
List zero f1-score experiments: [7]

	Average Mcc: 0.18
	Average F1 Scor

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Zero F1: result/kotlin/data_without_comments_single_before_change/0/linevul_no_balancing.csv
Zero F1: result/kotlin/data_without_comments_single_before_change/3/linevul_no_balancing.csv
Zero F1: result/kotlin/data_without_comments_single_before_change/4/linevul_no_balancing.csv
Zero F1: result/kotlin/data_without_comments_single_before_change/6/linevul_no_balancing.csv
Zero F1: result/kotlin/data_without_comments_single_before_change/7/linevul_no_balancing.csv
Number of iterations 10
List zero f1-score experiments: [0, 3, 4, 6, 7]

	Average Mcc: 0.16
	Average F1 Score: 0.24
	Average Precision: 0.31
	Average Recall: 0.22
	Average Accuracy: 79.44%
3. SWIFT/DATA_WITH_COMMENTS_SINGLE_BEFORE_CHANGE
3.1. GPT_FINE_TUNE_UNDERSAMPLING
Number of iterations 10

	Average Mcc: 0.34
	Average F1 Score: 0.32
	Average Precision: 0.2
	Average Recall: 0.83
	Average Accuracy: 78.41%
3.2. GPT_FINE_TUNE_OVERSAMPLING
Number of iterations 10

	Average Mcc: 0.43
	Average F1 Score: 0.41
	Average Precision: 0.66

  _warn_prf(average, modifier, msg_start, len(result))


Number of iterations 10
List zero f1-score experiments: [0]

	Average Mcc: 0.36
	Average F1 Score: 0.36
	Average Precision: 0.55
	Average Recall: 0.3
	Average Accuracy: 93.86%
4.3. GPT_FEW_SHOT
Zero F1: result/swift/data_without_comments_single_before_change/1/gpt_few_shot.csv
Number of iterations 10
List zero f1-score experiments: [1]

	Average Mcc: 0.29
	Average F1 Score: 0.33
	Average Precision: 0.36
	Average Recall: 0.32
	Average Accuracy: 92.16%
4.4. LINEVUL_UNDERSAMPLING
Zero F1: result/swift/data_without_comments_single_before_change/8/linevul_undersampling.csv
Number of iterations 10
List zero f1-score experiments: [8]

	Average Mcc: 0.32
	Average F1 Score: 0.34
	Average Precision: 0.24
	Average Recall: 0.63
	Average Accuracy: 85.0%
4.5. LINEVUL_OVERSAMPLING
Zero F1: result/swift/data_without_comments_single_before_change/7/linevul_oversampling.csv
Number of iterations 10
List zero f1-score experiments: [7]

	Average Mcc: 0.26
	Average F1 Score: 0.28
	Average Precision: 0.26
	A

  _warn_prf(average, modifier, msg_start, len(result))


Zero F1: result/rust/data_with_comments_single_before_change/0/gpt_few_shot.csv
Zero F1: result/rust/data_with_comments_single_before_change/3/gpt_few_shot.csv
Zero F1: result/rust/data_with_comments_single_before_change/6/gpt_few_shot.csv
Number of iterations 10
List zero f1-score experiments: [0, 3, 6]

	Average Mcc: 0.03
	Average F1 Score: 0.08
	Average Precision: 0.08
	Average Recall: 0.08
	Average Accuracy: 89.13%
5.4. LINEVUL_UNDERSAMPLING
Number of iterations 10

	Average Mcc: 0.27
	Average F1 Score: 0.29
	Average Precision: 0.21
	Average Recall: 0.6
	Average Accuracy: 81.26%
5.5. LINEVUL_OVERSAMPLING
Number of iterations 10

	Average Mcc: 0.34
	Average F1 Score: 0.37
	Average Precision: 0.41
	Average Recall: 0.38
	Average Accuracy: 91.45%
5.6. LINEVUL_NO_BALANCING
Number of iterations 10

	Average Mcc: 0.39
	Average F1 Score: 0.37
	Average Precision: 0.67
	Average Recall: 0.3
	Average Accuracy: 93.67%
6. RUST/DATA_WITHOUT_COMMENTS_SINGLE_BEFORE_CHANGE
6.1. GPT_FINE_TUNE_UNDERSA

Unnamed: 0,lang,model,include_experiments,mcc,f1,precision,recall,accuracy
0,kotlin/data_with_comments_single_before_change,gpt_fine_tune_undersampling,"[1, 2, 3, 4, 5, 7, 9]",0.213922,0.36381,0.315,0.541667,0.666667
1,kotlin/data_with_comments_single_before_change,gpt_fine_tune_oversampling,"[1, 2, 3, 4, 5, 7, 9]",0.193473,0.318968,0.323333,0.325,0.766667
2,kotlin/data_with_comments_single_before_change,gpt_few_shot,"[1, 2, 3, 4, 5, 7, 9]",0.343814,0.458016,0.502738,0.5,0.766667
3,kotlin/data_with_comments_single_before_change,linevul_undersampling,"[1, 2, 3, 4, 5, 7, 9]",0.082522,0.300623,0.235556,0.5,0.577778
4,kotlin/data_with_comments_single_before_change,linevul_oversampling,"[1, 2, 3, 4, 5, 7, 9]",0.099809,0.295306,0.282917,0.45,0.605556
5,kotlin/data_with_comments_single_before_change,linevul_no_balancing,"[1, 2, 3, 4, 5, 7, 9]",0.122006,0.231111,0.24,0.233333,0.766667
6,kotlin/data_without_comments_single_before_change,gpt_fine_tune_undersampling,"[1, 2, 3, 4, 5, 7, 9]",0.218225,0.390791,0.334524,0.558333,0.655556
7,kotlin/data_without_comments_single_before_change,gpt_fine_tune_oversampling,"[1, 2, 3, 4, 5, 7, 9]",0.181615,0.325173,0.376905,0.325,0.738889
8,kotlin/data_without_comments_single_before_change,gpt_few_shot,"[1, 2, 3, 4, 5, 7, 9]",0.271443,0.398571,0.434405,0.441667,0.75
9,kotlin/data_without_comments_single_before_change,linevul_undersampling,"[1, 2, 3, 4, 5, 7, 9]",0.078187,0.296496,0.242619,0.466667,0.588889


In [306]:
func_level_df = []

for index, lang in enumerate([
    'kotlin/data_with_comments_single_before_change', 
    'kotlin/data_without_comments_single_before_change',
    'swift/data_with_comments_single_before_change', 
    'swift/data_without_comments_single_before_change',
    'rust/data_with_comments_single_before_change', 
    'rust/data_without_comments_single_before_change'], start=1):
    print(f"{index}. {lang.upper()}")

    skip_id = func_skip_ids[lang]
    include_experiments = get_include_ids(skip_id)
    #print(f"Include experiments: {include_experiments}")
    
    #for model_index, model in enumerate(["linevul_no_balancing", "linevul_oversampling", "linevul_undersampling"], start=1):
    for model_index, model in enumerate(["linevul_no_balancing"], start=1):
        print(f"{index}.{model_index}. {model.upper()}")
        
        mcc, f1, precision, recall, accuracy = GPTUtils.avg_experiments(file_name=f"{model}.csv", 
                                 language=lang, 
                                 print_data=True,
                                 skip_id=[],
                                 line_level=False,
                                 func_level=True
                                 )
        
        # Append a new record to our list of data
        func_level_df.append({
            "lang": lang,
            "model": model,
            "include_experiments": include_experiments,
            "mcc": mcc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "accuracy": accuracy
        })
        
# Convert the list of data into a pandas DataFrame
func_level_df = pd.DataFrame(func_level_df)

display(func_level_df)

1. KOTLIN/DATA_WITH_COMMENTS_SINGLE_BEFORE_CHANGE
1.1. LINEVUL_NO_BALANCING
Experiment 0
	F1 Score: 0.4
	Precision: 0.5
	Recall: 0.3333333333333333
	Accuracy: 83.33333333333334%
	MCC: 0.31622776601683794
Experiment 1
	F1 Score: 0.4
	Precision: 0.3333333333333333
	Recall: 0.5
	Accuracy: 66.66666666666666%
	MCC: 0.1889822365046136
Experiment 2
	F1 Score: 0.3333333333333333
	Precision: 0.3333333333333333
	Recall: 0.3333333333333333
	Accuracy: 77.77777777777779%
	MCC: 0.2
Experiment 3
	F1 Score: 0.4
	Precision: 0.5
	Recall: 0.3333333333333333
	Accuracy: 83.33333333333334%
	MCC: 0.31622776601683794
Experiment 4
	F1 Score: 0.0
	Precision: 0.0
	Recall: 0.0
	Accuracy: 77.77777777777779%
	MCC: -0.10846522890932808
Experiment 5
	F1 Score: 0.4444444444444445
	Precision: 0.4
	Recall: 0.5
	Accuracy: 72.22222222222221%
	MCC: 0.26519741765271837
Experiment 6
	F1 Score: 0.0
	Precision: 0.0
	Recall: 0.0
	Accuracy: 72.22222222222221%
	MCC: -0.15811388300841897
Experiment 7
	F1 Score: 0.0
	Precision: 0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


	F1 Score: 0.17391304347826086
	Precision: 0.2
	Recall: 0.15384615384615385
	Accuracy: 90.82125603864735%
	MCC: 0.12741269029330526
Experiment 9
	F1 Score: 0.34782608695652173
	Precision: 0.4
	Recall: 0.3076923076923077
	Accuracy: 92.7536231884058%
	MCC: 0.3131480909321376

	Average Mcc: 0.39
	Average F1 Score: 0.37
	Average Precision: 0.67
	Average Recall: 0.3
	Average Accuracy: 93.67%
6. RUST/DATA_WITHOUT_COMMENTS_SINGLE_BEFORE_CHANGE
6.1. LINEVUL_NO_BALANCING
Experiment 0
	F1 Score: 0.5599999999999999
	Precision: 0.5833333333333334
	Recall: 0.5384615384615384
	Accuracy: 94.68599033816425%
	MCC: 0.5322529228813081
Experiment 1
	F1 Score: 0.5454545454545455
	Precision: 0.6666666666666666
	Recall: 0.46153846153846156
	Accuracy: 95.16908212560386%
	MCC: 0.5306719487402056
Experiment 2
	F1 Score: 0.1935483870967742
	Precision: 0.1125
	Recall: 0.6923076923076923
	Accuracy: 63.76811594202898%
	MCC: 0.16258498681349776
Experiment 3
	F1 Score: 0.23404255319148934
	Precision: 0.13580246913580

Unnamed: 0,lang,model,include_experiments,mcc,f1,precision,recall,accuracy
0,kotlin/data_with_comments_single_before_change,linevul_no_balancing,"[0, 1, 2, 4, 5, 8]",0.122006,0.231111,0.24,0.233333,0.766667
1,kotlin/data_without_comments_single_before_change,linevul_no_balancing,"[0, 1, 2, 3, 5, 9]",0.158866,0.242063,0.306667,0.216667,0.794444
2,swift/data_with_comments_single_before_change,linevul_no_balancing,"[2, 3, 4, 5, 6, 9]",0.348709,0.357609,0.462143,0.36,0.929545
3,swift/data_without_comments_single_before_change,linevul_no_balancing,"[2, 3, 4, 5, 6, 9]",0.347811,0.368034,0.438095,0.326667,0.939773
4,rust/data_with_comments_single_before_change,linevul_no_balancing,"[1, 2, 4, 5, 7, 9]",0.393681,0.374999,0.669841,0.3,0.936715
5,rust/data_without_comments_single_before_change,linevul_no_balancing,"[1, 2, 3, 4, 5, 7, 9]",0.334307,0.340481,0.436208,0.430769,0.877778


In [51]:
def calculate_loc(lang, path=None, code_col='code', label_col='label'):
    
    if path == None: 
        loc_df = pd.read_csv(f"data/{lang}_data.csv")
    else:
        loc_df = pd.read_csv(path)

    total_loc = 0
    count = 0
    vul_count = 0
    non_vul_count = 0
    vul_line_count = 0
    for index, row in loc_df.iterrows():
        loc = len(row['code'].split('\n'))
        total_loc+=loc
        count+=1
        
        vul = row['label']
        if vul == 1:
            vul_count += 1
        else:
            non_vul_count += 1
            
        flaw_line_index = row['mod_lines']
    
        if pd.notna(flaw_line_index):
            num_flaw_line = len(flaw_line_index.split(','))
            
            vul_line_count += num_flaw_line
        
    avg = total_loc/count
    print(f"Number of records: {count}")
    print(f"Number of vul records: {vul_count}")
    print(f"Number of non records: {non_vul_count}")
    print(f"Number of vul lines: {vul_line_count}")
    return avg
    
for lang in ['kotlin', 'swift', 'rust']:
    print(f"Avg loc of {lang}: {calculate_loc(lang)}")
    
#print(f"Avg loc of {lang}: {calculate_loc('C/C++', '/Users/tungthai/Downloads/processed_data.csv')}")

Number of records: 118
Number of vul records: 20
Number of non records: 98
Number of vul lines: 45
Avg loc of kotlin: 10.61864406779661
Number of records: 485
Number of vul records: 36
Number of non records: 449
Number of vul lines: 104
Avg loc of swift: 16.896907216494846
Number of records: 1199
Number of vul records: 90
Number of non records: 1109
Number of vul lines: 350
Avg loc of rust: 22.376146788990827
