Import Libraries

In [1]:
import pandas as pd
import json
import csv
import os


Fetch data

In [2]:
def readData(category):
    """Load one category's JSON file from the local ``data`` directory.

    Args:
        category: File-name suffix. The file opened is
            ``data/plain_<category>``; the suffix is used verbatim, so it
            must already contain any extension (e.g. ``"xss.json"``).

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    filesBase = "plain_"
    filepath = os.path.join("data", filesBase + category)

    # Pin the encoding: without it open() falls back to the locale's default,
    # which is not UTF-8 on every platform and can break on non-ASCII source
    # snippets stored in the JSON.
    with open(filepath, 'r', encoding="utf-8") as file:
        return json.load(file)

In [3]:
def getSource(file_data):
    """Collect changed code fragments and full file sources from a dataset.

    ``file_data`` is walked as two levels of nested mappings (outer key ->
    inner key -> entry). For every entry, each value of its ``"files"``
    mapping is visited: the file's ``"sourceWithComments"`` value is
    recorded, and for every item in the file's ``"changes"`` list the
    ``"badparts"`` and ``"goodparts"`` values are recorded.

    Args:
        file_data: Nested mapping as described above (in this notebook, the
            object returned by ``readData``).

    Returns:
        A tuple ``(bad_parts, good_parts, full_code)``:
        ``bad_parts`` and ``good_parts`` hold one element per change and are
        index-aligned (an element is ``None`` when the key is absent);
        ``full_code`` holds one element per visited file.
    """
    bad_parts = []
    good_parts = []
    full_code = []

    for entries in file_data.values():
        for pro in entries.values():
            # A missing or null "files"/"changes" value is treated as empty
            # rather than raising, so one incomplete record does not abort
            # the whole extraction.
            files = pro.get("files") or {}
            for f in files.values():
                full_code.append(f.get("sourceWithComments"))

                # get bad and good parts from diff
                for change in f.get("changes") or []:
                    bad_parts.append(change.get("badparts"))
                    good_parts.append(change.get("goodparts"))

    return bad_parts, good_parts, full_code

In [4]:
def get_bad_lines(bad_parts, sep=""):
    """Join the lines of each change into a single string.

    Args:
        bad_parts: Iterable whose elements are iterables of strings (the
            lines of one change). A ``None`` element is accepted and yields
            an empty string, so the result stays index-aligned with the input.
        sep: String placed between consecutive lines. Defaults to ``""``,
            which concatenates the lines directly; pass e.g. ``"\\n"`` to
            keep line boundaries visible.

    Returns:
        A list with one joined string per element of ``bad_parts``.
    """
    # str.join builds each string in one pass instead of re-copying the
    # accumulated text on every "+" as a manual loop would.
    return [sep.join(() if vuln is None else vuln) for vuln in bad_parts]

In [5]:
# Load every category file up front; the order below is the order in which
# the files are read.
injection, redirect, disclosure, remote, sql, xsrf, xss = (
    readData(json_name)
    for json_name in (
        "command_injection.json",
        "open_redirect.json",
        "path_disclosure.json",
        "remote_code_execution.json",
        "sql.json",
        "xsrf.json",
        "xss.json",
    )
)

# Run the same extract-then-join pipeline on each dataset. bad_parts,
# good_parts and full_code are overwritten on every pass, so after the loop
# they hold the values for the last dataset (xss).
vulns_per_dataset = []
for dataset in (injection, redirect, disclosure, remote, sql, xsrf, xss):
    bad_parts, good_parts, full_code = getSource(dataset)
    vulns_per_dataset.append(get_bad_lines(bad_parts))

(
    vulns_injection,
    vulns_redirect,
    vulns_disclosure,
    vulns_remote,
    vulns_sql,
    vulns_xsrf,
    vulns_xss,
) = vulns_per_dataset

Create the dataset DataFrame and export it to CSV

In [6]:
# Category label -> joined vulnerable snippets extracted for that category.
# Insertion order fixes the row order of the resulting DataFrame.
snippets_by_category = {
    "command_injection": vulns_injection,
    "open_redirect": vulns_redirect,
    "path_disclosure": vulns_disclosure,
    "remote_code_execution": vulns_remote,
    "sql_injection": vulns_sql,
    "xsrf": vulns_xsrf,
    "xss": vulns_xss,
}

# Flatten into (snippet, label) rows.
vulns_data = [
    (snippet, label)
    for label, snippets in snippets_by_category.items()
    for snippet in snippets
]

# Two-column frame: the snippet text and the category it came from.
df = pd.DataFrame(vulns_data, columns=['Vulnerability', 'Category'])

# Persist the dataset next to the notebook, without the index column.
df.to_csv('vuln_categories_dataset.csv', index=False)

# Show a truncated text preview of the frame.
print(df)

                                          Vulnerability           Category
0         @staticmethod    def test(self):          ...  command_injection
1                                     version = "1.0.9"  command_injection
2             resp_start = self._helpers.bytesToStri...  command_injection
3                 'any': ['"&timeout $time&\'`sleep ...  command_injection
4                 if (self._attack(basePair, inserti...  command_injection
...                                                 ...                ...
4525  from flask import Flask,requestfrom termcolor ...                xss
4526                                              pl=[]                xss
4527              i.md = markdown(i.content, extensi...                xss
4528          context['md'] = markdown(self.object.c...                xss
4529          return HttpResponseBadRequest(form.err...                xss

[4530 rows x 2 columns]
