In [16]:
import pandas as pd
import json
import csv
import os
import re
from collections import Counter


Set the seeder to have as stable random operations as possible

In [2]:
seed = 123
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

Read data

In [3]:
data = pd.read_csv('vuln_categories_dataset.csv')

Distribution of classes in dataset

In [4]:
label_frequencies = data['Category'].value_counts()
print("Label Frequencies:\n", label_frequencies)

Label Frequencies:
 sql_injection            1431
xsrf                      976
command_injection         721
path_disclosure           481
open_redirect             442
remote_code_execution     334
xss                       145
Name: Category, dtype: int64


Shuffle the dataset

In [5]:
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)

Print the first part of the dataset to have a look

In [6]:
print(data.head())

                                       Vulnerability       Category
0              f"""            FROM {PRODUCTS_TAB...  sql_injection
1      client.listenTCP()    proxy = Proxy(proxy_...           xsrf
2  from django.http import HttpResponse, HttpResp...  open_redirect
3  def write_preset(conn, queryin, descriptin):\t...  sql_injection
4                          update_query = self.up...  sql_injection


Data Cleansing - Make Sequences of tokens

In [14]:
def replace_strings_and_numericals(source_code):
    
    # Replace strings with "STR$ID"
    modified_code = re.sub(r'(\"[^\"]*\"|\'.*?\')', '"STR$ID"', source_code)
    
    # Replace numerical values with "NUM%ID"
    modified_code = re.sub(r'\b\d+\b', 'NUM%ID', modified_code)
    
    return modified_code

def remove_comments(source_code):
    # Remove single-line comments
    modified_code = re.sub(r'(#.*)', '', source_code)
    
    # Remove multi-line comments
    modified_code = re.sub(r'\'\'\'[\s\S]*?\'\'\'|\"\"\"[\s\S]*?\"\"\"', '', modified_code)
    
    return modified_code

def count_token_appearances(strings):
    # Step 1: Count the occurrence of each token
    token_counts = Counter()
    for string in strings:
        tokens = string.split()  # Assuming tokens are separated by whitespace
        token_counts.update(tokens)

    return token_counts

def convert_to_lowercase(source_code):
    # Convert all letters to lowercase
    return source_code.lower()

def remove_blank_lines(source_code):
    # Remove blank lines
    return re.sub(r'^\s*\n', '', source_code, flags=re.MULTILINE)

def dataTokenization(dataset):    
    
    for i in range(len(dataset)):
        
        source_code = dataset[i]
        
        source_code = replace_strings_and_numericals(source_code)
        
        #source_code = remove_comments(source_code)
        
        source_code = convert_to_lowercase(source_code)
        
        source_code = remove_blank_lines(source_code)
        
        dataset[i] = source_code
        
    token_counts = count_token_appearances(dataset)
    ordered_token_counts = token_counts.most_common()
        
    return dataset, ordered_token_counts

In [23]:
data["Vulnerability"], ordered_token_counts = dataTokenization(data["Vulnerability"].tolist())
print(data["Vulnerability"])
print('\n', ordered_token_counts[0:10])

0                   f"str$id""str$id""str$id"         ...
1           client.listentcp()    proxy = proxy(proxy_...
2       from django.http import httpresponse, httpresp...
3       def write_preset(conn, queryin, descriptin):\t...
4                               update_query = self.up...
                              ...                        
4525                url(r"str$id", taskassets.as_view()),
4526            response = self.client.post(url)      ...
4527                       print("str$id" + str(self.kd))
4528                             "str$id""str$id""str$id"
4529        self.assertequal(        num%id, app.post(...
Name: Vulnerability, Length: 4530, dtype: object

 [('=', 5157), ('"str$id"', 1628), ('+', 941), ('%', 902), ('if', 883), ('"str$id",', 849), ('response', 845), ('return', 761), ('"str$id":', 604), ('in', 546)]


Data analysis

In [29]:
def getLengths(string_list):
    # Initialize an empty list to store lengths
    lengths = []

    # Iterate over each string in the list
    for string in string_list:
        # Split the string into tokens (words)
        tokens = string.split()
        # Calculate the length (number of tokens) of the string
        length = len(tokens)
        # Append the length to the lengths list
        lengths.append(length)

    return lengths

In [33]:
lengths = getLengths(data["Vulnerability"].tolist())
data["Length"] = lengths

print("avg of lengths = ", np.mean(lengths))
print("max of lengths = ", max(lengths))
print("min of lengths = ", min(lengths))

avg of lengths =  10.89514348785872
max of lengths =  1400
min of lengths =  0


In [32]:
# Sort data samples by length. Remove the samples with zero tokens. 
# Detect and remove outlier with very large length.
len_sorted = sorted(lengths)
print(len_sorted)


[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [39]:
filtered_df = data[data["Length"] == max(lengths) | (data["Length"] == 0)]
print(filtered_df)

data = data.drop(filtered_df.index)


# Reset index
data = data.reset_index(drop=True)


                                          Vulnerability           Category  \
27                self.cursor.execute("str$id".forma...      sql_injection   
111                              sys.exit(num%id)\t\r\r    path_disclosure   
122               csr_state_id=csr_state_logout.csr_...      open_redirect   
140          \t\t\t\t "str$id".join(self.or_conditions)      sql_injection   
148                                      help="str$id")  command_injection   
...                                                 ...                ...   
4451                         bot.add_cog(database(bot))  command_injection   
4504                           "str$id""str$id""str$id"    path_disclosure   
4506                                           print(q)      sql_injection   
4514                              httpresponseredirect,      open_redirect   
4527                           "str$id""str$id""str$id"               xsrf   

      Length  
27         1  
111        1  
122        1  
140

Bag of Words (BoW)

'    client.listentcp()    proxy = proxy(proxy_port="str$id", app_port="str$id")    context.call_to_terminate_proxy = proxy.run_on_a_thread()'