In [118]:
import pandas as pd
import json
import csv
import os
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer


Set the seeder to have as stable random operations as possible

In [65]:
seed = 123
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

Read data

In [66]:
data = pd.read_csv('vuln_categories_dataset.csv')

Distribution of classes in dataset

In [67]:
label_frequencies = data['Category'].value_counts()
print("Label Frequencies:\n", label_frequencies)

Label Frequencies:
 sql_injection            1431
xsrf                      976
command_injection         721
path_disclosure           481
open_redirect             442
remote_code_execution     334
xss                       145
Name: Category, dtype: int64


Shuffle the dataset

In [68]:
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)

Print the first part of the dataset to have a look

In [69]:
print(data.head())

                                       Vulnerability       Category
0              f"""            FROM {PRODUCTS_TAB...  sql_injection
1      client.listenTCP()    proxy = Proxy(proxy_...           xsrf
2  from django.http import HttpResponse, HttpResp...  open_redirect
3  def write_preset(conn, queryin, descriptin):\t...  sql_injection
4                          update_query = self.up...  sql_injection


Data Cleansing - Make Sequences of tokens

In [70]:
def replace_strings_and_numericals(source_code):
    
    # Replace strings with "STR$ID"
    modified_code = re.sub(r'(\"[^\"]*\"|\'.*?\')', '"STR$ID"', source_code)
    
    # Replace numerical values with "NUM%ID"
    modified_code = re.sub(r'\b\d+\b', 'NUM%ID', modified_code)
    
    return modified_code

def remove_comments(source_code):
    # Remove single-line comments
    modified_code = re.sub(r'(#.*)', '', source_code)
    
    # Remove multi-line comments
    modified_code = re.sub(r'\'\'\'[\s\S]*?\'\'\'|\"\"\"[\s\S]*?\"\"\"', '', modified_code)
    
    return modified_code

def count_token_appearances(strings):
    # Step 1: Count the occurrence of each token
    token_counts = Counter()
    for string in strings:
        tokens = string.split()  # Assuming tokens are separated by whitespace
        token_counts.update(tokens)

    return token_counts

def convert_to_lowercase(source_code):
    # Convert all letters to lowercase
    return source_code.lower()

def remove_blank_lines(source_code):
    # Remove blank lines
    return re.sub(r'^\s*\n', '', source_code, flags=re.MULTILINE)

def dataTokenization(dataset):    
    
    for i in range(len(dataset)):
        
        source_code = dataset[i]
        
        source_code = replace_strings_and_numericals(source_code)
        
        #source_code = remove_comments(source_code)
        
        source_code = convert_to_lowercase(source_code)
        
        source_code = remove_blank_lines(source_code)
        
        dataset[i] = source_code
        
    token_counts = count_token_appearances(dataset)
    ordered_token_counts = token_counts.most_common()
        
    return dataset, ordered_token_counts

In [71]:
data["Vulnerability"], ordered_token_counts = dataTokenization(data["Vulnerability"].tolist())
print(data["Vulnerability"])
print('\n', ordered_token_counts[0:10])

0                   f"str$id""str$id""str$id"         ...
1           client.listentcp()    proxy = proxy(proxy_...
2       from django.http import httpresponse, httpresp...
3       def write_preset(conn, queryin, descriptin):\t...
4                               update_query = self.up...
                              ...                        
4525                url(r"str$id", taskassets.as_view()),
4526            response = self.client.post(url)      ...
4527                       print("str$id" + str(self.kd))
4528                             "str$id""str$id""str$id"
4529        self.assertequal(        num%id, app.post(...
Name: Vulnerability, Length: 4530, dtype: object

 [('=', 5181), ('"str$id"', 1629), ('+', 942), ('%', 903), ('if', 886), ('"str$id",', 850), ('response', 847), ('return', 763), ('"str$id":', 604), ('in', 546)]


Data analysis

In [72]:
def getLengths(string_list):
    # Initialize an empty list to store lengths
    lengths = []

    # Iterate over each string in the list
    for string in string_list:
        # Split the string into tokens (words)
        tokens = string.split()
        # Calculate the length (number of tokens) of the string
        length = len(tokens)
        # Append the length to the lengths list
        lengths.append(length)

    return lengths

In [73]:
lengths = getLengths(data["Vulnerability"].tolist())
data["Length"] = lengths

print("avg of lengths = ", np.mean(lengths))
print("max of lengths = ", max(lengths))
print("min of lengths = ", min(lengths))

avg of lengths =  10.970860927152318
max of lengths =  1400
min of lengths =  0


In [74]:
# Sort data samples by length. Remove the samples with zero tokens. 
# Detect and remove outlier with very large length.
len_sorted = sorted(lengths)
print(len_sorted)


[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [75]:
data[data["Length"] == 0]

Unnamed: 0,Vulnerability,Category,Length
301,\t\t,sql_injection,0
654,\t\t,sql_injection,0
2476,\t\t,sql_injection,0
3532,\r,sql_injection,0
3725,\t\t,sql_injection,0
4068,\t\t,sql_injection,0


In [76]:
# Drop rows with Length equal to zero or equal to the maximum value
dataset = data[(data['Length'] != 0) & (data['Length'] != max(lengths))]

# Reset index
dataset = dataset.reset_index(drop=True)

print(dataset)

                                          Vulnerability         Category  \
0                 f"str$id""str$id""str$id"         ...    sql_injection   
1         client.listentcp()    proxy = proxy(proxy_...             xsrf   
2     from django.http import httpresponse, httpresp...    open_redirect   
3     def write_preset(conn, queryin, descriptin):\t...    sql_injection   
4                             update_query = self.up...    sql_injection   
...                                                 ...              ...   
4518              url(r"str$id", taskassets.as_view()),  path_disclosure   
4519          response = self.client.post(url)      ...             xsrf   
4520                     print("str$id" + str(self.kd))  path_disclosure   
4521                           "str$id""str$id""str$id"             xsrf   
4522      self.assertequal(        num%id, app.post(...             xsrf   

      Length  
0          9  
1          8  
2          9  
3        175  
4         14

Bag of Words (BoW)

In [155]:
vectorizer = CountVectorizer(min_df=20) 
X = vectorizer.fit_transform(dataset["Vulnerability"].tolist())
vocab = vectorizer.get_feature_names()
bow_data = pd.DataFrame(X.toarray(), columns=vocab)
full_data = pd.concat([bow_data, dataset.drop("Vulnerability", axis=1)], axis=1)
full_data.head()



Unnamed: 0,__init__,_cli_run,_run_ssh,add,and,andraise,api,app,append,args,...,value,values,version,volume_name,where,while,with,worker,x1,xsrf_client
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [157]:
word_counts = X.sum(axis=0)

# Create a dictionary mapping each word to its total count
word_count_dict = {word: count for word, count in zip(vocab, word_counts.A1)}

sorted_word_count_dict = {k: v for k, v in sorted(word_count_dict.items(), key=lambda item: item[1], reverse=True)}

# Print the dictionary
print(sorted_word_count_dict)

{'id': 12214, 'str': 10897, 'self': 3962, 'num': 1547, 'url': 1123, 'client': 1057, 'response': 933, 'if': 862, 'execute': 829, 'return': 740, 'get': 702, 'path': 589, 'def': 573, 'query': 567, 'in': 541, 'post': 502, 'import': 488, 'format': 465, 'sql': 434, 'none': 430, 'cursor': 420, 'request': 418, 'json': 385, 'content_type': 383, 'dumps': 365, 'true': 361, 'body': 358, 'for': 336, 'os': 336, 'data': 334, 'from': 331, 'db': 318, 'print': 291, 'name': 265, 'and': 248, 'not': 238, 'username': 232, 'false': 220, 'join': 217, 'password': 215, 'args': 202, 'value': 198, 'cur': 193, 'the': 190, 'to': 187, 'cmd': 184, 'else': 182, 'conn': 173, 'user': 173, 'append': 144, 'login': 143, 'ssh_cmd': 140, 'jc': 138, 'result': 127, '_cli_run': 125, 'len': 125, 'assertequal': 124, 'or': 124, 'is': 122, 'headers': 121, 'command': 117, 'key': 114, 'user_id': 109, 'lambda': 104, 'models': 104, 'asserttrue': 103, 'except': 102, 'split': 100, 'as': 98, 'session': 98, 'where': 98, 'mox': 96, 're': 92

In [162]:
print(len(vocab))

237


In [154]:
full_data.to_csv('bow_data.csv', index=False)

Store sequences of tokens format

In [160]:
dataset.to_csv('sequences_data.csv', index=False)