In [1]:
import csv, os
from gensim.models import Word2Vec
from gensim.models import FastText
import pandas as pd

import nltk

In [2]:
# Fixed seed for the row shuffle of the training data (passed as `random_state`),
# so the shuffled order is the same on every run.
seed = 123456

In [3]:
# Hyperparameters of the embedding training run (all forwarded to embVectors).

# Embedding dimensionality; handed to gensim as `vector_size`.
dim = 100
# Embedding model to train: "w2v" selects Word2Vec, "ft" selects FastText.
method = "w2v" # alternative: "ft"
# Forwarded unchanged as the gensim model's `min_count` argument.
min_count = 10
# Number of training epochs.
epochs = 10

In [4]:
# Read the corpus used to train the embeddings.
# root_path points three directory levels above the current working directory.
root_path = os.path.join('..', '..', '..')

dataset = pd.read_csv(os.path.join(root_path, 'data', 'train.csv'))

# Shuffle every row reproducibly (fixed seed) and renumber the index from 0.
data = dataset.sample(frac=1, random_state=seed).reset_index(drop=True)
print(data.head())
print(len(data))

    index Access Gained Attack Origin Authentication Required Availability  \
0   22328           NaN         Local            Not required     Complete   
1   39415           NaN         Local            Not required          NaN   
2   60861           NaN        Remote           Single system          NaN   
3   84364           NaN         Local            Not required      Partial   
4  177580           NaN        Remote            Not required     Complete   

           CVE ID                                        CVE Page   CWE ID  \
0   CVE-2011-4621   https://www.cvedetails.com/cve/CVE-2011-4621/      NaN   
1   CVE-2014-1738   https://www.cvedetails.com/cve/CVE-2014-1738/  CWE-264   
2  CVE-2017-14604  https://www.cvedetails.com/cve/CVE-2017-14604/   CWE-20   
3   CVE-2018-6560   https://www.cvedetails.com/cve/CVE-2018-6560/  CWE-436   
4   CVE-2016-1621   https://www.cvedetails.com/cve/CVE-2016-1621/  CWE-119   

  Complexity Confidentiality  ... parentID  \
0        Low    

In [5]:
# Keep only the column holding the function source code; every other column is
# dropped from `data` from here on.
data = data[["processed_func"]]
data.head()

Unnamed: 0,processed_func
0,void account_system_time(struct task_struct *p...
1,static void redo_fd_request(void)\n{\n\tint dr...
2,cancel_filesystem_info_for_file (NautilusDirec...
3,"buffer_write (ProxySide *side,\n ..."
4,virtual ~Trans16x16DCT() {}\n


In [6]:
# Discard rows without function text: the following cells call string operations
# (split, concatenation) on every remaining value.
data = data.dropna(subset=["processed_func"])

In [7]:
# Size of each function measured in whitespace-separated words, and the largest
# such size across the corpus.
word_counts = data["processed_func"].apply(lambda x: len(x.split()))
max_length = word_counts.max()
print("Maximum number of words:", max_length)

Maximum number of words: 15441


In [8]:
# Copy the function text into a one-column frame whose column is named 'Text'.
train_data = pd.DataFrame(({'Text': data['processed_func']}))
# Disabled debugging aid (would cut `data` down to its first 100 rows):
#data = data[0:100]
train_data.head()

Unnamed: 0,Text
0,void account_system_time(struct task_struct *p...
1,static void redo_fd_request(void)\n{\n\tint dr...
2,cancel_filesystem_info_for_file (NautilusDirec...
3,"buffer_write (ProxySide *side,\n ..."
4,virtual ~Trans16x16DCT() {}\n


In [9]:
# Plain Python list with one function string per row of train_data.
text = train_data["Text"].values.tolist()

In [10]:
# Dump all functions into a single UTF-8 text file, appending a newline after
# each one (the functions themselves may span several lines).
file_path = os.path.join(root_path, 'data', 'tokenizer_train_data.txt')

with open(file_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(function + "\n" for function in text)

In [11]:
# Load the dumped corpus back into memory as one single string.
with open(os.path.join(root_path, 'data', 'tokenizer_train_data.txt'), 'r', encoding='utf-8') as file:
    corpus = file.read() # disabled post-processing: .lower().replace('\n', ' ')

In [12]:
# Cut the corpus into fragments at every "}" that is immediately followed by a
# newline. str.split drops the "}\n" separator itself, so the fragments no longer
# contain that closing brace.
# NOTE(review): a nested block ending in "}\n" also triggers a cut, so a fragment
# is not necessarily a whole function -- confirm this is intended.
corpusList = corpus.split("}\n")

In [13]:
def dropBlank(tokens0):
    """Return a new list with every empty-string element of ``tokens0`` removed.

    The relative order of the remaining elements is kept and ``tokens0`` itself
    is not modified.
    """
    return [token for token in tokens0 if token != '']

def dropEmpty(tokens0):
    """Return a new list with every element equal to ``[]`` removed.

    The relative order of the remaining elements is kept and ``tokens0`` itself
    is not modified.
    """
    return [entry for entry in tokens0 if entry != []]

In [14]:
def stringToList(string):
    """Split ``string`` at its line boundaries and return the lines as a list.

    The line terminators are not part of the returned items.
    """
    return list(string.splitlines())

def tokenizeLines(codeLinesList):
    """Tokenize each line with ``nltk.word_tokenize`` and return one flat token list.

    Tokens are emitted in line order; the boundaries between lines are not
    recorded in the result.
    """
    return [
        token
        for codeLine in codeLinesList
        for token in nltk.word_tokenize(codeLine)
    ]

In [15]:
def dataTokenization(corpus):
    """Turn every source string in ``corpus`` into a list of lower-cased tokens.

    Each entry is split into lines (``stringToList``), tokenized line by line
    (``tokenizeLines``), stripped of empty-string tokens (``dropBlank``) and
    lower-cased. The result holds one token list per entry, in input order;
    an entry that yields no tokens contributes an empty list.
    """
    allTokens = []
    for sourceText in corpus:
        # string -> lines -> flat list of raw tokens
        rawTokens = tokenizeLines(stringToList(sourceText))
        # discard empty tokens, then normalise the case
        allTokens.append([token.lower() for token in dropBlank(rawTokens)])
    return allTokens

In [16]:
def embVectors(dim, epochs, min_count, method, corpusList):
    """Train word embeddings on ``corpusList`` and save them in word2vec text format.

    Parameters
    ----------
    dim : int
        Embedding dimensionality, passed to the model as ``vector_size``.
    epochs : int
        Number of training epochs.
    min_count : int
        Passed to the model as ``min_count``.
    method : str
        ``"w2v"`` trains a gensim ``Word2Vec`` model, ``"ft"`` a gensim
        ``FastText`` model.
    corpusList : list of str
        Raw text fragments; they are tokenized with ``dataTokenization`` and
        fragments that produce no tokens are discarded.

    Returns
    -------
    str
        Name of the written embeddings file, ``method + '_embeddings.txt'``
        (a relative path, so it lands in the current working directory).

    Raises
    ------
    ValueError
        If ``method`` is neither ``"w2v"`` nor ``"ft"``.
    """
    # Reject an unsupported method before the (expensive) tokenization; without
    # this check neither branch runs and the function fails on an unbound local.
    if method not in ("w2v", "ft"):
        raise ValueError(
            "Unknown embedding method %r; expected 'w2v' or 'ft'" % (method,)
        )

    sentences = dropEmpty(dataTokenization(corpusList))

    if method == "w2v":
        # Passing the corpus to the constructor builds the vocabulary and trains.
        model = Word2Vec(sentences, vector_size=dim, workers=4, epochs=epochs, min_count=min_count) #, window=20
    else:
        # FastText is set up explicitly: construct, build the vocabulary, train.
        model = FastText(vector_size=dim, min_count=min_count)
        model.build_vocab(corpus_iterable=sentences)
        model.train(corpus_iterable=sentences, total_examples=len(sentences), epochs=epochs)

    fileEmb = method + '_embeddings.txt'
    model.wv.save_word2vec_format(fileEmb, binary=False)

    return fileEmb

In [17]:
# Train with the hyperparameters configured above; `fileEmb` receives the name of
# the embeddings file that embVectors wrote.
fileEmb = embVectors(dim, epochs, min_count, method, corpusList)