In [1]:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 28 17:16:19 2022

@author: iliaskaloup
"""

import tensorflow

import os, json, glob, time, sys, re
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import json
import math

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras.layers import LSTM, SimpleRNN, Bidirectional
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Masking
from tensorflow.keras.layers import Embedding, MaxPool1D
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras import optimizers
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.layers import Bidirectional, BatchNormalization
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.initializers import glorot_uniform, RandomUniform, lecun_uniform, Constant
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from collections import OrderedDict
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D, GlobalMaxPool1D
import tensorflow.keras.backend as K

# import gensim
# from gensim import downloader as api
# from gensim.models import Word2Vec

import io
from contextlib import redirect_stdout
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
#from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, \
roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.utils import shuffle
import time
import random

from transformers import GPT2Tokenizer, GPT2LMHeadModel, BertModel, BertTokenizer, AutoTokenizer, TFAutoModel, OpenAIGPTTokenizer, OpenAIGPTModel


In [2]:
# define seeder
seed = 123
np.random.seed(seed)
random.seed(seed)
tensorflow.random.set_seed(seed)

In [3]:
def readFiles(inputPath):
    suffix = ".py"        
    paths=[]
    for root, dirs, files in os.walk(inputPath):
        for file in files:
            if (file.endswith(suffix)):
                 paths.append(file)                 
    return paths

def createListOfTokens(codeFilename):
	"""
	Given a file name in the current working directory, read each line 
	and append it to a list.
	Return: the contents of the file as a list of strings
	"""
	codeLinesList = []

	with open(codeFilename, "r", encoding="utf8") as fin:
		for line in fin:
			codeLinesList.append(line)

	return codeLinesList

def listToString(s): 
    
    # initialize an empty string
    str1 = "" 
    
    # traverse in the string  
    for ele in s: 
        str1 += ele  
    
    # return string  
    return str1 

def stringToList(string):
    codeLinesList = []
    for line in string.splitlines():
        codeLinesList.append(line)
    return codeLinesList

def remove_comments(input_str):
    
    input_str = re.sub('#.*', '', input_str)
    input_str = re.sub('""".*"""', '', input_str)          
    input_str = re.sub("'''.*'''", '', input_str)    

    doc_reg_1 = r'("""|\'\'\')([\s\S]*?)(\1\s*)(?=class)'
    doc_reg_2 = r'(\s+def\s+.*:\s*)\n(\s*"""|\s*\'\'\')([\s\S]*?)(\2[^\n\S]*)'
    input_str = re.sub(doc_reg_1, '', input_str)
    input_str = re.sub(doc_reg_2, r'\1', input_str)
    
    return input_str

def dropHeaders(lines):
    linList = []
    for line in lines:
        if not re.search('import',line):
             if not re.search('from',line):
                  linList.append(line)
    return linList

def dropBlank(tokens0):
    tokens = []
    for i in range(0, len(tokens0)):
        temp = tokens0[i]
        if temp != '':
            tokens.append(temp)
    return tokens

def tokenizeLines(codeLinesList):
	"""
	Tokenize each file line, i.e. entry of code list, based on a specified
    regex.
    ***The regex used is not applicable to every case. One needs to configure
    it based on file inputs.

	Return: list containing code tokens
	"""
    #enalaktika codeLinesList.split() gia na meinoun ta tokens mazi me to punctuation
	codeTokens = []

	for line in codeLinesList:
		templineTokens = re.split('[\.,\[\];:(\s)?\\\\!\t{}"<>+=~*&^%/|\\-\']', line)
		codeTokens.extend(templineTokens)

	return codeTokens

def dropEmpty(tokens0):
    tokens = []
    for i in range(0, len(tokens0)):
        temp = tokens0[i]
        if temp != []:
            tokens.append(temp)
    return tokens

def makeSequences(paths, inputPath, folder, label):
    allTokens = []
    for i in range(0, len(paths)):
        path = paths[i]
        
        if label == 0:
            path2 = os.path.join(inputPath, folder, path)
        elif label == 1:
            path2 = os.path.join(inputPath, folder, path)
        #tokenize source code in a list of lines
        lines0 = createListOfTokens(path2)
        
        #convert source code from list of lines to string
        stringLines = listToString(lines0)
        
        stringLinesNoDigit = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", "<numId$>", stringLines) #replace numbers 
        
        stringLinesNoStr = re.sub(r'(["])(?:(?=(\\?))\2.)*?\1', "<strId$>", stringLinesNoDigit) #replace strings
        
        stringLinesNoChar = re.sub(r"(['])(?:(?=(\\?))\2.)*?\1", "<strId$>", stringLinesNoStr) #replace chars
        
        #remove comments from source code
        linesNoCom = remove_comments(stringLinesNoChar)
        
        #convert source code from string to list of lines
        lines = stringToList(linesNoCom)
        
        #remove headers
        lines = dropHeaders(lines)
        
        #tokenize lines to list of words
        tokens0 = tokenizeLines(lines)
        
        #remove blank lines
        tokens = dropBlank(tokens0)
        
        #add the filename info
        tokens.insert(0, path)
        #add the label info
        if (label == 1):
            tokens.insert(1, 1)
        else:
            tokens.insert(1, 0)
        
        #merged all files into one list
        allTokens.append(tokens)
        
    return allTokens

def writeSequences(inputPath, neutralPath, vulnPath):
    
    #read neutral
    neutralPath = os.path.join(inputPath, neutralPath)
    neutral = readFiles(neutralPath)

    # read vulnerable
    vulnPath = os.path.join(inputPath, vulnPath)
    vulns = readFiles(vulnPath)
    
    #parse all files
    print("Tokenization is starting...")
    
    neutralTokens = makeSequences(neutral, inputPath, neutralPath, 0)
    vulnTokens = makeSequences(vulns, inputPath, vulnPath, 1)
    
    print("Tokenization has been completed.")
    
    # merge neutral and vulns
    allTokens = neutralTokens + vulnTokens
    
    #save dataset to csv
    print("Save dataset to file...")
    
    with open("sequences.csv","w", encoding="utf8") as f:
        wr = csv.writer(f)
        wr.writerows(allTokens)
    
    print("End of Analysis.") 

    return allTokens 

def uniqueWords(data):
    allWords = []
    for i in range(len(data)):
        for j in range(0, len(data[i])):
            allWords.append(data[i][j])
    
    vc = pd.Series(allWords).value_counts()
    uniques=vc.index.values.tolist()        
    return allWords, uniques, vc

def getUniques(data):
    allWords = []
    for i in range(len(data)):
        for j in range(2, len(data[i])):
            allWords.append(data[i][j])
    
    vc = pd.Series(allWords).value_counts()
    uniques=vc.index.values.tolist()        
    return allWords, uniques, vc

def getLabels(data):
    values = []
    for i in range(len(data)):
        values.append(int(data[i][1]))
    serVal = pd.DataFrame(values)
    return serVal

def writeBow(data, filename):
    features = []
    for i in range(0, len(data)):
        line = data[i][2:]
        features.append(line)
        
    allWords, uniques, tokenFreq = uniqueWords(features)
    
    vocab = uniques #tokenFreq[tokenFreq > 1 ]
    
    labels = getLabels(data)
    
    freqs = []
    for i in range(0, len(features)):
        occ = []
        for j in range(0, len(vocab)):
            occ.append(features[i].count(vocab[j]))
        freqs.append(occ)
    
    dataset = pd.DataFrame.from_records(freqs, columns=vocab)# list of lists to dataframe
    dataset['label'] = labels
    dataset.to_csv(filename, index = None, header=True)

    return dataset

def labelFrequencies(data):
    values = []
    for i in range(len(data)):
        values.append(data[i][1])
    serVal = pd.DataFrame(values)
    return serVal

def getLengths(data):
    lens = []
    for i in range(len(data)):
        lens.append(len(data[i])-2)
    lens = pd.DataFrame(lens)
    lensFreq = lens[0].value_counts()
    lensFreq=pd.DataFrame(lensFreq)
    return lens, lensFreq
 
def getStd(len_sorted, med):
    res=0
    for i in range(len(len_sorted)):
        x = len_sorted.iloc[i,0]
        res = res + (x-med)*(x-med)
    res2 = res / len(len_sorted)
    res_final = math.sqrt(res2)
    print("stdev= ", res_final)
    return res_final
    
def average(lst):
    return sum(lst) / len(lst)

def removeSparse(data, tokens2):
    new_data = []
    for i in range(len(data)):
        new = []
        new.append(data[i][0])
        new.append(data[i][1])
        for j in range(2, len(data[i])):
            temp = data[i][j]
            if temp in tokens2:
                new.append(temp)
        new_data.append(new)
    return new_data
    
def analyzeData():
    
    inputPath = os.getcwd()
    neutralPath = "neutralPy"
    vulnPath = "vuln"
    
    
    ## data cleansing
    #data = writeSequences(inputPath, neutralPath, vulnPath)
    #print("bow")
    with open('sequences.csv', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        data = list(reader)
    data = dropEmpty(data)
    
    #bow = writeBow(data, 'BoW_dataset.csv')
    #bow = pd.read_csv('BoW_dataset.csv', sep =',')
    
    
    ## data analysis
    allWords, uniques, tokenFreq = uniqueWords(data)
    tokenFreq = pd.DataFrame(tokenFreq)
    l = tokenFreq.index.values.tolist()
    tokenFreq.insert(0,'token', l)
    tokenFreq.rename(columns = {0: 'times'}, inplace = True)
    tokenFreq = tokenFreq.reset_index(drop=True)
    avgToken = tokenFreq['times']
    print("mean of token's appearance = ", average(avgToken))
    print("max of token's appearance = ", max(tokenFreq['times']))
    print("min of token's appearance = ", min(tokenFreq['times']))
    
    '''freqs = tokenFreq['times']
    freqs = freqs.tolist()
    freqs = pd.DataFrame(freqs)
    medFreq = freqs.median()[0]
    print("median= ", medFreq)
    stdevFreq = freqs[0].std()
    print("standard deviation = ", stdevFreq)
    # finding the 1st and 3rd quartiles
    q1 = np.quantile(freqs[0], 0.25)
    q3 = np.quantile(freqs[0], 0.75)
    # finding the iqr region
    iqr = q3-q1
    # finding upper and lower whiskers
    upper_bound = q3+(1.5*iqr)
    lower_bound = q1-(1.5*iqr)
    print(iqr, upper_bound, lower_bound)
    outliers = freqs[0][(freqs[0] <= lower_bound) | (freqs[0] >= upper_bound)]
    #print('The following are the outliers in the boxplot:{}'.format(outliers))
    split_point = len(freqs) - len(outliers)
    print("Splitting point: ", split_point)
    max_len = freqs.iloc[split_point-1,:][0]
    print("max number of occurencies selected: ", max_len)'''
    tokens2 = tokenFreq[tokenFreq['times'] > 10 ]
    new_data = removeSparse(data, tokens2["token"].tolist())
    
    #check balance
    labels = labelFrequencies(new_data)
    labelFreq = labels[0].value_counts()
    print("labels balance:\n",labelFreq)
    
    # find the length of each class file
    lens, lensFreq = getLengths(new_data)
    l = lensFreq.index.values.tolist()
    
    lensFreq.insert(0,'lengths', l)
    lensFreq.rename(columns = {0: 'times'}, inplace = True)
    lensFreq = lensFreq.reset_index(drop=True)
    #print(lensFreq)
    avgLen = average(lens.iloc[:,0])# mesos oros twn sequence lengths
    print("avg of lengths = ",avgLen)
    print("max of lengths = ", max(lens.iloc[:,0]))
    print("min of lengths = ", min(lens.iloc[:,0]))
    
    len_sorted = lens.sort_values(by=[0])
    len_sorted = len_sorted.reset_index(drop=True)
    
    med = len_sorted.median()[0]
    print("median= ", med)
    
    plt.plot(len_sorted[0])
    plt.show()
    
    stdev = len_sorted[0].std()
    print("standard deviation = ", stdev)
    
    #manual computation of st. d.
    stdev2 = getStd(len_sorted, med)
    
    split_point = med + 2*stdev
    print("median + 2*st.d= ", split_point)
    
    plt.boxplot(len_sorted[0])
    plt.show()
    
    # finding the 1st and 3rd quartiles
    q1 = np.quantile(len_sorted[0], 0.25)
    q3 = np.quantile(len_sorted[0], 0.75)
    # finding the iqr region
    iqr = q3-q1
    # finding upper and lower whiskers
    upper_bound = q3+(1.5*iqr)
    lower_bound = q1-(1.5*iqr)
    print(iqr, upper_bound, lower_bound)
    outliers = len_sorted[0][(len_sorted[0] <= lower_bound) | (len_sorted[0] >= upper_bound)]
    #print('The following are the outliers in the boxplot:{}'.format(outliers))
    split_point = len(len_sorted) - len(outliers)
    print("Splitting point: ", split_point)
    max_len = len_sorted.iloc[split_point-1,:][0]
    print("max length selected: ", max_len)
    
    #reshape dataset
    data2 = []
    for i in range(len(new_data)):
        length = len(new_data[i]) - 2
        if length <= max_len :
            data2.append(new_data[i])
    
    #save reduced dataset
    print("Save dataset to file...")
    with open("data_reduced.csv","w", encoding='utf-8') as f:
        wr = csv.writer(f)
        wr.writerows(data2)
    print("End of Analysis.")
    
    
    #reduce dataset in BERT's max lenght = 512 tokesn max per sequence
    data3 = []
    for i in range(len(new_data)):
        length = len(new_data[i]) - 2
        if length <= 510 :  # max_len=512 (510 + SEP and CLS)
            data3.append(new_data[i])
    
    #save reduced dataset
    print("Save dataset to file...")
    with open("data_reduced_bert.csv","w", encoding='utf-8') as f:
        wr = csv.writer(f)
        wr.writerows(data3)
    print("End of Analysis.")
    
    # data analysis on bert reduced set
    #check balance
    labels = labelFrequencies(data3)
    labelFreq = labels[0].value_counts()
    print("labels balance:\n",labelFreq)
    
    allWords, uniques, tokenFreq = uniqueWords(data3)
    print("vocab: ", len(uniques))
    
    # bag of words from reduced dataset
    bow_reduced = writeBow(data3, 'BoW_dataset_reduced.csv')
    #bow_reduced = pd.read_csv('BoW_dataset_reduced.csv', sep =',')
    
    return data3, bow_reduced

def recall_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = (true_positives + K.epsilon()) / (possible_positives + K.epsilon())
        return recall

def precision_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = (true_positives + K.epsilon()) / (predicted_positives + K.epsilon())
        return precision

def f1_metric(y_true, y_pred):

    prec = precision_metric(y_true, y_pred)
    rec = recall_metric(y_true, y_pred)
    f1 = 2*((prec*rec)/(prec+rec+K.epsilon()))
    return f1

def f2_metric(y_true, y_pred):

    prec = precision_metric(y_true, y_pred)
    rec = recall_metric(y_true, y_pred)
    f2 = 5*((prec*rec)/(4*prec+rec+K.epsilon()))
    return f2

def f2_loss(y_true, y_pred):

    y_true = tensorflow.cast(y_true, tensorflow.float32)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    #tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f2 = 5*p*r / (4*p+r+K.epsilon())
    f2 = tensorflow.where(tensorflow.math.is_nan(f2), tensorflow.zeros_like(f2), f2)
    
    return 1 - K.mean(f2)


def buildBiLstm(max_len, top_words, dim, seed, embedding_matrix):
    model = Sequential()
    #model.add(Embedding(input_dim=top_words+1, output_dim=dim, input_length=None, mask_zero=True))
    model.add(Embedding(input_dim=top_words, output_dim=dim, input_length=None, weights=[embedding_matrix], mask_zero=True, trainable=False))
    #model.add(SimpleRNN(300, dropout=0.3, stateful=False))
    #model.add(LSTM(100, dropout=0.2, return_sequences=True, stateful=False))
    model.add(Bidirectional(LSTM(100, dropout=0.2, return_sequences=True, stateful=False)))
    model.add(Bidirectional(LSTM(100, dropout=0.2, stateful=False)))
    #model.add(LSTM(100, dropout=0.2, stateful=False))
    #model.add(Activation('relu')) #dropout=0.2, recurrent_dropout=0.2, kernel_constraint=max_norm(3), bias_constraint=max_norm(3)
    #model.add(BatchNormalization(momentum=0.0))
    model.add(Dense(1,activation='sigmoid'))
    #model.compile(loss=f2_loss, optimizer='adam', metrics=[f2_metric])
    model.compile(loss="binary_crossentropy", optimizer='adam', metrics=[f2_metric])
    return model

def buildLstm(max_len, top_words, dim, seed, embedding_matrix):
    model = Sequential()
    #model.add(Embedding(input_dim=top_words+1, output_dim=dim, input_length=None, mask_zero=True))
    model.add(Embedding(input_dim=top_words, output_dim=dim, input_length=None, weights=[embedding_matrix], mask_zero=True, trainable=False))
    #model.add(SimpleRNN(300, dropout=0.3, stateful=False))
    model.add(LSTM(100, dropout=0.2, return_sequences=True, stateful=False))
    #model.add(Bidirectional(LSTM(300, dropout=0.3, stateful=False)))
    model.add(LSTM(100, dropout=0.2, stateful=False))
    #model.add(Activation('relu')) #dropout=0.2, recurrent_dropout=0.2, kernel_constraint=max_norm(3), bias_constraint=max_norm(3)
    #model.add(BatchNormalization(momentum=0.0))
    model.add(Dense(1,activation='sigmoid'))
    #model.compile(loss=f2_loss, optimizer='adam', metrics=[f2_metric])
    model.compile(loss="binary_crossentropy", optimizer='adam', metrics=[f2_metric])
    return model

def buildCnn(max_len, top_words, dim, seed, embedding_matrix):
    cnn_model = Sequential()
    cnn_model.add(Embedding(top_words, dim, input_length=None, weights=[embedding_matrix], mask_zero=True, trainable=False))
    cnn_model.add(Conv1D(filters = 128, kernel_size = 5, activation = 'relu'))
    '''cnn_model.add(MaxPooling1D(pool_size = 5))
    cnn_model.add(Conv1D(filters = 128, kernel_size = 5, activation = 'relu'))
    cnn_model.add(MaxPooling1D(pool_size = 5))
    cnn_model.add(Conv1D(filters = 128, kernel_size = 5, activation = 'relu'))'''
    cnn_model.add(GlobalMaxPool1D())
    #cnn_model.add(Dense(units = 128, activation = 'relu'))
    cnn_model.add(Dense(units = 1, activation = 'sigmoid'))
    #cnn_model.compile(loss=f2_loss, optimizer='adam', metrics=[f2_metric])
    cnn_model.compile(loss="binary_crossentropy", optimizer = "adam") # "binary_crossentropy" # f2_loss
    return cnn_model


In [4]:
# Read pythonvp directory and construct train and test csv files.
def extract_tokens_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            content = file.read()
            tokens = content.split()  # Split content into words
            return ','.join(tokens)
        except:
            return ""

def create_csv(directory, output_csv):
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['File', 'Tokens'])

        for root, _, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(root, file)
                tokens = extract_tokens_from_file(file_path)
                csv_writer.writerow([file_path, tokens])

def add_label_to_csv(input_csv, label_value, output_csv):
    df = pd.read_csv(input_csv)
    df['Label'] = label_value
    df.to_csv(output_csv, index=False)

def concatenate_csvs(csv1, csv2, output_csv):
    df1 = pd.read_csv(csv1)
    df2 = pd.read_csv(csv2)
    merged_df = pd.concat([df1, df2], ignore_index=True)
    merged_df.to_csv(output_csv, index=False)

base_directory = 'pythonvp'
train_val_clean_csv = 'train_val_clean.csv'
train_val_vuln_csv = 'train_val_vuln.csv'
test_clean_csv = 'test_clean.csv'
test_vuln_csv = 'test_vuln.csv'
test_clean_labeled_csv = 'test_clean_labeled.csv'
test_vuln_labeled_csv = 'test_vuln_labeled.csv'
train_val_clean_labeled_csv = 'train_val_clean_labeled.csv'
train_val_vuln_labeled_csv = 'train_val_vuln_labeled.csv'
train_val_merged_csv = 'train_val_merged.csv'
test_merged_csv = 'test_merged.csv'

train_val_clean_dir = os.path.join(base_directory, 'train_val', 'clean')
train_val_vuln_dir = os.path.join(base_directory, 'train_val', 'vuln')
test_clean_dir = os.path.join(base_directory, 'test', 'clean')
test_vuln_dir = os.path.join(base_directory, 'test', 'vuln')

create_csv(train_val_clean_dir, train_val_clean_csv)
create_csv(train_val_vuln_dir, train_val_vuln_csv)
create_csv(test_clean_dir, test_clean_csv)
create_csv(test_vuln_dir, test_vuln_csv)

# Add label column with value 0 to test_clean_csv
add_label_to_csv(train_val_clean_csv, 0, train_val_clean_labeled_csv)

# Add label column with value 1 to test_vuln_csv
add_label_to_csv(train_val_vuln_csv, 1, train_val_vuln_labeled_csv)

# Add label column with value 0 to test_clean_csv
add_label_to_csv(test_clean_csv, 0, test_clean_labeled_csv)

# Add label column with value 1 to test_vuln_csv
add_label_to_csv(test_vuln_csv, 1, test_vuln_labeled_csv)

concatenate_csvs(train_val_clean_labeled_csv, train_val_vuln_labeled_csv, train_val_merged_csv)
concatenate_csvs(test_clean_labeled_csv, test_vuln_labeled_csv, test_merged_csv)


In [5]:
with open(train_val_merged_csv, newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    data = list(reader)
data = data[1:]

train_data = []
for i in range(0, len(data)):
    d = data[i]
    new_row = [d[0]]
    input_string = d[1]
    string_list = input_string.split(',')
    new_row.extend(string_list)
    new_row.append(d[2]) 
    train_data.append(new_row)



with open(test_merged_csv, newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    data = list(reader)
data = data[1:]

test_data = []
for i in range(0, len(data)):
    d = data[i]
    new_row = [d[0]]
    input_string = d[1]
    string_list = input_string.split(',')
    new_row.extend(string_list)
    new_row.append(d[2]) 
    test_data.append(new_row)
test_data

train_len = len(train_data)
train_data.extend(test_data)

all_data = []
for inner_list in train_data:
    last_element = inner_list.pop() 
    inner_list.insert(1, last_element)
    all_data.append(inner_list)

In [6]:
# deep learning
lines = []
for i in range(0, len(all_data)):
    line = all_data[i][2:]
    lows = [w.lower() for w in line]
    lines.append(lows)

lens, lensFreq = getLengths(lines)
max_len = max(lens[0]) + 2

labels = getLabels(all_data)
max_len

510

In [7]:
for i in range(0, len(all_data)):
    if all_data[i][1] == '1':
        print(all_data[i][0])

pythonvp\train_val\vuln\_0x0F-python-object_relational_mapping_1-filter_states_2190.py
pythonvp\train_val\vuln\_0x0F-python-object_relational_mapping_10-model_state_my_get_2056.py
pythonvp\train_val\vuln\_0x0F-python-object_relational_mapping_3-my_safe_filter_states_2042.py
pythonvp\train_val\vuln\_2016_13_solve_1112.py
pythonvp\train_val\vuln\_3-back-end_codes_tournament_tournament_2186.py
pythonvp\train_val\vuln\_acedu_hwmllib_618.py
pythonvp\train_val\vuln\_addons_point_of_sale_wizard_pos_close_statement_1582.py
pythonvp\train_val\vuln\_addons_point_of_sale_wizard_pos_close_statement_1586.py
pythonvp\train_val\vuln\_addons_point_of_sale_wizard_pos_close_statement_1590.py
pythonvp\train_val\vuln\_addons_point_of_sale_wizard_pos_close_statement_1594.py
pythonvp\train_val\vuln\_addons_point_of_sale_wizard_pos_close_statement_1600.py
pythonvp\train_val\vuln\_addons_point_of_sale_wizard_pos_open_statement_1584.py
pythonvp\train_val\vuln\_addons_point_of_sale_wizard_pos_open_statement_158

In [8]:
for i in range(0, len(all_data)):
    if all_data[i][1] == '0':
        print(all_data[i][0])

pythonvp\train_val\clean\0001_initial.py
pythonvp\train_val\clean\0001_not_initial.py
pythonvp\train_val\clean\0001_setup_extensions.py
pythonvp\train_val\clean\0002_a2.py
pythonvp\train_val\clean\0002_add-webhook.py
pythonvp\train_val\clean\0002_add_book.py
pythonvp\train_val\clean\0002_add_importedfile_model.py
pythonvp\train_val\clean\0002_alter_domain_unique.py
pythonvp\train_val\clean\0002_alter_id.py
pythonvp\train_val\clean\0002_auto_20170614_1741.py
pythonvp\train_val\clean\0002_build_command_initial.py
pythonvp\train_val\clean\0002_c2.py
pythonvp\train_val\clean\0002_delete_post.py
pythonvp\train_val\clean\0002_increase_max_length.py
pythonvp\train_val\clean\0002_initial.py
pythonvp\train_val\clean\0002_logentry_remove_auto_add.py
pythonvp\train_val\clean\0002_remove_content_type_name.py
pythonvp\train_val\clean\0002_remove_ipaddressfield_ip.py
pythonvp\train_val\clean\0002_rename_last_4_digits.py
pythonvp\train_val\clean\0002_settings_sync_assignees.py
pythonvp\train_val\clea

In [9]:
count = 0
for i in range(0, len(all_data)):
    if len(all_data[i]) > 120:
        count+=1
print(count)

2603


In [10]:
# def getVocab(sequences):
#     vocab = []
#     for i in range(len(sequences)):
#         for j in range(len(sequences[i][0])):
#             if sequences[i][0][j] not in vocab:
#                 vocab.append(sequences[i][0][j])
#     return vocab

In [11]:
# # w2v embeddings

# dim = 200

# fileEmb = 'w2v_embeddingsIters.txt'
# embeddings_index = {}
# f = open(os.path.join('', fileEmb), encoding="utf-8")
# for line in f:    
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:])
#     embeddings_index[word] = coefs   
# f.close()

# tokenizer_obj = Tokenizer()
# tokenizer_obj.fit_on_texts(lines)
# sequences = tokenizer_obj.texts_to_sequences(lines)
# tokenizer_json = tokenizer_obj.to_json()
# with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
#     f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    
# #pad sequences
# word_index = tokenizer_obj.word_index
# lines_pad = pad_sequences(sequences, padding = 'post', maxlen = max_len)
# sentiment = labels.iloc[:,0].values

# num_words = len(word_index) + 1
# embedding_matrix = np.zeros((num_words, dim))
# for word, i in word_index.items():
#     if i > num_words:
#         continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

In [12]:
# # gpt-based embeddings

# # Load the GPT-2 model and tokenizer
# model_name = "gpt2"  # 'gpt2' './model_logs_fromScratch', './model_logs/checkpoint-30000'
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)

# embedding_matrix = model.transformer.wte.weight
# num_words = embedding_matrix.size(0)
# dim = embedding_matrix.size(1)

# sentences = [' '.join(line) for line in lines] # Combine tokens in each inner list into complete sentences
# sequences = [tokenizer.encode(sente, add_special_tokens=False, truncation=True, max_length=1024, return_tensors="tf").numpy() for sente in sentences] # Tokenize the complete sentences

# sentiment = labels.iloc[:,0].values

# indexed_lines = []
# for seq in sequences:
#     indexed_lines.append(seq[0])
    
# #pad sequences
# lines_pad = pad_sequences(indexed_lines, padding = 'post', maxlen = max_len)


In [13]:
# # bert-based embeddings

# model_name = "roberta-base" # "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = TFAutoModel.from_pretrained(model_name)

# bert_embeddings = model.get_input_embeddings()
# embedding_matrix = bert_embeddings.weights[0].numpy()
# num_words = len(embedding_matrix)
# dim = len(embedding_matrix[0])

# sentences = [' '.join(line) for line in lines] # Combine tokens in each inner list into complete sentences
# sequences = [tokenizer.encode(sente, add_special_tokens=False, truncation=True, max_length=1024, return_tensors="tf").numpy() for sente in sentences] # Tokenize the complete sentences

# sentiment = labels.iloc[:,0].values

# indexed_lines = []
# for seq in sequences:
#     indexed_lines.append(seq[0])
    
# #pad sequences
# lines_pad = pad_sequences(indexed_lines, padding = 'post', maxlen = max_len)

In [14]:
# codebert-based embeddings

model_name = "microsoft/codebert-base" #"microsoft/graphcodebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
model = TFAutoModel.from_pretrained(model_name)

codebert_embeddings = model.get_input_embeddings()
embedding_matrix = codebert_embeddings.weights[0].numpy()
num_words = len(embedding_matrix)
dim = len(embedding_matrix[0])

sentences = [' '.join(line) for line in lines]
sequences = [tokenizer(sente, return_tensors="tf", truncation=True, add_special_tokens=False) for sente in sentences]

sentiment = labels.iloc[:,0].values

#lines_pad = pad_sequences(sequences, padding = 'post', maxlen = max_len)

def padSequences(sequences, max_len):
    lines_pad = []
    for sequence in sequences:
        seq = sequence['input_ids'].numpy()[0]
        if len(seq) < 512:
            for i in range(len(seq), 512):
                seq = np.append(seq, 0)
        lines_pad.append(seq)
    return lines_pad

lines_pad = padSequences(sequences, max_len)
lines_pad = [arr.tolist() for arr in lines_pad]
lines_pad = np.array(lines_pad)

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at microsoft/codebert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [15]:
train_x = lines_pad[0:train_len]
train_y = sentiment[0:train_len]

test_x = lines_pad[train_len:]
test_y = sentiment[train_len:]
    

avg_acc = []
avg_prec = []
avg_rec = []
avg_f1 = []
avg_f2 = []
num_loops = 5
for i in range(0, num_loops):
    print(i)
    #shuffle train data
    indices = np.arange(train_x.shape[0])
    np.random.shuffle(indices)
    train_x = train_x[indices]
    train_y = train_y[indices]

    #shuffle test data
    indices = np.arange(test_x.shape[0])
    np.random.shuffle(indices)
    test_x = test_x[indices]
    test_y = test_y[indices]
    
    #percentage train data
    pososto = 1 # 0.125  0.0625
    train_x = train_x[0:int(pososto*train_len)]
    train_y = train_y[0:int(pososto*train_len)]

    nb_epoch = 100
    BS = 64

    #myModel = buildCnn(max_len, num_words, dim, seed, embedding_matrix.detach().numpy()) # buildLstm # buildCnn # buildBiLstm
    myModel = buildCnn(max_len, num_words, dim, seed, embedding_matrix)
    print("model summary\m",myModel.summary())
    csv_logger = CSVLogger('log.csv', append=True, separator=',')
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
    mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)
    milli_sec1 = int(round(time.time() * 1000))
    history = myModel.fit(train_x, train_y, validation_data=(test_x, test_y), epochs = nb_epoch, batch_size = BS, shuffle=False, verbose=1, callbacks=[csv_logger,es,mc])
    milli_sec2 = int(round(time.time() * 1000))
    print("Training is completed after", milli_sec2-milli_sec1)

    myModel.load_weights("best_model.h5")
    predictions = (myModel.predict(test_x) > 0.5).astype("int32")
    predScores = myModel.predict(test_x)

    cm = confusion_matrix(test_y, predictions, labels=[0, 1])
    tn, fp, fn, tp = confusion_matrix(test_y, predictions).ravel()
    (tn, fp, fn, tp)

    acc=(tp+tn)/(tp+tn+fp+fn)
    prec=tp/(tp+fp)
    rec=tp/(tp+fn)
    f1=2*prec*rec / (prec+rec)
    f2=5*prec*rec / (4*prec+rec)
    print("Accuracy: ", acc)
    print("Precision: ", prec)
    print("Recall: ", rec)
    print("F1-score: ", f1)
    print("F2-score: ", f2)
    
    avg_acc.append(acc)
    avg_prec.append(prec)
    avg_rec.append(rec)
    avg_f1.append(f1)
    avg_f2.append(f2)

print("\n")
print("AVG Accuracy: ", sum(avg_acc)/5)
print("AVG Precision: ", sum(avg_prec)/5)
print("AVG Recall: ", sum(avg_rec)/5)
print("AVG F1-score: ", sum(avg_f1)/5)
print("AVG F2-score: ", sum(avg_f2)/5)


0
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 768)         38603520  
                                                                 
 conv1d (Conv1D)             (None, None, 128)         491648    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 39,095,297
Trainable params: 491,777
Non-trainable params: 38,603,520
_________________________________________________________________
model summary\m None
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.26431, saving model to best_

Epoch 7/100
Epoch 7: val_loss did not improve from 0.16158
Epoch 8/100
Epoch 8: val_loss did not improve from 0.16158
Epoch 9/100
Epoch 9: val_loss improved from 0.16158 to 0.16098, saving model to best_model.h5
Epoch 10/100
Epoch 10: val_loss did not improve from 0.16098
Epoch 11/100
Epoch 11: val_loss did not improve from 0.16098
Epoch 12/100
Epoch 12: val_loss did not improve from 0.16098
Epoch 13/100
Epoch 13: val_loss did not improve from 0.16098
Epoch 14/100
Epoch 14: val_loss did not improve from 0.16098
Epoch 15/100
Epoch 15: val_loss did not improve from 0.16098
Epoch 16/100
Epoch 16: val_loss did not improve from 0.16098
Epoch 17/100
Epoch 17: val_loss did not improve from 0.16098
Epoch 18/100
Epoch 18: val_loss did not improve from 0.16098
Epoch 19/100
Epoch 19: val_loss did not improve from 0.16098
Epoch 19: early stopping
Training is completed after 21128
Accuracy:  0.928343949044586
Precision:  0.847682119205298
Recall:  0.8533333333333334
F1-score:  0.8504983388704319
F2

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 768)         38603520  
                                                                 
 conv1d_3 (Conv1D)           (None, None, 128)         491648    
                                                                 
 global_max_pooling1d_3 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 39,095,297
Trainable params: 491,777
Non-trainable params: 38,603,520
_________________________________________________________________
model summary\m None
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.26455, saving model to best_

Epoch 8/100
Epoch 8: val_loss improved from 0.17180 to 0.16492, saving model to best_model.h5
Epoch 9/100
Epoch 9: val_loss did not improve from 0.16492
Epoch 10/100
Epoch 10: val_loss did not improve from 0.16492
Epoch 11/100
Epoch 11: val_loss did not improve from 0.16492
Epoch 12/100
Epoch 12: val_loss did not improve from 0.16492
Epoch 13/100
Epoch 13: val_loss did not improve from 0.16492
Epoch 14/100
Epoch 14: val_loss did not improve from 0.16492
Epoch 15/100
Epoch 15: val_loss did not improve from 0.16492
Epoch 16/100
Epoch 16: val_loss did not improve from 0.16492
Epoch 17/100
Epoch 17: val_loss did not improve from 0.16492
Epoch 18/100
Epoch 18: val_loss did not improve from 0.16492
Epoch 18: early stopping
Training is completed after 20309
Accuracy:  0.9267515923566879
Precision:  0.8823529411764706
Recall:  0.8
F1-score:  0.8391608391608392
F2-score:  0.815217391304348


AVG Accuracy:  0.9286624203821656
AVG Precision:  0.8677357132447945
AVG Recall:  0.828
AVG F1-score:  0

In [16]:
# #shuffle data
# indices = np.arange(lines_pad.shape[0])
# lines_pad = lines_pad[indices]
# sentiment = sentiment[indices]

# y = pd.DataFrame(sentiment)
# X = pd.DataFrame(lines_pad)

# ############## cross validation
# scores=['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'f2', 'fpr']
# values = [np.array([]) for i in range(0, len(scores))]
# score_dict = OrderedDict(zip(scores, values))
# k=10
# f=0
# kfold = StratifiedKFold(n_splits=k,shuffle=True,random_state=seed)

# nb_epoch = 100
# BS = 64
# print("Training...")
# milli_sec1 = int(round(time.time() * 1000))

# for train_index, test_index in kfold.split(X, y):
#     f = f + 1
#     print('fold number= ',f)
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     Y_train, Y_test = y.iloc[train_index], y.iloc[test_index]
#     Y_train = np.array(Y_train)
#     Y_train = Y_train.ravel()
#     Y_test = np.array(Y_test)
#     Y_test = Y_test.ravel()
    
#     '''#sampling
#     X_res, Y_res = RandomOverSampler(random_state=seed, sampling_strategy=0.5).fit_resample(X_train, Y_train)
#     #X_res, Y_res = RandomUnderSampler(random_state=seed, sampling_strategy=0.5).fit_resample(X_train, Y_train)

#     #shuffle dataset
#     X_resampled=pd.DataFrame(X_res)
#     Y_resampled=pd.DataFrame(Y_res)
#     newTrain=X_resampled.assign(Label=Y_resampled.values)
#     newTrain = shuffle(newTrain,random_state=seed)
#     X_train=np.array(newTrain.iloc[:, 0:-1 ])
#     X_train=pd.DataFrame(X_train)
#     Y_train=np.array(newTrain.iloc[:, -1 ])
#     Y_train=pd.DataFrame(Y_train)'''
    
#     myModel = buildCnn(max_len, num_words, dim, seed, embedding_matrix) # buildLstm
#     print("model summary\m",myModel.summary())
#     csv_logger = CSVLogger('log.csv', append=True, separator=',')
#     es = EarlyStopping(monitor='val_f2_metric', mode='max', verbose=1, patience=10)
#     mc = ModelCheckpoint('best_model.h5', monitor='val_f2_metric', mode='max', verbose=1, save_best_only=True)
#     history = myModel.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs = nb_epoch, batch_size = BS, shuffle=False, verbose=1, callbacks=[csv_logger,es,mc])
    
#     #load best model
#     #model = load_model('best_model.h5')
#     myModel.load_weights("best_model.h5")
    
#     scores = myModel.evaluate(X_test, Y_test, verbose=0)
#     #predictions = myModel.predict_classes(X_test, verbose=0)
#     predictions = (myModel.predict(X_test) > 0.5).astype("int32")
#     predScores = myModel.predict(X_test)
#     accuracy=accuracy_score(Y_test, predictions)
#     precision=precision_score(Y_test, predictions)
#     recall=recall_score(Y_test, predictions)
#     f1=f1_score(Y_test, predictions)
#     roc_auc=roc_auc_score(Y_test, predictions)
#     f2=5*precision*recall / (4*precision+recall)
#     print(confusion_matrix(Y_test, predictions, labels=[0, 1]))
#     tn, fp, fn, tp = confusion_matrix(Y_test, predictions).ravel()
#     fpr = fp / (fp+tn)
#     acc = ((tp+tn)/(tp+tn+fp+fn))
#     print("Accuracy:%.2f%%"%(acc*100))
#     print("Precision:%.2f%%"%(precision*100))
#     print("Recall:%.2f%%"%(recall*100))
#     print("F1 score:%.2f%%"%(f1*100))
#     print("Roc_Auc score:%.2f%%"%(roc_auc*100))
#     print("F2 score:%.2f%%"%(f2*100))
#     print("FPR score:%.2f%%"%(fpr*100))
#     print(classification_report(Y_test, predictions))
#     del myModel
#     score_dict['accuracy'] = np.append(score_dict['accuracy'], accuracy)
#     score_dict['precision'] = np.append(score_dict['precision'], precision)
#     score_dict['recall'] = np.append(score_dict['recall'], recall)
#     score_dict['f1'] = np.append(score_dict['f1'], f1)
#     score_dict['roc_auc'] = np.append(score_dict['roc_auc'], roc_auc)
#     score_dict['f2'] = np.append(score_dict['f2'], f2)
#     score_dict['fpr'] = np.append(score_dict['fpr'], fpr)
    
# milli_sec2 = int(round(time.time() * 1000))
# print("Training is completed after", milli_sec2-milli_sec1)

# print("accuracy: %.2f%% (%.2f%%)" % (score_dict['accuracy'].mean()*100, score_dict['accuracy'].std()*100))
# print("precision: %.2f%% (%.2f%%)" % (score_dict['precision'].mean()*100, score_dict['precision'].std()*100))
# print("recall: %.2f%% (%.2f%%)" % (score_dict['recall'].mean()*100, score_dict['recall'].std()*100))
# print("f1: %.2f%% (%.2f%%)" % (score_dict['f1'].mean()*100, score_dict['f1'].std()*100))
# print("roc_auc: %.2f%% (%.2f%%)" % (score_dict['roc_auc'].mean()*100, score_dict['roc_auc'].std()*100))
# print("f2: %.2f%% (%.2f%%)" % (score_dict['f2'].mean()*100, score_dict['f2'].std()*100))
# print("fpr: %.2f%% (%.2f%%)" % (score_dict['fpr'].mean()*100, score_dict['fpr'].std()*100))