# Process the Joern-extracted functions from the VulDeePecker dataset

In [111]:
import pandas as pd
import numpy as np
import os
import re
from lxml import etree
from clang import cindex
from sklearn.model_selection import train_test_split

### Prepare metadata from SARD manifest

In [4]:
# Parallel lists with one entry per <file> element found under a <testcase>
# in the SARD manifest.
xml_file = []     # value of the element's `path` attribute
xml_label = []    # 1 when the element carries a usable flaw line, else 0
xml_vulline = []  # first flaw line number, or 0 when there is none

with open("VulDeePecker-master-joern/manifest.xml", 'rb') as f:
    tree = etree.fromstring(f.read())

for doc in tree.xpath('//testcase/file'):
    mypath = doc.xpath('@path')[0]
    try:
        # Only the first flaw/@line of a file is kept.
        vul = int(doc.xpath('flaw/@line')[0])
    except (IndexError, ValueError):
        # IndexError: the element has no flaw/@line.
        # ValueError: the attribute is not an integer.
        # Either way the file is recorded as not flawed.
        flawed, vul = 0, 0
    else:
        flawed = 1
    # Append to all three lists in one place so they can never get out of step.
    xml_file.append(mypath)
    xml_label.append(flawed)
    xml_vulline.append(vul)

### Processing and sorting

In [5]:
# Matches C/C++ block comments (/* ... */) and line comments (// ...).
pat = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')

# Per-function accumulators filled by the processing loop; index i of every
# list describes the same extracted function.
functions = []     # function source with comments stripped
fname = []         # path of the extracted function file
oriFile = []       # original source file path taken from metadata.csv
startEndLine = []  # "start,end" line span as a string
cwe_list = []      # CWE directory the function was found under
label = []         # 1 = vulnerable, 0 = not vulnerable
vul_line = []      # vulnerable line number, 0 when not vulnerable

def get_vul_func(diff):
    """Collect function names and line numbers from the hunk headers of a diff.

    Every line of *diff* that contains ``@@`` is treated as a hunk header.
    The text after the last ``@@`` is cut at the first ``(`` and its last
    space-separated word is taken as the function name. The first run of
    digits in the text just before the last ``@@`` is taken as the line
    number and is kept as a string.

    Returns a ``(names, lines)`` tuple of two lists of equal length.
    """
    names = []
    line_numbers = []
    for entry in diff:
        if '@@' not in entry:
            continue
        pieces = entry.split('@@')
        before_paren = pieces[-1].split('(')[0]
        digit_runs = [m for m in re.findall("([1-90]*)", pieces[-2]) if m.strip()]
        names.append(before_paren.split(' ')[-1])
        line_numbers.append(digit_runs[0])
    return (names, line_numbers)

# Walk every test-case directory of both CWE categories, read each extracted
# function listed in its metadata.csv, strip comments, and label the function
# as vulnerable (1) or not (0).
#
# Labelling rules:
#   * If the function's original file appears in the SARD manifest, the
#     function is vulnerable when the manifest's flaw line lies inside the
#     function's [lineStart, lineEnd] span.
#   * Otherwise, for CVE test cases, the function is vulnerable when its name
#     appears in a hunk header of the accompanying diff; the recorded line is
#     the line number parsed from that hunk header.
#   * Everything else is labelled 0 with vulnerable line 0.
startPath = 'VulDeePecker-master-joern'

# Index of the first manifest entry for every path. This replaces a linear
# `in` test plus `list.index` per function with one dictionary lookup.
xml_index = {}
for manifest_idx, manifest_path in enumerate(xml_file):
    xml_index.setdefault(manifest_path, manifest_idx)

filecount = 1
total = 0
header_file_vulns = []  # test cases that contain no .c / .cpp file
for CWE in ['CWE-399', 'CWE-119']:
    source_root = startPath + '/' + CWE + '/source_files'
    case_dirs = os.listdir(source_root)
    total += len(case_dirs)
    for vuls in case_dirs:
        print('%d/%d' % (filecount, total), end='\r')
        # Count the directory up front so skipped test cases (the `continue`
        # statements below) still advance the progress display.
        filecount += 1

        general_path = source_root + '/' + vuls

        extensions = [entry.split('.')[-1] for entry in os.listdir(general_path)]
        if 'c' not in extensions and 'cpp' not in extensions:
            header_file_vulns.append(general_path)
            continue

        try:
            metadata = pd.read_csv(general_path + '/metadata.csv',
                                   names=['filePath', 'oriPath', 'lineStart', 'lineEnd'])
        except FileNotFoundError:
            print('No metadata file: ', general_path)
            continue

        is_cve = 'CVE' in vuls
        vul_func_list, vul_func_line_list = [], []
        if is_cve:
            # Close the diff file deterministically instead of leaking the handle.
            with open(general_path + '/' + vuls + '.txt', 'r') as diff_fh:
                diff = diff_fh.readlines()
            vul_func_list, vul_func_line_list = get_vul_func(diff)

        for fp, op, ls, le in zip(metadata['filePath'], metadata['oriPath'],
                                  metadata['lineStart'], metadata['lineEnd']):
            filename = general_path + '/' + fp.split('/')[-1]
            with open(filename, 'r', encoding="utf8", errors='ignore') as fh:
                code = fh.read()
            code = re.sub(pat, '', code)

            func_label, func_vul_line = 0, 0
            file_idx = xml_index.get(op)
            if file_idx is not None:
                flaw_line = xml_vulline[file_idx]
                if ls <= flaw_line <= le:
                    func_label, func_vul_line = 1, flaw_line
            elif is_cve:
                # The function name is the last word before the first '('.
                func_name = code.split('(')[0].split(' ')[-1].strip('\n')
                if func_name in vul_func_list:
                    func_label = 1
                    # get_vul_func yields digit strings; store an int so the
                    # vulLine column has a single type.
                    func_vul_line = int(
                        vul_func_line_list[vul_func_list.index(func_name)])

            # Append to every result list together so they stay aligned.
            functions.append(code)
            fname.append(filename)
            cwe_list.append(CWE)
            startEndLine.append('%d,%d' % (ls, le))
            oriFile.append(op)
            label.append(func_label)
            vul_line.append(func_vul_line)
    

No metadata file:  VulDeePecker-master-joern/CWE-119/source_files/CVE-2009-4376
No metadata file:  VulDeePecker-master-joern/CWE-119/source_files/CVE-2010-4538
No metadata file:  VulDeePecker-master-joern/CWE-119/source_files/CVE-2014-1509
10629/10691

In [None]:
# One row per extracted function; each column comes from the parallel list
# of the same role built by the processing loop.
columns = {
    'functionSource': functions,
    'fName': fname,
    'oriFile': oriFile,
    'startEndLine': startEndLine,
    'label': label,
    'vulLine': vul_line,
    'cwe': cwe_list,
}
mydf = pd.DataFrame(columns)

### Parse using Clang

In [105]:
# Shared library the clang Python bindings should load.
LIBCLANG_PATH = '/usr/lib/llvm-10/lib/libclang-10.so.1'
cindex.Config.set_library_file(LIBCLANG_PATH)

In [106]:
# Single clang index, used by clang_tokenizer for every parse.
cidx = cindex.Index.create()

In [107]:
def clang_tokenizer(code):
    """Return the clang token spellings of *code* as a list of strings.

    Block and line comments are removed with a regular expression first.
    The remaining text is handed to clang through ``unsaved_files`` under
    the name ``tmp.cpp`` and parsed with the module-level index ``cidx``.
    """
    # Strip /* ... */ and // ... comments before tokenizing.
    comment_re = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')
    stripped = comment_re.sub('', code)

    # Tokenize with clang over the extent of the whole translation unit.
    unit = cidx.parse(
        'tmp.cpp',
        args=[''],
        unsaved_files=[('tmp.cpp', stripped)],
        options=0,
    )
    whole_file = unit.cursor.extent
    return [token.spelling for token in unit.get_tokens(extent=whole_file)]

In [108]:
# Replace each function's source text with its list of clang tokens.
mydf['functionSource'] = mydf['functionSource'].apply(clang_tokenizer)

### Split into (train/val/test) and save to file

In [117]:
# 70% of the rows go to train; the remaining 30% is halved into val and test.
# Both splits use the same fixed seed.
train, held_out = train_test_split(mydf, train_size=0.7, random_state=1)
val, test = train_test_split(held_out, train_size=0.5, random_state=1)

In [None]:
# Write each split to its own pickle file.
split_outputs = (
    (train, 'vuldeepecker_func_train_tokkenized.pkl'),
    (val, 'vuldeepecker_func_val_tokkenized.pkl'),
    (test, 'vuldeepecker_func_test_tokkenized.pkl'),
)
for split_frame, split_path in split_outputs:
    split_frame.to_pickle(split_path)