In [15]:
from lexer import lex
import h5py
import numpy as np
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


##### **Examination of Dataset**

In [16]:
# From https://stackoverflow.com/questions/43371438/how-to-inspect-h5-file-in-python
def scan_hdf5(path, recursive=True, tab_step=2):
    """Print the hierarchy of groups and datasets stored in an HDF5 file.

    Args:
        path: Location of the HDF5 file; it is opened read-only.
        recursive: When true, nested groups are descended into; when false,
            only the datasets directly under the root group are listed.
        tab_step: Number of extra spaces added per nesting level.
    """
    def visit(group, tabs=0):
        # Print the group's own name first, then everything it contains.
        indent = ' ' * tabs
        print(indent, group.name)
        for child in group.values():
            if isinstance(child, h5py.Dataset):
                print(indent + ' ' * tab_step + ' -', child.name)
                continue
            if recursive and isinstance(child, h5py.Group):
                visit(child, tabs=tabs + tab_step)

    with h5py.File(path, 'r') as f:
        visit(f)

In [4]:
# Print the group/dataset hierarchy of the VDISC train file.
scan_hdf5('VDISC_dataset/VDISC_train.hdf5')

 /
   - /CWE-119
   - /CWE-120
   - /CWE-469
   - /CWE-476
   - /CWE-other
   - /functionSource


##### **Lexing functionSource**

In [17]:
# HDF5 files of the VDISC dataset, listed in the order: test, train, validate.
ds_paths = [
    'VDISC_dataset/VDISC_' + split_name + '.hdf5'
    for split_name in ('test', 'train', 'validate')
]

In [18]:
# Read the 'functionSource' dataset of every file into memory, keyed by the
# file path, so the HDF5 handles can be closed before lexing starts.
raw_src_array = {}

for path in ds_paths:
    with h5py.File(path, 'r') as h5_file:
        # np.array(...) copies the dataset out of the file while it is open.
        raw_src_array[path] = np.array(h5_file['functionSource'])

In [19]:
# Run the lexer over every function source of every dataset file.
# The result maps each file path to a list holding one lexer result per
# function, in the same order as the raw sources.
lexed_array = {}

for ds_path, array in raw_src_array.items():
    # str(src, 'utf-8') decodes each raw entry before it is handed to lex()
    # (assumes the entries are bytes objects -- TODO confirm for the h5py
    # version in use).
    # tqdm shows one progress bar per file; lexing the larger files takes
    # several minutes, and the recorded cell output contains these bars.
    lexed_array[ds_path] = [lex(str(src, 'utf-8')) for src in tqdm(array)]

  0%|          | 0/127419 [00:00<?, ?it/s]

100%|██████████| 127419/127419 [00:20<00:00, 6124.80it/s]
 72%|███████▏  | 733842/1019471 [05:46<02:14, 2118.76it/s]  


In [20]:
# Sanity check: show the lexer output for the first function of the test file.
print(lexed_array['VDISC_dataset/VDISC_test.hdf5'][0])

['<|ident|>', '(', '<|ident|>', '*', '<|ident|>', ',', '<|ident|>', '*', '<|ident|>', ')', '{', 'char', '*', '<|ident|>', ';', 'int', '<|ident|>', ',', '<|ident|>', ',', '<|ident|>', ';', 'switch', '(', '<|ident|>', '->', '<|ident|>', ')', '{', 'case', '<|ident|>', ':', 'if', '(', '<|ident|>', '->', '<|ident|>', '.', '<|ident|>', '.', '<|ident|>', ')', '<|ident|>', '(', '<|ident|>', '->', '<|ident|>', '.', '<|ident|>', '.', '<|ident|>', ')', ';', 'if', '(', '<|ident|>', '->', '<|ident|>', '.', '<|ident|>', '.', '<|ident|>', ')', '<|ident|>', '(', '<|ident|>', '->', '<|ident|>', '.', '<|ident|>', '.', '<|ident|>', ')', ';', 'break', ';', 'case', '<|ident|>', ':', '<|ident|>', '(', '<|ident|>', '->', '<|ident|>', ',', '0', ',', '0', ',', '-', '1', ',', '-', '1', ',', '<|ident|>', ',', '<|ident|>', '->', '<|ident|>', '.', '<|ident|>', ',', '<|ident|>', '->', '<|ident|>', '.', '<|ident|>', ')', ';', '<|ident|>', '->', '<|ident|>', '->', '<|ident|>', '=', '<|ident|>', ';', '<|ident|>', '=',

##### **Export Lexed Arrays to Space-Separated Text Files**

In [14]:
import os

# Directory that receives one token file per dataset file.
LEXED_PATH = 'lexed_VDISC'
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(LEXED_PATH, exist_ok=True)

for ds_path, array in lexed_array.items():
    # 'VDISC_dataset/VDISC_test.hdf5' -> 'VDISC_test' (no extension is added).
    split_name = os.path.splitext(os.path.basename(ds_path))[0]
    filename = os.path.join(LEXED_PATH, split_name)
    # Explicit UTF-8 so tokens with non-ASCII characters do not depend on the
    # platform's default encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        for lexed_src in array:
            # One function per line; every token is followed by a single
            # space (including the last one), then a newline.
            f.write(''.join(str(lexed_token) + ' ' for lexed_token in lexed_src))
            f.write('\n')

##### **Export Labels to Text Files (One Label per Line)**

In [9]:
# Names of the per-CWE label datasets read from each HDF5 file; the list order
# is the order in which they are stacked when the labels are built.
categories = ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other']

In [54]:
# Collapse the per-CWE flag datasets of every file into a single integer
# label per function:
#   1..len(categories) -> 1-based position (in `categories`) of the first
#                         category flagged for that function
#   -1                 -> no category flagged
# NOTE: a function flagged for several categories keeps only the first one.
labels_array = {}
for path in ds_paths:
    with h5py.File(path, 'r') as f:
        # One row per category, one column per function
        # (assumes every label dataset is 1-D -- TODO confirm).
        truth_table = np.vstack([np.array(f[cat]) for cat in categories])

    # argmax returns row 0 for an all-False column, so columns without any
    # flag must be identified separately and mapped to -1.
    has_flag = truth_table.any(axis=0)
    labels = np.where(has_flag, np.argmax(truth_table, axis=0) + 1, -1)

    labels_array[path] = labels

In [56]:
# Directory that receives one label file per dataset file.
# (`os` is imported by the lexed-array export cell above.)
LABELS_PATH = 'labels_VDISC'
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(LABELS_PATH, exist_ok=True)

for ds_path, array in labels_array.items():
    # 'VDISC_dataset/VDISC_test.hdf5' -> 'VDISC_test' (no extension is added).
    split_name = os.path.splitext(os.path.basename(ds_path))[0]
    filename = os.path.join(LABELS_PATH, split_name)
    with open(filename, 'w') as f:
        # One label per line, in the same order as the functions.
        f.writelines(str(label) + '\n' for label in array)

##### **Generate Vector Embeddings for Future Feature Extraction**

**TODO:** This section has not been implemented yet.
