In [10]:
from lexer import lex
import h5py
import numpy as np
from sklearn.preprocessing import LabelEncoder

##### **Examination of Dataset**

In [3]:
scan_hdf5('VDISC_dataset/VDISC_train.hdf5')

 /
   - /CWE-119
   - /CWE-120
   - /CWE-469
   - /CWE-476
   - /CWE-other
   - /functionSource


##### **Lexing functionSource**

In [4]:
ds_paths = [
    'VDISC_dataset/VDISC_test.hdf5',
    'VDISC_dataset/VDISC_train.hdf5',
    'VDISC_dataset/VDISC_validate.hdf5'
]

In [5]:
raw_src_array = {}

for path in ds_paths:
   with h5py.File(path, 'r') as f:
      data = f['functionSource']
      raw_src_array[path] = np.array(data)

In [11]:
lexed_array = {}
le = LabelEncoder()

for ds_path, array in raw_src_array.items():
    ar = []
    for src in array:
        lstr = lex(str(src, 'utf-8'))
        le.fit(lstr)
        ar.append(le.transform(lstr))
    
    lexed_array[ds_path] = ar

In [12]:
print(lexed_array['VDISC_dataset/VDISC_test.hdf5'][0])

[19  1 19  3 19  7 19  3 19  2 31 24  3 19 17 28 19  7 19  7 19 17 29  1
 19  9 19  2 31 23 19 16 27  1 19  9 19 10 19 10 19  2 19  1 19  9 19 10
 19 10 19  2 17 27  1 19  9 19 10 19 10 19  2 19  1 19  9 19 10 19 10 19
  2 17 22 17 23 19 16 19  1 19  9 19  7 12  7 12  7  8 13  7  8 13  7 19
  7 19  9 19 10 19  7 19  9 19 10 19  2 17 19  9 19  9 19 20 19 17 19 20
 19  9 19 10 19 10 19 17 19 20 19  9 19 10 19  4 19  9 19 17 19 20 19  9
 19 10 19  4 19  9 19  4 19  9 19 10 19 10 19 17 26  1 19 20 12 17 19 18
 19  9 19 10 19 10 19 17 19  5  7 19  6 19  9 19  9 19  7 19  6 19  9 19
 10 19 10 19  4 13  2 19  1 19  9 19  7 19  7 19  7 19  7  8 13  7 19  2
 17 27  1 19 21 19 32 19  9 19  2 27  1 19  2 19  1 19  7 19  4 19  9 19
 10 19 10 19  3 19  9 19  9 19  7 19  9 19 10 19  4 19  9 19  4 19  9 19
 10 19 10 19  4 19  9 19 10 19 10 19  3 19  9 19  9 19  7 13  7 19  9 19
  9 19  7 12 30 25 25 25 25 25 25  2 17 22 17 23 19 16 27  1 19  0 19  9
 19 10 19  2 31 19 20 14 15 12 17 19  1 19  7 19  9

##### **Export Lexed Arrays into CSV**

In [14]:
import os

LEXED_PATH = 'lexed_VDISC'
if not os.path.exists(LEXED_PATH):
    os.makedirs(LEXED_PATH)

for ds_path, array in lexed_array.items():
    filename = LEXED_PATH + '/' + ds_path.split('/')[1].split('.')[0]
    with open(filename, 'w') as f:
        for lexed_src in array:
            for lexed_token in lexed_src:
                f.write(str(lexed_token))
                f.write(' ')
            f.write('\n')

##### **Export Labels into CSV**

In [9]:
categories = ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other']

In [54]:
labels_array = {}
data = None
for path in ds_paths:
   truth_table = None
   labels = np.array([])
   with h5py.File(path, 'r') as f:
      for cat in categories:
         data = np.array(f[cat])
         if truth_table is None:
            truth_table = data
         else:
            truth_table = np.vstack((truth_table, data))

   labels = np.argmax(truth_table, axis=0)
   false_col = ~truth_table.any(axis=0)
   labels += 1
   labels[false_col] = -1

   labels_array[path] = labels

In [56]:
LABELS_PATH = 'labels_VDISC'
if not os.path.exists(LABELS_PATH):
    os.makedirs(LABELS_PATH)

for ds_path, array in labels_array.items():
    filename = LABELS_PATH + '/' + ds_path.split('/')[1].split('.')[0]
    with open(filename, 'w') as f:
        for label in array:
            f.write(str(label))
            f.write('\n')

##### **Generate Vector Embeddings for Future Feature Extraction**

grrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr

grrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr

grrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr
