In [1]:
# Install transformers library.
#!pip install -q git+https://github.com/huggingface/transformers.git
# Install helper functions.
#!pip install -q git+https://github.com/gmihaila/ml_things.git

In [1]:
# import libraries

import io
import os
import torch
import numpy as np
import random
import os
import csv
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
from numpy import vstack



In [2]:
# Seed the random number generators used by this notebook so that the data
# shuffle and the train/test index split are repeatable from a fresh kernel.
seed = 123
np.random.seed(seed)     # NumPy global RNG
random.seed(seed)        # Python `random`; drives `random.shuffle(data)` below
torch.manual_seed(seed)  # PyTorch RNG (last expression, so its Generator is echoed)
#tf.random.set_seed(seed)

<torch._C.Generator at 0x1bd651adbb0>

In [3]:
# Name of transformers model - will use already pretrained model.
# Path of transformer model - will load your own model from local disk.
model_name_or_path = 'gpt2' # 'microsoft/CodeGPT-small-py' # 'gpt2'

In [4]:
# Pre-processing
def dropEmpty(tokens0):
    """Return a new list with every entry of ``tokens0`` that is not ``[]``.

    The input sequence is left untouched and the relative order of the kept
    entries is preserved.
    """
    return [row for row in tokens0 if row != []]

# Read the pre-processed dataset: every CSV row becomes a list of strings.
# Other cells treat row[0] as a file name, row[1] as the label and row[2:]
# as the token sequence (see `getLengths`, `getUniques` and `labs`).
with open('data_reduced_bert.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    data = list(reader)
# Blank CSV lines are parsed as empty lists; discard them.
data = dropEmpty(data)
# In-place shuffle; repeatable because `random.seed(seed)` ran earlier.
random.shuffle(data)
#data = data[0:100] # subsample for checking

In [5]:
def getLengths(data):
    """Measure the token count of every row in ``data``.

    Two leading fields of each row are not counted, hence the ``- 2``.

    Returns:
        lens: single-column DataFrame (column ``0``) with one length per row.
        lensFreq: DataFrame built from ``lens[0].value_counts()``, i.e. how
            often each length occurs.
    """
    lens = pd.DataFrame([len(row) - 2 for row in data])
    lensFreq = pd.DataFrame(lens[0].value_counts())
    return lens, lensFreq

# Token-count statistics of the loaded rows.
lens, lensFreq = getLengths(data)
# Longest token sequence in the dataset. This value is only displayed here;
# the collator below is configured with the separate `max_length` setting.
max_len = max(lens[0])
max_len

510

In [6]:
# Collect the label field (second column) of every row. In the visible cells
# it is only referenced by the disabled split code (`stratify = labs`).
labs = []
for d in data:
    labs.append(d[1])

In [7]:
# Creation of the dataset's repository
# `val_ratio` is the hold-out fraction handed to `train_test_split` as
# `test_size`; `getVocab` reads it as a global.
# The triple-quoted string that follows is disabled code: it is evaluated as a
# plain string expression, so no split is made and no files are written.
val_ratio = 0.15
'''
## split dataset to train-val-test sets
### split data into train and test (85% train, 15% test)
train_val, test = train_test_split(data, test_size=val_ratio, random_state=seed, stratify = labs)

labs2 = []
for d in train_val:
    labs2.append(d[1])

### split train and validation
train, val = train_test_split(train_val, test_size=val_ratio, random_state=seed, stratify = labs2)


## create the base directory if it doesn't exist
if not os.path.exists('pythonvp'):
    os.mkdir('pythonvp')

## create subdirectories for clean and vuln
for subset in ['train', 'val', 'test']:
    if not os.path.exists(os.path.join('pythonvp', subset)):
        os.mkdir(os.path.join('pythonvp', subset))

    ### create subdirectories for train, val, and test
    for category in ['clean', 'vuln']:
        if not os.path.exists(os.path.join('pythonvp', subset, category)):
            os.mkdir(os.path.join('pythonvp', subset, category))

## write files to appropriate directories
def makeRepo(data, subset):
    for item in data:
        filename = item[0]
        category = 'clean' if item[1] == '0' else 'vuln'
        body = ' '.join([str(token) for token in item[2:]])

        with open(os.path.join('pythonvp', subset, category, filename[:-3] + '.py'), 'w') as f:
            f.write(body)

            
## call makeRepo function
makeRepo(train, 'train')
makeRepo(val, 'val')
makeRepo(test, 'test')

## Merge train and val folders
subset = "train_val"
if not os.path.exists(os.path.join('pythonvp', subset)):
        os.mkdir(os.path.join('pythonvp', subset))
for category in ['clean', 'vuln']:
        if not os.path.exists(os.path.join('pythonvp', subset, category)):
            os.mkdir(os.path.join('pythonvp', subset, category))

makeRepo(train_val, 'train_val')'''

'\n## split dataset to train-val-test sets\n### split data into train and test (85% train, 15% test)\ntrain_val, test = train_test_split(data, test_size=val_ratio, random_state=seed, stratify = labs)\n\nlabs2 = []\nfor d in train_val:\n    labs2.append(d[1])\n\n### split train and validation\ntrain, val = train_test_split(train_val, test_size=val_ratio, random_state=seed, stratify = labs2)\n\n\n## create the base directory if it doesn\'t exist\nif not os.path.exists(\'pythonvp\'):\n    os.mkdir(\'pythonvp\')\n\n## create subdirectories for clean and vuln\nfor subset in [\'train\', \'val\', \'test\']:\n    if not os.path.exists(os.path.join(\'pythonvp\', subset)):\n        os.mkdir(os.path.join(\'pythonvp\', subset))\n\n    ### create subdirectories for train, val, and test\n    for category in [\'clean\', \'vuln\']:\n        if not os.path.exists(os.path.join(\'pythonvp\', subset, category)):\n            os.mkdir(os.path.join(\'pythonvp\', subset, category))\n\n## write files to appro

In [8]:
# define models and parameters
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification,
                          GPT2Model)

# Seed again through the Transformers helper, reusing the notebook-wide seed.
set_seed(seed)

# Number of examples per batch - depends on the max sequence length and GPU memory.
# For 512 sequence length batch of 10 works without cuda memory issues.
# For small sequence length can try batch of 32 or higher.
batch_size = 32

# Pad or truncate text sequences to a specific length.
# If `None`, the collator falls back to the tokenizer's `model_max_length`.
max_length = 120

# Look for gpu to use. Will use `cpu` by default if no gpu found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Dictionary of labels and their ids - used to convert string labels
# (the dataset folder names) into integer ids.
labels_ids = {'clean': 0, 'vuln': 1}

# Number of distinct labels; passed as `num_labels` to the GPT-2 config.
n_labels = len(labels_ids)


In [9]:
class VulnCodeDataset(Dataset):
  r"""PyTorch Dataset that loads labelled source files from a directory tree.

  The partition directory must contain one sub-directory per label
  (``vuln`` and ``clean``). Every file inside those folders is read fully
  into memory when the dataset is constructed.

  Arguments:

    path (:obj:`str`):
        Path to the data partition.

    use_tokenizer:
        Accepted for interface compatibility; it is not used by this class
        (tokenization happens in the collator).

  Raises:
    ValueError: if ``path`` is not an existing directory.

  """

  def __init__(self, path, use_tokenizer):

    # Check if path exists.
    if not os.path.isdir(path):
      # Raise error if path is invalid.
      raise ValueError('Invalid `path` variable! Needs to be a directory')
    self.texts = []
    self.labels = []
    # Since the labels are defined by folders with data we loop
    # through each label.
    for label in ['vuln', 'clean']:
      code_path = os.path.join(path, label)

      # Get all files from path.
      files_names = os.listdir(code_path)#[:10] # Sample for debugging.
      # Go through each file and read its content.
      for file_name in tqdm(files_names, desc=f'{label} files'):
        file_path = os.path.join(code_path, file_name)

        # Read content. The context manager closes the handle right away
        # instead of leaking one open file per example.
        # ISO-8859-1 maps every byte to a character, so decoding cannot fail.
        with io.open(file_path, mode='r', encoding="ISO-8859-1") as source_file:
          content = source_file.read()
        # Fix any unicode issues.
        content = fix_text(content)
        # Save content.
        self.texts.append(content)
        # Save the label name (folder name); the collator encodes it later.
        self.labels.append(label)

    # Number of examples.
    self.n_examples = len(self.labels)

  def __len__(self):
    r"""When used `len` return the number of examples.

    """

    return self.n_examples

  def __getitem__(self, item):
    r"""Given an index return an example from the position.

    Arguments:

      item (:obj:`int`):
          Index position to pick an example to return.

    Returns:
      :obj:`Dict[str, str]`: Dictionary of inputs that contain text and
      associated labels.

    """

    return {'text':self.texts[item],
            'label':self.labels[item]}



class Gpt2ClassificationCollator(object):
    r"""
    Batch collator for GPT2 in a classification task.

    Turns a list of ``{'text': ..., 'label': ...}`` examples into the
    dictionary returned by the tokenizer, extended with a ``labels`` tensor.

    Arguments:

      use_tokenizer (:obj:`transformers.tokenization_?`):
          Callable tokenizer used to convert raw text into tensors.

      labels_encoder (:obj:`dict`):
          Mapping from label name to integer id.

      max_sequence_len (:obj:`int`, `optional`)
          Length used to truncate sequences. When omitted, the tokenizer's
          ``model_max_length`` is used.

    """

    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        # Fall back to the tokenizer's own limit when no length is given.
        if max_sequence_len is None:
            max_sequence_len = use_tokenizer.model_max_length
        self.max_sequence_len = max_sequence_len
        self.labels_encoder = labels_encoder

    def __call__(self, sequences):
        r"""
        Collate one batch; lets an instance be passed as ``collate_fn``.

        Arguments:

          sequences (:obj:`list`):
              List of dictionaries with ``'text'`` and ``'label'`` keys.

        Returns:
          :obj:`Dict[str, object]`: The tokenizer output with an added
          ``'labels'`` entry holding the encoded labels as a tensor.
        """
        texts = [example['text'] for example in sequences]
        label_names = [example['label'] for example in sequences]
        # Map label names to their integer ids.
        label_ids = [self.labels_encoder[name] for name in label_names]
        # Tokenize the whole batch at once, padding to the longest sequence
        # and truncating at the configured maximum length.
        batch = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_sequence_len)
        batch.update({'labels':torch.tensor(label_ids)})
        return batch





In [10]:
def getUniques(data):
    """Collect all tokens of ``data`` and their frequency table.

    The first two fields of every row are skipped; everything from index 2
    onwards counts as a token.

    Returns:
        allWords: flat list of every token, in row order.
        uniques: distinct tokens, ordered as in the frequency table.
        vc: ``pandas.Series`` of token counts (``value_counts`` result).
    """
    allWords = [token for row in data for token in row[2:]]
    vc = pd.Series(allWords).value_counts()
    uniques = vc.index.values.tolist()
    return allWords, uniques, vc

def getVocab(data):
    """Return the distinct tokens found in the train+validation part of ``data``.

    Row indices are split with ``train_test_split`` using the global
    ``val_ratio`` as hold-out fraction and the global ``seed``; only the
    retained rows contribute to the vocabulary.
    NOTE(review): this split is not stratified, unlike the disabled code that
    built the on-disk folders - confirm both are meant to select the same rows.
    """
    kept_idx, _held_out_idx = train_test_split(range(len(data)), test_size=val_ratio, random_state=seed)
    kept_rows = [data[idx] for idx in kept_idx]
    _all_tokens, vocab, _counts = getUniques(kept_rows)
    return vocab

def addTokens(data, tokenizer, model):
    """Add the dataset vocabulary to ``tokenizer`` and resize ``model`` to match.

    Arguments:
        data: rows passed on to ``getVocab`` to obtain the candidate tokens.
        tokenizer: tokenizer exposing ``get_vocab``, ``add_tokens`` and ``__len__``.
        model: model exposing ``resize_token_embeddings``.

    Returns:
        The same ``(tokenizer, model)`` objects, updated in place.
    """
    # Snapshot the known tokens once. The vocabulary used to be fetched again
    # for every candidate token, repeating that work for each one.
    known_tokens = set(tokenizer.get_vocab())
    missing_tokens = [token for token in getVocab(data) if token not in known_tokens]
    if missing_tokens:
        # `add_tokens` accepts a list, so one call registers them all.
        tokenizer.add_tokens(missing_tokens)

    # resize model embedding to match new tokenizer
    model.resize_token_embeddings(len(tokenizer))
    return tokenizer, model

In [11]:
# Get model configuration.
print('Loading configuraiton...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

# Get model's tokenizer.
print('Loading tokenizer...')
# NOTE(review): `do_lower_case` is passed here - confirm the GPT-2 tokenizer
# actually honours it.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path, do_lower_case = True)
# Pad on the left so the last position of every padded sequence is a real
# token; the embedding cells below read exactly that position.
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token


# Get the actual model. `GPT2Model` is loaded here (not
# `GPT2ForSequenceClassification`); later cells index its first output.
print('Loading model...')
model = GPT2Model.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)

# resize model embedding to match the tokenizer (no tokens added yet at this point)
model.resize_token_embeddings(len(tokenizer))

# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

# Define New tokens for string and numerical i.e., strId$ and numId$
new_tokens = ["strId$", "numId$"]
for new_token in new_tokens:
    if new_token not in tokenizer.get_vocab().keys():
        tokenizer.add_tokens(new_token)

# resize model embedding again so the two added tokens get embedding rows
model.resize_token_embeddings(len(tokenizer))

# Optional: add the whole dataset vocabulary (currently disabled).
#tokenizer, model = addTokens(data, tokenizer, model)

# fix model new tokens ids

# Load model to defined device.
model.to(device)
print('Model loaded to `%s`'%device)


Loading configuraiton...
Loading tokenizer...
Loading model...
Model loaded to `cuda`


In [12]:
# Sanity check: tokenize a sentence containing the two added tokens. In the
# recorded output they appear as the single ids 50257 and 50258.
encoded_input = tokenizer("Here is an example sentence with strId$ and numId$ tokens.", padding=True, truncation=True, max_length=512, return_tensors="pt")
encoded_input

{'input_ids': tensor([[ 4342,   318,   281,  1672,  6827,   351, 50257,   392, 50258,    83,
           482,   641,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
# Set MLP architecture
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear -> Softmax.

    ``forward`` returns class probabilities (softmax over dimension 1),
    not raw logits.

    Arguments:
        input_size: number of input features.
        hidden_size: width of the hidden layer.
        num_classes: number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, num_classes)`` probabilities."""
        hidden = torch.relu(self.fc1(x))
        return self.softmax(self.fc2(hidden))

# Set MLP hyperparameters
input_size = 768 # size of GPT-2 embeddings
hidden_size = 100
num_classes = 2
learning_rate = 0.001
num_epochs = 10

# Define MLP and optimizer
# NOTE(review): `mlp` is not referenced again in the visible cells, and
# `optimizer` / `learning_rate` are rebound later when the `nn.Sequential`
# head is defined - confirm whether this cell is still needed.
mlp = MLP(input_size, hidden_size, num_classes).to(device)
optimizer = optim.Adam(mlp.parameters(), lr=learning_rate)

In [14]:
# Create data collator to encode text and labels into numbers.
gpt2_classificaiton_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer,
                                                          labels_encoder=labels_ids,
                                                          max_sequence_len=max_length)

# Each partition lives in `pythonvp/<split>/{vuln,clean}`. Paths are built
# with `os.path.join` so the cell also runs on POSIX systems (the previous
# hard-coded `\\` separators only resolved on Windows).

print('Dealing with Train...')
# Create pytorch dataset.
train_dataset = VulnCodeDataset(path=os.path.join('pythonvp', 'train'),
                                use_tokenizer=tokenizer)
print('Created `train_dataset` with %d examples!'%len(train_dataset))

# Move pytorch dataset into dataloader (shuffled every epoch).
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))

print()

print('Dealing with Validation...')
# Create pytorch dataset.
valid_dataset = VulnCodeDataset(path=os.path.join('pythonvp', 'val'),
                                use_tokenizer=tokenizer)
print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

# Move pytorch dataset into dataloader (fixed order).
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
print('Created `valid_dataloader` with %d batches!'%len(valid_dataloader))

print('Dealing with Testing...')
# Create pytorch dataset.
test_dataset = VulnCodeDataset(path=os.path.join('pythonvp', 'test'),
                               use_tokenizer=tokenizer)
print('Created `test_dataset` with %d examples!'%len(test_dataset))

# Move pytorch dataset into dataloader (fixed order).
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
print('Created `test_dataloader` with %d batches!'%len(test_dataloader))



# Merged train + validation partition.
# Create pytorch dataset.
train_val_dataset = VulnCodeDataset(path=os.path.join('pythonvp', 'train_val'),
                                    use_tokenizer=tokenizer)
print('Created `train_val_dataset` with %d examples!'%len(train_val_dataset))

# Move pytorch dataset into dataloader (shuffled every epoch).
train_val_dataloader = DataLoader(train_val_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)
print('Created `train_val_dataloader` with %d batches!'%len(train_val_dataloader))


Dealing with Train...


vuln files:   0%|          | 0/924 [00:00<?, ?it/s]

clean files:   0%|          | 0/2932 [00:00<?, ?it/s]

Created `train_dataset` with 3856 examples!
Created `train_dataloader` with 121 batches!

Dealing with Validation...


vuln files:   0%|          | 0/238 [00:00<?, ?it/s]

clean files:   0%|          | 0/767 [00:00<?, ?it/s]

Created `valid_dataset` with 1005 examples!
Created `eval_dataloader` with 32 batches!
Dealing with Testing...


vuln files:   0%|          | 0/275 [00:00<?, ?it/s]

clean files:   0%|          | 0/890 [00:00<?, ?it/s]

Created `test_dataset` with 1165 examples!
Created `eval_dataloader` with 37 batches!


vuln files:   0%|          | 0/973 [00:00<?, ?it/s]

clean files:   0%|          | 0/3120 [00:00<?, ?it/s]

Created `train_dataset` with 4093 examples!
Created `train_dataloader` with 128 batches!


In [15]:
# Embed the training partition with the GPT-2 model and cache the result in
# `train_embeddings.csv` (label in the first column, features after it).
with torch.no_grad():
    train_embeddings = []
    train_labels = []
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        # First model output, last sequence position. The tokenizer pads on
        # the left, so this position is not a padding slot.
        batch_embeddings = output[0][:, -1, :]
        # Labels are not moved to `device`; they stay where the collator put them.
        batch_labels = batch['labels']
        train_labels.append(batch_labels)
        train_embeddings.append(batch_embeddings)
    # Stack the per-batch results; rows follow the (shuffled) loader order.
    train_embeddings = torch.cat(train_embeddings, dim=0)
    train_labels = torch.cat(train_labels, dim=0)
print(len(train_embeddings))
print(len(train_labels))
train_embeddingsdf = pd.DataFrame(train_embeddings.cpu().numpy())
# NOTE(review): a torch tensor is handed straight to pandas here - confirm the
# column is stored as plain integers (an explicit `.numpy()` would make it certain).
train_embeddingsdf.insert(0, 'Label', train_labels)
train_embeddingsdf.to_csv('train_embeddings.csv', index=False)

3856
3856


In [16]:
# Embed the validation partition with the GPT-2 model and cache the result in
# `val_embeddings.csv` (label in the first column, features after it).
with torch.no_grad():
    val_embeddings = []
    val_labels = []
    for batch in valid_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        # First model output, last sequence position. The tokenizer pads on
        # the left, so this position is not a padding slot.
        batch_embeddings = output[0][:, -1, :]
        # Labels are not moved to `device`; they stay where the collator put them.
        batch_labels = batch['labels']
        val_labels.append(batch_labels)
        val_embeddings.append(batch_embeddings)
    # Stack the per-batch results; the loader is not shuffled, so order is stable.
    val_embeddings = torch.cat(val_embeddings, dim=0)
    val_labels = torch.cat(val_labels, dim=0)
print(len(val_embeddings))
print(len(val_labels))
val_embeddingsdf = pd.DataFrame(val_embeddings.cpu().numpy())
# NOTE(review): a torch tensor is handed straight to pandas here - confirm the
# column is stored as plain integers (an explicit `.numpy()` would make it certain).
val_embeddingsdf.insert(0, 'Label', val_labels)
val_embeddingsdf.to_csv('val_embeddings.csv', index=False)

1005
1005


In [17]:
# Embed the merged train+validation partition with the GPT-2 model and cache
# the result in `train_val_embeddings.csv` (label first, features after it).
with torch.no_grad():
    train_val_embeddings = []
    train_val_labels = []
    for batch in train_val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        # First model output, last sequence position. The tokenizer pads on
        # the left, so this position is not a padding slot.
        batch_embeddings = output[0][:, -1, :]
        # Labels are not moved to `device`; they stay where the collator put them.
        batch_labels = batch['labels']
        train_val_labels.append(batch_labels)
        train_val_embeddings.append(batch_embeddings)
    # Stack the per-batch results; rows follow the (shuffled) loader order.
    train_val_embeddings = torch.cat(train_val_embeddings, dim=0)
    train_val_labels = torch.cat(train_val_labels, dim=0)
print(len(train_val_embeddings))
print(len(train_val_labels))
train_val_embeddingsdf = pd.DataFrame(train_val_embeddings.cpu().numpy())
# NOTE(review): a torch tensor is handed straight to pandas here - confirm the
# column is stored as plain integers (an explicit `.numpy()` would make it certain).
train_val_embeddingsdf.insert(0, 'Label', train_val_labels)
train_val_embeddingsdf.to_csv('train_val_embeddings.csv', index=False)

4093
4093


In [18]:
# Embed the test partition with the GPT-2 model and cache the result in
# `test_embeddings.csv` (label in the first column, features after it).
with torch.no_grad():
    test_embeddings = []
    test_labels = []
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        # First model output, last sequence position. The tokenizer pads on
        # the left, so this position is not a padding slot.
        batch_embeddings = output[0][:, -1, :]
        # Labels are not moved to `device`; they stay where the collator put them.
        batch_labels = batch['labels']
        test_labels.append(batch_labels)
        test_embeddings.append(batch_embeddings)
    # Stack the per-batch results; the loader is not shuffled, so order is stable.
    test_embeddings = torch.cat(test_embeddings, dim=0)
    test_labels = torch.cat(test_labels, dim=0)
print(len(test_embeddings))
print(len(test_labels))
test_embeddingsdf = pd.DataFrame(test_embeddings.cpu().numpy())
# NOTE(review): a torch tensor is handed straight to pandas here - confirm the
# column is stored as plain integers (an explicit `.numpy()` would make it certain).
test_embeddingsdf.insert(0, 'Label', test_labels)
test_embeddingsdf.to_csv('test_embeddings.csv', index=False)

1165
1165


In [19]:
# Classification head trained on the cached 768-dimensional embeddings.
# NOTE: this rebinds `model`, which until now referred to the GPT-2 network;
# the embedding cells above cannot be re-run after this cell without
# reloading GPT-2. The head is created on the CPU (it is never moved to `device`).
model = nn.Sequential(
    nn.Linear(768, 100),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(100, 2),
    nn.Sigmoid() # nn.Softmax()
)

# nn.Linear(764, 100, device="cuda:0")

# NOTE(review): the head emits two sigmoid outputs per example, so `BCELoss`
# needs float targets with that same two-column shape rather than a vector of
# class ids - confirm the training loop supplies them.
loss_fn = nn.BCELoss() # CrossEntropyLoss
learning_rate = 0.001
epochs = 5
# Rebinds `optimizer` (previously bound to the optimizer of `mlp`).
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
BS = 32  # batch size used by `prepare_data`

In [149]:
# prepare the dataset
def prepare_data(dataset, test=None, batch_size=None):
    """Wrap a train/test pair of datasets in ``DataLoader`` objects.

    The function was declared with a single parameter while the notebook calls
    it as ``prepare_data(train, test)``; both calling styles are supported now.

    Arguments:
        dataset: either an object exposing ``get_splits()`` that returns
            ``(train, test)`` (when ``test`` is omitted), or the training
            dataset itself (when ``test`` is given).
        test: optional evaluation dataset.
        batch_size: optional batch size; defaults to the notebook-level ``BS``.

    Returns:
        ``(train_dl, test_dl)``: the training loader reshuffles every epoch,
        the evaluation loader keeps the dataset order.
    """
    if test is None:
        train, test = dataset.get_splits()
    else:
        train = dataset
    if batch_size is None:
        # Resolved lazily so an explicit batch size works without the global.
        batch_size = BS
    # prepare data loaders
    train_dl = DataLoader(train, batch_size=batch_size, shuffle=True)
    test_dl = DataLoader(test, batch_size=batch_size, shuffle=False)
    return train_dl, test_dl

In [151]:
# numpy is already imported in the import cell at the top of the notebook.
import numpy as np

# Dump the validation embeddings as space-delimited text, one example per row.
np.savetxt('embeddings.txt', val_embeddings.cpu().numpy(), delimiter=' ')

In [None]:
# Train the classification head on the training embeddings and report
# metrics on the validation embeddings.
# The loaders are built directly from the tensors produced by the embedding
# cells. The head lives on the CPU, so the embeddings are moved there first.
train_ds = torch.utils.data.TensorDataset(train_embeddings.cpu(), train_labels)
val_ds = torch.utils.data.TensorDataset(val_embeddings.cpu(), val_labels)
train_dl = DataLoader(train_ds, batch_size=BS, shuffle=True)
test_dl = DataLoader(val_ds, batch_size=BS, shuffle=False)
print(len(train_dl.dataset), len(test_dl.dataset))

# train the model
criterion = loss_fn
model.train()  # enable dropout while fitting
# enumerate epochs
for epoch in range(epochs):
    # enumerate mini batches
    for inputs, targets in train_dl:
        # clear the gradients
        optimizer.zero_grad()
        # compute the model output: one sigmoid score per class
        yhat = model(inputs)
        # BCELoss needs float targets shaped like the output, so the class
        # ids are expanded to one-hot rows.
        onehot = nn.functional.one_hot(targets.long(), num_classes=yhat.shape[1]).float()
        # calculate loss
        loss = criterion(yhat, onehot)
        # credit assignment
        loss.backward()
        # update model weights
        optimizer.step()

# evaluate with dropout disabled and without tracking gradients
model.eval()
predictions, actuals = list(), list()
with torch.no_grad():
    for inputs, targets in test_dl:
        yhat = model(inputs)
        # predicted class = index of the highest score
        predictions.append(yhat.argmax(dim=1).numpy())
        actuals.append(targets.numpy())
predictions, actuals = np.concatenate(predictions), np.concatenate(actuals)
# `labels=[0, 1]` keeps the matrix 2x2 even if one class is never predicted.
tn, fp, fn, tp = confusion_matrix(actuals, predictions, labels=[0, 1]).ravel()
print(tn, fp, fn, tp)

# Ratios fall back to 0.0 when their denominator is zero.
val_acc = (tp + tn) / (tp + tn + fp + fn)
val_prec = tp / (tp + fp) if (tp + fp) else 0.0
val_rec = tp / (tp + fn) if (tp + fn) else 0.0
val_f1 = 2 * val_prec * val_rec / (val_prec + val_rec) if (val_prec + val_rec) else 0.0
val_f2 = 5 * val_prec * val_rec / (4 * val_prec + val_rec) if (val_prec + val_rec) else 0.0
print("Accuracy: ", val_acc)
print("Precision: ", val_prec)
print("Recall: ", val_rec)
print("F1-score: ", val_f1)
print("F2-score: ", val_f2)

In [None]:
# Train the classification head on the merged train+validation embeddings and
# report metrics on the held-out test embeddings.
# The loaders are built directly from the tensors produced by the embedding
# cells. The head lives on the CPU, so the embeddings are moved there first.
# NOTE(review): `model` and `optimizer` are reused as they are, so if the
# train/validation cell ran before this one the head continues from those
# weights - confirm whether it should be re-initialised first.
train_val_ds = torch.utils.data.TensorDataset(train_val_embeddings.cpu(), train_val_labels)
test_ds = torch.utils.data.TensorDataset(test_embeddings.cpu(), test_labels)
train_dl = DataLoader(train_val_ds, batch_size=BS, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=BS, shuffle=False)
print(len(train_dl.dataset), len(test_dl.dataset))

# train the model
criterion = loss_fn
model.train()  # enable dropout while fitting
# enumerate epochs
for epoch in range(epochs):
    # enumerate mini batches
    for inputs, targets in train_dl:
        # clear the gradients
        optimizer.zero_grad()
        # compute the model output: one sigmoid score per class
        yhat = model(inputs)
        # BCELoss needs float targets shaped like the output, so the class
        # ids are expanded to one-hot rows.
        onehot = nn.functional.one_hot(targets.long(), num_classes=yhat.shape[1]).float()
        # calculate loss
        loss = criterion(yhat, onehot)
        # credit assignment
        loss.backward()
        # update model weights
        optimizer.step()

# evaluate with dropout disabled and without tracking gradients
model.eval()
predictions, actuals = list(), list()
with torch.no_grad():
    for inputs, targets in test_dl:
        yhat = model(inputs)
        # predicted class = index of the highest score
        predictions.append(yhat.argmax(dim=1).numpy())
        actuals.append(targets.numpy())
predictions, actuals = np.concatenate(predictions), np.concatenate(actuals)
# `labels=[0, 1]` keeps the matrix 2x2 even if one class is never predicted.
tn, fp, fn, tp = confusion_matrix(actuals, predictions, labels=[0, 1]).ravel()
print(tn, fp, fn, tp)

# The `val_` prefix is kept for compatibility; these are test-set metrics.
# Ratios fall back to 0.0 when their denominator is zero.
val_acc = (tp + tn) / (tp + tn + fp + fn)
val_prec = tp / (tp + fp) if (tp + fp) else 0.0
val_rec = tp / (tp + fn) if (tp + fn) else 0.0
val_f1 = 2 * val_prec * val_rec / (val_prec + val_rec) if (val_prec + val_rec) else 0.0
val_f2 = 5 * val_prec * val_rec / (4 * val_prec + val_rec) if (val_prec + val_rec) else 0.0
print("Accuracy: ", val_acc)
print("Precision: ", val_prec)
print("Recall: ", val_rec)
print("F1-score: ", val_f1)
print("F2-score: ", val_f2)