In [1]:
# byte-level BPE
# CodeBERT is based on RoBERTa, which has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) 

In [70]:
import torch
import torch.nn as nn

from transformers import set_seed
from transformers import AdamWeightDecay
from transformers import AutoTokenizer
from transformers import RobertaTokenizer

import numpy as np
import os
import pandas as pd
import random

from sklearn.model_selection import train_test_split


In [71]:
# Initialize seeder and randomness
seed = 123
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
set_seed(seed)

In [72]:
# Use the first CUDA device when torch reports one, otherwise run on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(device)

cuda:0


In [73]:
# The dataset lives in data/dataset.csv, two directory levels above the working directory.
root_path = os.path.join('..', '..')
dataset_csv = os.path.join(root_path, 'data', 'dataset.csv')
data = pd.read_csv(dataset_csv)

In [74]:
# Shuffle all rows with the fixed seed, then renumber the index from zero.
data = (
    data
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)
# Show the first rows followed by the total row count, one per line.
print(data.head(), len(data), sep="\n")

                                                func  vul
0  static int ipip_rcv(struct sk_buff *skb)\n{\n\...    0
1  bool LayerTreeHostImpl::IsUIResourceOpaque(UIR...    0
2  error::Error GLES2DecoderPassthroughImpl::DoGe...    0
3  void DocumentLoader::NotifyFinished(Resource* ...    0
4  void conn_free(conn *c) {\n    if (c) {\n     ...    0
199536


In [75]:
# Keep only the rows whose "func" value is present.
data = data[data["func"].notna()]

In [76]:
def _word_count(source):
    """Return the number of whitespace-separated tokens in `source`."""
    return len(source.split())


# Per-function token counts and the largest of them.
word_counts = data["func"].apply(_word_count)
max_length = word_counts.max()
print("Maximum number of words:", max_length)

Maximum number of words: 15441


In [77]:
# Keep only the source text and its label, renamed to 'Text' and 'Labels'.
data = data[['func', 'vul']].rename(columns={'func': 'Text', 'vul': 'Labels'})
# Optional: uncomment to restrict the frame to its first 100 rows.
#data = data[0:100]
data.head()

Unnamed: 0,Text,Labels
0,static int ipip_rcv(struct sk_buff *skb)\n{\n\...,0
1,bool LayerTreeHostImpl::IsUIResourceOpaque(UIR...,0
2,error::Error GLES2DecoderPassthroughImpl::DoGe...,0
3,void DocumentLoader::NotifyFinished(Resource* ...,0
4,void conn_free(conn *c) {\n if (c) {\n ...,0


In [78]:
# Fraction held out by each train_test_split call below: first for the test set,
# then (from what remains) for the validation set.
val_ratio = 0.10

In [79]:
# Candidate split seeds: the global seed followed by 10, 15, ..., 50. Only the first is used here.
shuffle_seeders = [seed, *range(10, 55, 5)]
shuffle_seeder = shuffle_seeders[0]

# Both splits share the same hold-out fraction and seed, and are stratified on the label.
split_options = dict(test_size=val_ratio, random_state=shuffle_seeder)
train_val_data, test_data = train_test_split(
    data, stratify=data['Labels'], **split_options
)
train_data, val_data = train_test_split(
    train_val_data, stratify=train_val_data['Labels'], **split_options
)


Train the tokenizer on `train_data` only

In [80]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,Text,Labels
132528,qreal OxideQQuickWebView::viewportHeight() con...,0
147231,"static std::pair<blink::Image*, float> BrokenC...",0
120483,void HttpResponseHeaders::AddHeader(const std...,1
137970,static void cachedAttribute2AttributeGetter(co...,0
38999,void ChromeDownloadManagerDelegate::CheckDownl...,0


In [81]:
# Preview the "Text" column of the training split on its own.
train_data["Text"].head()

132528    qreal OxideQQuickWebView::viewportHeight() con...
147231    static std::pair<blink::Image*, float> BrokenC...
120483     void HttpResponseHeaders::AddHeader(const std...
137970    static void cachedAttribute2AttributeGetter(co...
38999     void ChromeDownloadManagerDelegate::CheckDownl...
Name: Text, dtype: object

In [82]:
# Plain Python list of the training functions' source text.
text = train_data["Text"].tolist()

In [89]:
# Dump the training functions into a UTF-8 text file, each followed by a newline.
file_path = os.path.join(root_path, 'data', 'tokenizer_train_data.txt')

with open(file_path, "w", encoding="utf-8") as corpus:
    corpus.writelines(function + "\n" for function in text)

Train a new byte-level BPE tokenizer (RoBERTa-base style) on the C++ functions, for use in the vulnerability prediction analysis

In [90]:
# ByteLevelBPETokenizer comes from the `tokenizers` package and is not imported in the
# import cell, so this cell raised NameError on a fresh kernel. Import it here.
from tokenizers import ByteLevelBPETokenizer

# Create an untrained byte-level BPE tokenizer and display its settings.
tokenizer = ByteLevelBPETokenizer()
tokenizer

Tokenizer(vocabulary_size=0, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [95]:
# Special tokens handed to the trainer, in this order.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Train on the corpus file written earlier in the notebook.
tokenizer.train(
    files=file_path,
    vocab_size=50257,
    min_frequency=2,
    special_tokens=special_tokens,
)

In [98]:
# Save tokenizer
output_dir = "cpp_tokenizer/"
# exist_ok=True creates the directory if needed without a separate, racy existence check.
os.makedirs(output_dir, exist_ok=True)

# Writes the vocab/merges pair using the "cpp_tokenizer" file-name prefix.
tokenizer.save_model(output_dir, "cpp_tokenizer")
# Resolves to the same path as before ("cpp_tokenizer/config.json"), now derived from output_dir.
# NOTE(review): this file appears to hold the serialized tokenizer rather than a model
# config; the name is kept for compatibility. Confirm whether "tokenizer.json" was intended.
tokenizer.save(os.path.join(output_dir, 'config.json'))

In [99]:
# load tokenizer
tokenizer = RobertaTokenizer(vocab_file="cpp_tokenizer/cpp_tokenizer-vocab.json",
                             merges_file="cpp_tokenizer/cpp_tokenizer-merges.txt")

In [101]:
# example = """class LinearLayer():
#     def __init__(self, input_size, output_size):
#         self.weight = torch.randn(input_size, output_size)
#         self.bias = torch.zeros(output_size)

#     def __call__(self, x):
#         return x @ self.weight + self.bias
#     """
# tokenizer.tokenize(example)