# Stack Traces Embeddings

The mechanism to compute word embeddings is used here to compute embeddings for stack traces in order to capture the sequence and the correlation of function calls

## Pre Processing

In [183]:
import os
import re
import json
import random
import collections
import numpy as np
from random   import seed
from random   import randint
from datetime import datetime

### Load & Clean Data

In [184]:
# initialize the dictionary and the path to find the stack traces
dir_path       = '/home/kostas/Documents/thesis/data_1'
st_traces_ls   = list()  

In [185]:
# remove text from the stack trace 
# and keep only the sequence of functions
# returns a list with the function calls
def clean_stack_trace(stack_trace):
    
    clean_stack_trace = []
    temp_stack        = stack_trace.split(" at ")[1:]
    
    to_find = re.compile("[|,|<|>]|/|\|=")
    
    #find where each function ends and keep only the path
    for f in temp_stack:
        temp      = f.find(')')
        temp_file = f[0:temp]
        
        # check the punctuations in order to avoid anything else
        match_obj = to_find.search(temp_file)
        if match_obj == None:
            filename = find_filename(temp_file)
            if filename != '':
                clean_stack_trace.append(filename)
    
    return clean_stack_trace


In [186]:
# remove the name of the function and store only the file which contains the function.  
# This is done by tracking full stops
def find_filename(value):
    filename = ""
    words    = value.split("(")
    if len(words)>=2:
        parts = words[0].split(".")
        filename = ".".join(parts[0:-1])
    return filename


In [187]:
# proceed for every file in the directory
for fname in os.listdir(dir_path):
    with open(os.path.join(dir_path,fname)) as json_file:
        
        print("working on file",fname,"\n")
        
        #load data
        data = json.load(json_file)
        
        for counter,issue in enumerate(data):
            dirty_stack_trace = issue['stack_trace']
            
            if dirty_stack_trace != []:
                print("working on stack trace on issue",counter + 1,"\n")
                if len(dirty_stack_trace) > 1:
                    dirty_stack_trace_1 = ''.join(dirty_stack_trace)
                    stack_trace = clean_stack_trace(dirty_stack_trace_1)
                else:
                    stack_trace = clean_stack_trace(dirty_stack_trace[0])
                
                if stack_trace != []:
                    if len(stack_trace)>1:
                        st_traces_ls.append(stack_trace)

working on file data_word_emb72.json 

working on stack trace on issue 24 

working on file data_word_emb143.json 

working on stack trace on issue 5 

working on stack trace on issue 6 

working on stack trace on issue 9 

working on stack trace on issue 11 

working on stack trace on issue 12 

working on stack trace on issue 13 

working on stack trace on issue 16 

working on stack trace on issue 17 

working on file data_word_emb188.json 

working on file data_word_emb98.json 

working on stack trace on issue 23 

working on file data_word_emb239.json 

working on stack trace on issue 8 

working on stack trace on issue 20 

working on file data_word_emb108.json 

working on stack trace on issue 17 

working on file data_word_emb46.json 

working on file data_word_emb31.json 

working on stack trace on issue 3 

working on stack trace on issue 6 

working on file data_word_emb120.json 

working on stack trace on issue 5 

working on file data_word_emb32.json 

working on stack tra

working on file data_word_emb112.json 

working on stack trace on issue 2 

working on stack trace on issue 13 

working on stack trace on issue 14 

working on file data_word_emb68.json 

working on stack trace on issue 4 

working on stack trace on issue 5 

working on stack trace on issue 23 

working on stack trace on issue 24 

working on file data_word_emb153.json 

working on file data_word_emb203.json 

working on file data_word_emb53.json 

working on file data_word_emb136.json 

working on stack trace on issue 12 

working on stack trace on issue 14 

working on stack trace on issue 16 

working on stack trace on issue 17 

working on stack trace on issue 18 

working on file data_word_emb166.json 

working on stack trace on issue 11 

working on file data_word_emb90.json 

working on stack trace on issue 10 

working on file data_word_emb232.json 

working on stack trace on issue 11 

working on stack trace on issue 15 

working on file data_word_emb77.json 

working on stac

#### Train-Validation-Test Sets Split

In [188]:
# this function splits the dataset into training validation and testing set
# it randomly selects test_size  stack traces for testing
# it randomly selects valid_size stack traces for validation

def split_dataset(st_traces_ls,valid_size,test_size):
    
    train_set  = list()
    valid_set  = list()
    test_set   = list()
    
    seed(datetime.now())
    
    for i in range(valid_size):
        temp = randint(0,len(st_traces_ls)-1)
        valid_set.append(st_traces_ls.pop(temp))
    
    for i in range(test_size):
        temp = randint(0,len(st_traces_ls)-1)
        test_set.append(st_traces_ls.pop(temp))
    
    train_set = [i for i in st_traces_ls]
    
    return train_set,valid_set,test_set


In [189]:
#valid_size = int(0.1*len(st_traces_ls))
#test_size  = int(0.1*len(st_traces_ls))
valid_size = 0
test_size  = 0 
train_set,validation_set,test_set = split_dataset(st_traces_ls,valid_size,test_size)

In [190]:
# some print messages
print("total stack traces",len(st_traces_ls))
print("train set size",len(train_set))
print("validation set size",len(validation_set))
print("test set size",len(test_set))
del st_traces_ls

total stack traces 471
train set size 471
validation set size 0
test set size 0


### Create Vocabulary

In [191]:
unk_func        = "UNK"
vocabulary_size = 0
min_occurance   = 2
skip_window     = 2
learning_rate   = 0.1
embedding_dim   = 16
num_sampled     = 64
batch_size      = 2048
false_neigh     = 32
true_neigh      = 2

In [192]:
def save_vocabulary(custom_dict):
    
    with open("stack_traces_vocabulary.txt","w") as file:
        for key in custom_dict:
            file.write("%s, %s \n"%(key,str(custom_dict[key])))

In [193]:
def find_context_words(description,w_index,skip_window,span,grams_list):
    
    # the target word in the first place
    grams_list.append(description[w_index])
    
    # initialize two pointers
    counter = 1
    data_index = w_index-1
    
    while counter < span:
        # look left from target word
        if counter<=skip_window:
            # if data_index<0 => out of bound no more words to take into account
            if data_index < 0:
                data_index = w_index  + 1
                counter    = skip_window + 1
            # if the word is not in the dict skip it
            elif description[data_index] == -2:
                data_index -= 1
            else:
                grams_list.append(description[data_index])
                counter    += 1
                data_index -= 1
                if counter > skip_window:
                    data_index = w_index + 1
        # look right from target word
        else:
            if data_index >= len(description):
                counter = span + 1
            elif description[data_index] == -2:
                data_index += 1
            else:
                grams_list.append(description[data_index])
                counter    += 1
                data_index += 1


In [194]:
def create_corpus(train_set,skip_window):
    
    #find total instances
    total_words = 0
    for func in train_set:
        total_words += len(func)
    
    #initialize the corpus which will keep all word pairs
    max_size = total_words*2*skip_window
    corpus = -1*np.ones((max_size,2), dtype=np.int32)
    
    # initialize pointers for the iterations
    d_pointer  = 0
    w_pointer  = 0
    counter    = 0
    
    #initialize temporary buffer
    span   = 2*skip_window+1 
    buffer = collections.deque(maxlen = span)
    
    while counter< max_size:
        
        # avoid tags with -2
        while train_set[d_pointer][w_pointer] < 0:
            w_pointer += 1
            if w_pointer > len(train_set[d_pointer])-1:
                w_pointer  = 0
                d_pointer +=1
                if d_pointer > len(train_set) -1:
                    break
        
        # check if all descriptions have been analyzed
        if d_pointer > len(train_set)-1:
            break
        
        find_context_words(train_set[d_pointer],w_pointer,skip_window,span,buffer)
        
        for i in range(1,len(buffer)):
            corpus[counter][0] = buffer[0]
            corpus[counter][1] = buffer[i]
            counter += 1
        
        buffer.clear()
        
        if w_pointer == len(train_set[d_pointer]) -1:
            w_pointer  = 0
            d_pointer +=1
            if d_pointer > len(train_set) -1:
                break
        else:
            w_pointer += 1
    
    return corpus[0:counter].copy()


In [195]:
# create vocabulary based on the frequency of each word.
# remove rare file names, which occurs less time than min_occurance, from vocabulary

temp_list = [func for stack_trace in train_set for func in stack_trace]

count     = []
count.extend(collections.Counter(temp_list).most_common())
count[:]  = [e for e in count if e[1]>=min_occurance]

# list temp_sentences now is useless
del temp_list

# create vocabulary
vocabulary_size = len(count)

#assign an id to each function
func2id = dict()
func2id[unk_func] = -2

for i,(func,_) in enumerate(count):
    func2id[func] = i

# list count now is useless
print(len(count))
del count

train_set_id = [[func2id.get(func,-2) for func in i] for i in train_set]
valid_set_id = [[func2id.get(func,-2) for func in i] for i in validation_set]
test_set_id  = [[func2id.get(func,-2) for func in i] for i in test_set ]

del train_set
del validation_set
del test_set

# save the vocabulary
save_vocabulary(func2id)

# create corpus with word pairs
corpus         = create_corpus(train_set_id,skip_window)
corpus_indexes = [w for w in range(len(corpus))] 

# save them 
np.savetxt('stacks_corpus.txt',corpus,fmt="%d")

# train_set_id now is useless
del train_set_id

968


### Validation and Test Pairs Creation

In [196]:
def partition(arr,low,high):
    
    i = (low - 1)
    pivot = arr[high][1]
    
    for j in range(low,high):
        
        if arr[j][1] >= pivot:
            
            i += 1
            arr[i], arr[j] = arr[j], arr[i]
    
    arr[i+1],arr[high] = arr[high], arr[i+1]
    return (i+1)


In [197]:
def quickSort(arr,low,high):
    if len(arr) ==1:
        return arr
    if low<high:
        pi = partition(arr,low,high)
        
        quickSort(arr,low,pi-1)
        quickSort(arr,pi+1,high)


In [198]:
def create_testing_pairs(test_set,min_occurance,num_words,skip_window,true_neigh,false_neigh):
    
    # initialize temporary buffer
    span   = skip_window*2+1
    buffer = collections.deque(maxlen = span)
    
    # numerate all words in the dataset.
    temp_sentences = [word for desc in test_set for word in desc]
    count = []
    count.extend(collections.Counter(temp_sentences).most_common())
    
    # list temp_sentences now is useless
    del temp_sentences
    
    # remove rare words
    count[:] = [e for e in count if e[1]>=min_occurance]
    
    # compute weights and select num_testing words
    weights      = [e[1] for e in count if e[0] != -2]
    total_weight = np.sum(weights)
    
    #for e in count:
    #    if e[0] == -2:
    #        continue
    #    else:
    #        weights.append(e[1])
    #        total_weight += e[1]
    
    # generate random samples 
    weights[:] = [x/total_weight for x in weights]
    indexes    = [i for i in range(len(count)) if count[i][0]!=-2]
    samples    = np.random.choice(indexes,num_words,replace = False, p = weights)
    
    target_w   = [count[i][0] for i in samples]
    w_dict     = dict([(key, [[],[]]) for key in target_w])
    
    for desc in test_set:
        for w in target_w:
            temp_idx = [i for i,e in enumerate(desc) if e == w]
            for idx in temp_idx:
                find_context_words(desc,idx,skip_window,span,buffer)
                for i in range(1,len(buffer)):
                    if w_dict[w][0] == []:
                        w_dict[w][0].append([buffer[i],1])
                    else:
                        flag = False
                        for neigh in w_dict[w][0]:
                            if neigh[0] == buffer[i]:
                                neigh[1] += 1
                                flag = True
                                break
                        if flag == False:
                            w_dict[w][0].append([buffer[i],1])
                buffer.clear()
    
    for key in w_dict:
        quickSort(w_dict[key][0],0,len(w_dict[key][0])-1)
        print(key,w_dict[key][0])

In [199]:
target_emb = norm_embedding_matrix[24]
neigh_emb  = norm_embedding_matrix[7]
result    = np.dot(target_emb,neigh_emb)/(np.sqrt(np.dot(target_emb,target_emb))*np.sqrt(np.dot(neigh_emb,neigh_emb)))
print(result)

0.58006704


In [200]:
create_testing_pairs(valid_set_id,5,30,skip_window,true_neigh,false_neigh)

ValueError: 'a' cannot be empty unless no samples are taken

## Stack's Embeddings Model Definition and Training

In [201]:
import math
import time
import numpy as np
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()

In [202]:
def generate_batch(corpus_data,corpus_indexes,batch_size):
    
    batch  = np.ndarray(shape = (batch_size),   dtype = np.int32)
    labels = np.ndarray(shape = (batch_size,1), dtype = np.int32)
    
    seed(datetime.now())
    
    words_to_use = random.sample(corpus_indexes,batch_size)
    
    for counter,value in enumerate(words_to_use):
        batch[counter]    = corpus_data[value][0]
        labels[counter,0] = corpus_data[value][1] 
    
    return batch,labels

In [203]:
def model_def_cpu(corpus_data,corpus_indexes,batch_size,embedding_dim,
                  num_sampled,learning_rate,vocabulary_size):
    
    # Input data
    X_train = tf.placeholder(tf.int32, shape=[None])
    # Input label
    Y_train = tf.placeholder(tf.int32, shape=[None, 1])
    
    # ensure that the following ops & var are assigned to CPU
    with tf.device('/cpu:0'):
        
        # create the embedding variable wich contains the weights
        embedding = tf.Variable(tf.random_normal([vocabulary_size,embedding_dim]))
        
        # create the lookup table for each sample in X_train=>avoiding to use one_hot encoder
        X_embed   = tf.nn.embedding_lookup(embedding,X_train)
        
        # create variables for the loss function
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_dim],stddev=1.0))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        
    loss_func = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weights,biases =nce_biases,labels = Y_train,
                                              inputs = X_embed,num_sampled = num_sampled,
                                              num_classes = vocabulary_size ))
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    
    train_opt = optimizer.minimize(loss_func)
    
    # Define initializer for tensorflow variables
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        #actual initialize the variables
        sess.run(init)
        
        # patience method's variables 
        min_loss           = float('inf')
        min_emb_matrix     = np.zeros((vocabulary_size,embedding_dim))
        patience_remaining = 100
        
        start_time = time.time()
        
        #train the model using 100 epoch patience
        for epoch in range(5000):
            
            # take a batch of data.
            batch_x,batch_y = generate_batch(corpus_data,corpus_indexes,batch_size)
            
            _,loss = sess.run([train_opt,loss_func],feed_dict={X_train:batch_x, Y_train:batch_y})
            patience_remaining -= 1
            
            if loss < min_loss:
                min_loss           = loss
                patience_remaining = 200
                min_emb_matrix     = embedding.eval()
                
            if patience_remaining == 0:
                break
        
        # normalize embeddings before using them
        # restore min embeddings
        embedding = tf.convert_to_tensor(min_emb_matrix)
        norm = tf.sqrt(tf.reduce_sum(tf.square(embedding),1,keepdims = True))
        normalized_embedding = embedding/norm
        normalized_embedding_matrix = sess.run(normalized_embedding)
        
        #measure total time
        total_time = time.time() - start_time
        print("training time in seconds %s "%(str(total_time)))
        print("total epochs was",epoch)
        
    return normalized_embedding_matrix

In [204]:
norm_embedding_matrix = model_def_cpu(corpus,corpus_indexes,batch_size,embedding_dim,
                                      num_sampled,learning_rate,vocabulary_size)

np.savetxt('stack_embeddings_new.txt',norm_embedding_matrix,fmt='%.8f')

training time in seconds 12.520550012588501 
total epochs was 2128
