# Stack Traces Embeddings

The mechanism to compute word embeddings is used here to compute embeddings for stack traces in order to capture the sequence and the correlation of function calls

## Pre Processing

In [77]:
import os
import re
import json
import random
import pickle
import collections
import numpy as np
from random   import seed
from random   import randint
from datetime import datetime

### Load & Clean Data

In [78]:
# initialize the dictionary and the path to find the stack traces
dir_path       = '../data'
st_traces_ls   = list()  

In [79]:
# remove text from the stack trace 
# and keep only the sequence of functions
# returns a list with the function calls

def clean_stack_trace(stack_trace):
    
    clean_stack_trace = []
    temp_stack        = stack_trace.split(" at ")[1:]
    to_find           = re.compile("[|,|<|>]|/|\|=")
    
    # find where each function ends and keep only the path
    for f in temp_stack:
        temp      = f.find(')')
        temp_file = f[0:temp]
        
        # check the punctuations in order to avoid anything else
        match_obj = to_find.search(temp_file)
        if match_obj == None:
            filename = find_filename(temp_file)
            if filename != '':
                clean_stack_trace.append(filename)
    
    return clean_stack_trace

In [80]:
# remove the name of the function and store only the file which contains the function.  
# This is done by tracking full stops
def find_filename(value):
    filename = ""
    words    = value.split("(")
    if len(words)>=2:
        parts = words[0].split(".")
        filename = ".".join(parts[0:-1])
    return filename

In [81]:
# proceed for every file in the directory
total_stack_traces = 0

for fname in os.listdir(dir_path):
    with open(os.path.join(dir_path,fname)) as json_file:
        
        ##############################
        #print("working on file",fname,"\n")
        ##############################
        
        #load data
        data = json.load(json_file)
        
        for counter,issue in enumerate(data):
            dirty_stack_trace = issue['stack_trace']
            
            if dirty_stack_trace != []:
                total_stack_traces += 1
                
                ##############################
                #print("working on stack trace on issue",counter + 1,"\n")
                ##############################
                
                if len(dirty_stack_trace) > 1:
                    dirty_stack_trace_1 = ' '.join(dirty_stack_trace)
                    stack_trace = clean_stack_trace(dirty_stack_trace_1)
                else:
                    stack_trace = clean_stack_trace(dirty_stack_trace[0])
                
                if stack_trace != []:
                    if len(stack_trace)>1:
                        st_traces_ls.append(stack_trace)

#### Train-Validation-Test Sets Split

In [82]:
# this function splits the dataset into training validation and testing set
# it randomly selects test_size  stack traces for testing
# it randomly selects valid_size stack traces for validation

def split_dataset(st_traces_ls,valid_size,test_size):
    
    train_set  = list()
    valid_set  = list()
    test_set   = list()
    
    seed(datetime.now())
    
    for i in range(valid_size):
        temp = randint(0,len(st_traces_ls)-1)
        valid_set.append(st_traces_ls.pop(temp))
        
    for i in range(test_size):
        temp = randint(0,len(st_traces_ls)-1)
        test_set.append(st_traces_ls.pop(temp))
        
    train_set = [i for i in st_traces_ls]
    return train_set,valid_set,test_set    

In [83]:
valid_size = int(0.2*len(st_traces_ls))
test_size  = int(0.1*len(st_traces_ls))

valid_funcs  = 20
valid_funcs2 = 40
test_funcs   = 30

train_set,validation_set,test_set = split_dataset(st_traces_ls,valid_size,test_size)

In [84]:
# some print messages
#print("total stack traces",total_stack_traces)
print("train set size",len(train_set))
print("validation set size",len(validation_set))
print("test set size",len(test_set))
print("total stack traces",total_stack_traces)
del st_traces_ls

train set size 339
validation set size 96
test set size 48
total stack traces 531


### Create Vocabulary

In [85]:
unk_func        = "UNK"
vocabulary_size = 0
min_occurance   = 2
skip_window     = 2
learning_rate   = 0.1

embedding_dim   = 8
num_sampled     = 32
batch_size      = 2048
false_neigh     = 10
true_neigh      = 2

In [86]:
def save_vocabulary(custom_dict):
    
    with open("../outputs/stacktraces_vocabulary_g.txt","w") as file:
        for key in custom_dict:
            file.write("%s, %s \n"%(key,str(custom_dict[key])))

In [87]:
def create_corpus(train_set,skip_window):
    
    # find total instances
    total_words = 0
    for func in train_set:
        total_words += len(func)
        
    # initialize the corpus which will keep all word pairs
    max_size = total_words*2*skip_window
    corpus   = -1*np.ones((max_size,2), dtype=np.int32)
    
    # initialize pointers for the iterations
    d_pointer  = 0
    w_pointer  = 0
    counter    = 0
    
    #initialize temporary buffer
    span   = 2*skip_window+1 
    buffer = collections.deque(maxlen = span)
    
    while counter< max_size:
        
        # avoid tags with -2
        while train_set[d_pointer][w_pointer] < 0:
            w_pointer += 1
            if w_pointer > len(train_set[d_pointer])-1:
                w_pointer  = 0
                d_pointer += 1
                if d_pointer > len(train_set) -1:
                    break
        
        # check if all descriptions have been analyzed
        if d_pointer > len(train_set)-1:
            break
        
        find_context_words(train_set[d_pointer],w_pointer,skip_window,span,buffer)
        
        for i in range(1,len(buffer)):
            corpus[counter][0] = buffer[0]
            corpus[counter][1] = buffer[i]
            counter += 1
            
        buffer.clear()
        
        if w_pointer == len(train_set[d_pointer]) -1:
            w_pointer  = 0
            d_pointer += 1
            if d_pointer > len(train_set) -1:
                break
        else:
            w_pointer += 1
    
    return corpus[0:counter].copy()

In [88]:
def find_context_words(description,w_index,skip_window,span,grams_list):
    
    # the target word in the first place
    grams_list.append(description[w_index])
    
    # initialize two pointers
    counter = 1
    data_index = w_index-1
    
    while counter < span:
        # look left from target word
        if counter<=skip_window:
            # if data_index<0 => out of bound no more words to take into account
            if data_index < 0:
                data_index = w_index  + 1
                counter    = skip_window + 1
            # if the word is not in the dict skip it
            elif description[data_index] == -2:
                data_index -= 1
            else:
                grams_list.append(description[data_index])
                counter    += 1
                data_index -= 1
                if counter > skip_window:
                    data_index = w_index + 1
        # look right from target word
        else:
            if data_index >= len(description):
                counter = span + 1
            elif description[data_index] == -2:
                data_index += 1
            else:
                grams_list.append(description[data_index])
                counter    += 1
                data_index += 1    

In [89]:
# create vocabulary based on the frequency of each file name.
# remove rare file names, which occurs less time than min_occurance, from vocabulary

temp_list = [func for stack_trace in train_set for func in stack_trace]

count     = []
count.extend(collections.Counter(temp_list).most_common())
count[:]  = [e for e in count if e[1]>=min_occurance]

# list temp_sentences now is useless
del temp_list

# create vocabulary
vocabulary_size = len(count)

# assign an id to each function
func2id = dict()
func2id[unk_func] = -2

for i,(func,_) in enumerate(count):
    func2id[func] = i

# list count now is useless
# print(len(count))
del count

train_set_id = [[func2id.get(func,-2) for func in i] for i in train_set]
valid_set_id = [[func2id.get(func,-2) for func in i] for i in validation_set]
test_set_id  = [[func2id.get(func,-2) for func in i] for i in test_set ]

del train_set
del validation_set
del test_set

# save the vocabulary
save_vocabulary(func2id)

# create corpus with word pairs
corpus         = create_corpus(train_set_id,skip_window)
corpus_indexes = [w for w in range(len(corpus))] 

# save them 
np.savetxt('../outputs/stacktraces_corpus_g.txt',corpus,fmt="%d")

# train_set_id now is useless
del train_set_id

### Validation and Test Pairs Creation

In [90]:
def create_testing_dict(test_set,min_occurance,num_words,num_words2,
                        skip_window,true_neigh,false_neigh):
    
    # numerate all funcs in the dataset.
    temp_funcs = [func for stack_trace in test_set for func in stack_trace]
    count = []
    count.extend(collections.Counter(temp_funcs).most_common())
    
    # list temp_funcs now is useless
    del temp_funcs
    
    # remove rare items
    count[:] = [e for e in count if e[1]>=min_occurance]
    indexes  = [i for i in range(len(count)) if count[i][0] != -2]
    
    # split validation set into two sets one small used for cross entropy computation
    # and the other at the end to meassure results.
    if num_words2>0:
        
        samples2  = np.random.choice(indexes,num_words2,replace = False)
        target_w2 = [count[i][0] for i in samples2]
        w_dict2   = create_testing_pairs(test_set,count,target_w2,indexes,
                                         skip_window,true_neigh,false_neigh)
        
        # test on the "num_words" most frequent items
        tmp_indexes = [i for i in indexes if i not in samples2]
        target_w    = [count[tmp_indexes[i]][0] for i in range(num_words)]
        w_dict      = create_testing_pairs(test_set,count,target_w,indexes,
                                           skip_window,true_neigh,false_neigh)
        return w_dict2,w_dict
    
    else:
        
        # test on the "num_words" most frequent items
        target_w = [count[indexes[i]][0] for i in range(num_words)]
        w_dict   = create_testing_pairs(test_set,count,target_w,indexes,skip_window,true_neigh,false_neigh)
        return None,w_dict

In [91]:
def create_testing_pairs(test_set,count,target_w,indexes,
                         skip_window,true_neigh,false_neigh):
    
    # initialize temporary buffer
    span   = skip_window*2+1
    buffer = collections.deque(maxlen = span)
    
    # initialize dictionary
    w_dict   = dict([(key, [[],[]]) for key in target_w])
    
    # find true neighbors for target words
    for desc in test_set:
        for w in target_w:
            temp_idx = [i for i,e in enumerate(desc) if w == e]
            for idx in temp_idx:
                find_context_words(desc,idx,skip_window,span,buffer)
                for i in range(1,len(buffer)):
                    if w_dict[w][0] == []:
                        w_dict[w][0].append(buffer[i])
                    elif buffer[i] not in w_dict[w][0]:
                        w_dict[w][0].append(buffer[i])
                        
    # find false neigbors
    for key in w_dict:
        neig_counter = 0
        flag         = True
        
        while flag == True:
            random_idx   = np.random.choice(indexes,2*false_neigh,replace = False)
            for idx in random_idx:
                if count[idx][0] == key:
                    continue
                elif count[idx][0] in w_dict[key][0]:
                    continue
                elif count[idx][0] not in w_dict[key][1]:
                    w_dict[key][1].append(count[idx][0])
                    neig_counter += 1
                    if neig_counter >= false_neigh:
                        flag = False
                        break
    
    # choose randomly only true_neigh neighbors.
    removed_keys = []
    for key in w_dict:
        if len(w_dict[key][0])>=true_neigh:
            idx_neigh =  np.random.choice([i for i in range(len(w_dict[key][0]))],true_neigh,replace = False)
            w_dict[key][0] = [w_dict[key][0][i] for i in idx_neigh]
        else:
            removed_keys.append(key)
    
    if removed_keys != []:
        for key in removed_keys:
            w_dict.pop(key)
    
    return w_dict

In [92]:
def save_test_pairs(test_dict):
    
    with open('../outputs/stack_testing_pairs_g.pkl','wb') as file:
        pickle.dump(test_dict,file,pickle.HIGHEST_PROTOCOL)


In [93]:
_,test_dict = create_testing_dict(test_set_id,1,test_funcs,0,2,true_neigh,false_neigh)
save_test_pairs(test_dict)
#del test_dict


v_dict2,v_dict = create_testing_dict(valid_set_id,1,valid_funcs,
                                     valid_funcs2,2,true_neigh,false_neigh)

t_batch  = []
t_label  = []
for key in v_dict2:
    for value in v_dict2[key][0]:
        t_batch.append(key)
        t_label.append(value)
v_batch = np.reshape(t_batch,(len(t_batch),))
v_label = np.reshape(t_label,(len(t_label),1))

## Stack's Embeddings Model Definition and Training

In [94]:
import math
import time
import numpy as np
from sklearn import metrics
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()
from tensorflow.keras import layers
from tensorflow.keras import initializers

In [95]:
def generate_batch(corpus_data,corpus_indexes,batch_size):
    
    batch  = np.ndarray(shape = (batch_size),   dtype = np.int32)
    labels = np.ndarray(shape = (batch_size,1), dtype = np.int32)
    
    seed(datetime.now())
    
    words_to_use = random.sample(corpus_indexes,batch_size)
    
    for counter,value in enumerate(words_to_use):
        batch[counter]    = corpus_data[value][0]
        labels[counter,0] = corpus_data[value][1] 
    
    return batch,labels

In [96]:
def model_def_cpu(corpus_data,corpus_indexes,batch_size,embedding_dim,
                  num_sampled,learning_rate,vocabulary_size,v_batch,v_labels):
    
    # Input data
    X_train = tf.placeholder(tf.int32, shape=[None])
    # Input label
    Y_train = tf.placeholder(tf.int32, shape=[None, 1])
    
    # ensure that the following ops & var are assigned to CPU
    with tf.device('/cpu:0'):
        
        # create the embedding variable wich contains the weights
        initializer = initializers.GlorotNormal()
        embedding   = tf.Variable(initializer(shape=(vocabulary_size,embedding_dim)))
        #embedding = tf.Variable(tf.random_normal([vocabulary_size,embedding_dim]))
        
        # create the lookup table for each sample in X_train=>avoiding to use one_hot encoder
        X_embed   = tf.nn.embedding_lookup(embedding,X_train)
        
        # create variables for the loss function
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_dim],
                                                      stddev=1.0/ math.sqrt(embedding_dim)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        
    
    loss_func = tf.reduce_sum(tf.nn.nce_loss(weights = nce_weights,biases =nce_biases,labels = Y_train,
                                              inputs = X_embed,num_sampled = num_sampled,
                                              num_classes = vocabulary_size ))
    
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_opt = optimizer.minimize(loss_func)
    
    # Define initializer for tensorflow variables
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        # actual initialize the variables
        sess.run(init)
        
        # patience method's variables
        min_loss           = float('inf')
        min_emb_matrix     = np.zeros((vocabulary_size,embedding_dim))
        step               = skip_window*batch_size/len(corpus_indexes) 
        print("the step is",step)
        patience_remaining = 100
        
        start_time = time.time()
        
        # train the model using 100 epoch patience
        for epoch in range(50000):
            
            # take a batch of data.
            batch_x,batch_y = generate_batch(corpus_data,corpus_indexes,batch_size)
            
            _,loss = sess.run([train_opt,loss_func],feed_dict={X_train:batch_x, Y_train:batch_y})
            valid_loss   = sess.run(loss_func,feed_dict={X_train:v_batch, Y_train:v_labels})
            
            patience_remaining     = patience_remaining - step
            if valid_loss < min_loss:
                min_loss           = valid_loss
                patience_remaining = 100
                min_emb_matrix     = embedding.eval()
            if patience_remaining <= 0:
                break
        
        # restore min embeddings
        embedding = tf.convert_to_tensor(min_emb_matrix)
        
        # normalize embeddings before using them
        norm = tf.sqrt(tf.reduce_sum(tf.square(embedding),1,keepdims = True))
        normalized_embedding = embedding/norm
        normalized_embedding_matrix = sess.run(normalized_embedding)
        
        #measure total time
        total_time = time.time() - start_time
        print("training time in seconds %s "%(str(total_time)))
        print("total epochs was",epoch+1)
        print("minimum loss is ",min_loss)
    
    return normalized_embedding_matrix

In [97]:
norm_embedding_matrix = model_def_cpu(corpus,corpus_indexes,batch_size,embedding_dim,
                                      num_sampled,learning_rate,vocabulary_size,v_batch,v_label)

np.savetxt('../results/stack_embeddings_g.txt',norm_embedding_matrix,fmt='%.8f')

the step is 0.11314917127071823
training time in seconds 16.883629322052002 
total epochs was 2692
minimum loss is  245.16527


### Model Validation

In [98]:
# The model computes tpr, fpr and auc. The classes are class_A = real neighbor
# and class_B = false neighbor. The model based on cosine similarity
# will try to predict the right label for each word pair given.
def model_validation_v2(embedding_matrix,words_dict):
    
    ylabels = list()
    ypreds  = list()
    
    for key in words_dict:
        target_emb = embedding_matrix[key]
        for true_neigh in words_dict[key][0]:
            neigh_emb = embedding_matrix[true_neigh]
            result    = np.dot(target_emb,neigh_emb)/(np.sqrt(np.dot(target_emb,target_emb))*np.sqrt(np.dot(neigh_emb,neigh_emb)))
            ylabels.append(1)
            ypreds.append(result)
            
        for false_neigh in words_dict[key][1]:
            neigh_emb = embedding_matrix[false_neigh]
            result    = np.dot(target_emb,neigh_emb)/(np.sqrt(np.dot(target_emb,target_emb))*np.sqrt(np.dot(neigh_emb,neigh_emb)))
            ylabels.append(0)
            ypreds.append(result)
    
    y = np.array(ylabels)
    score = np.array(ypreds)
    fpr,tpr,thresholds = metrics.roc_curve(y,score)
    auc = metrics.auc(fpr,tpr)
    return auc

In [99]:
auc = model_validation_v2(norm_embedding_matrix,v_dict)
print("Validation AUC:",auc)

Validation AUC: 0.895625


In [100]:
# unpickling test dictionary
with open('../outputs/stack_testing_pairs_g.pkl','rb') as infile:
    testing_dict = pickle.load(infile)

auc = model_validation_v2(norm_embedding_matrix,testing_dict)
print("Testing AUC:",auc)

Testing AUC: 0.8558888888888889
