# Tag Classification Using Only Stack Traces

This notebook trains a logistic regression and a neural network for tag classification based only on stack traces.<br/>
We want to find out whether stack-trace embeddings alone correlate with the tags and whether they contain additional knowledge.

In [136]:
import os
import re
import json
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [137]:
def load_dict(path_to_file):
    """Load a ``key,value`` text file into a dictionary.

    Every non-blank line is split at its LAST comma: the text before it is
    the key and the text after it is parsed as an integer value.  Blank lines
    (for example a trailing newline at the end of the file) are skipped.

    Args:
        path_to_file: path of the text file to read.

    Returns:
        dict mapping each key (str) to its value (int).

    Raises:
        ValueError: if a non-blank line has no comma or its value is not an
            integer.
    """
    temp_dict = dict()
    with open(path_to_file) as file:
        for line in file:
            # tolerate empty lines instead of failing on the missing value
            if not line.strip():
                continue

            # split on the last comma so a key may itself contain commas
            key, separator, value = line.rpartition(',')
            if not separator:
                raise ValueError("expected 'key,value' but got %r" % (line,))

            # int() ignores the surrounding whitespace / trailing newline
            temp_dict[key] = int(value)

    return temp_dict

In [138]:
def load_issues(dir_path,tag_labels,stack_traces):
    """Read every issue file in a directory and collect tags and stack traces.

    Each regular file in ``dir_path`` is parsed as JSON and iterated as a
    sequence of issues.  An issue is kept only when both its ``'tags'`` and
    its ``'stack_trace'`` are non-empty; its whitespace-stripped tags and its
    stack trace are then appended to the two output lists at the same
    position.

    Args:
        dir_path: directory that contains the JSON issue files.
        tag_labels: list extended in place with one list of tags per kept issue.
        stack_traces: list extended in place with the matching stack traces.

    Returns:
        None; results are delivered through the two list arguments.
    """
    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the order of the collected issues differ between machines/runs
    for fname in sorted(os.listdir(dir_path)):
        full_path = os.path.join(dir_path,fname)

        # skip sub-directories (e.g. notebook checkpoint folders)
        if not os.path.isfile(full_path):
            continue

        with open(full_path) as json_file:
            data = json.load(json_file)

        for issue in data:
            tags        = [tag.strip() for tag in issue['tags']]
            stack_trace = issue['stack_trace']

            if tags != [] and stack_trace != []:
                tag_labels.append(tags)
                stack_traces.append(stack_trace)

In [139]:
# copy paste from stack_trace_embedding notebook

def clean_stack_trace(stack_trace):
    """Extract the identifier of every usable frame from a raw stack trace.

    The trace text is split on the literal ``" at "``; everything before the
    first occurrence is discarded.  Each remaining frame is cut at its first
    ``)``, rejected if the kept part contains one of ``| , < > /``, and
    otherwise passed to ``find_filename``.  Non-empty results are collected in
    their original order.

    Args:
        stack_trace: the stack trace as a single string.

    Returns:
        list of str, one entry per accepted frame (possibly empty).
    """
    frames = []

    # Raw string with a single character class.  It matches exactly what the
    # previous pattern "[|,|<|>]|/|\|=" matched, but without the invalid "\|"
    # escape in a normal string literal and without the redundant alternatives.
    to_find = re.compile(r"[|,<>/]")

    # find where each function ends and keep only the path
    for frame in stack_trace.split(" at ")[1:]:
        # when the frame has no ')', find() returns -1 and the slice drops the
        # last character; kept as is so results stay identical to the notebook
        # this function was copied from
        candidate = frame[0:frame.find(')')]

        # check the punctuation in order to avoid anything else
        if to_find.search(candidate) is None:
            filename = find_filename(candidate)
            if filename != '':
                frames.append(filename)

    return frames

In [140]:
# copy paste from stack_trace_embedding notebook

def find_filename(value):
    """Return the dotted prefix of the text that precedes the first ``(``.

    The part of ``value`` before its first ``(`` is taken and everything from
    its last ``.`` onwards is removed.  An empty string is returned when
    ``value`` contains no ``(`` or when that part contains no ``.``.
    """
    before_paren, paren, _ = value.partition("(")
    if not paren:
        # no opening parenthesis at all
        return ""
    # rpartition yields '' as its first item when there is no '.'
    return before_paren.rpartition(".")[0]


In [141]:
def clean_data(stack_traces):
    """Clean every raw stack trace and return the results in the same order.

    A trace equal to ``[]`` yields ``[]``.  Otherwise its pieces are joined
    with single spaces (a one-element trace is used as is) and the resulting
    text is handed to ``clean_stack_trace``.

    Args:
        stack_traces: list of raw traces, each a list of strings.

    Returns:
        list with one cleaned trace (a list) per input trace.
    """
    cleaned = []

    for raw_trace in stack_traces:
        frames = []

        if raw_trace != []:
            text   = ' '.join(raw_trace) if len(raw_trace) > 1 else raw_trace[0]
            frames = clean_stack_trace(text)

        cleaned.append(frames)

    return cleaned

In [142]:
def compute_embeddings(arithmetic_stack_traces,stack_embedding_matrix):
    """Average the embedding rows referenced by each stack trace.

    Args:
        arithmetic_stack_traces: list of traces, each a list of integer row
            indices into ``stack_embedding_matrix``; the value -2 is skipped.
        stack_embedding_matrix: 2-D array with one embedding per row.

    Returns:
        Array of shape (number of traces, embedding width).  Row *i* is the
        mean of the rows selected by trace *i*, or all zeros when the trace
        has no entry other than -2.
    """
    width  = np.shape(stack_embedding_matrix)[1]
    result = np.zeros((len(arithmetic_stack_traces),width))

    # iterating a 2-D array yields row views, so the in-place updates below
    # write straight into `result`
    for row,trace in zip(result,arithmetic_stack_traces):
        used = 0
        for func in trace:
            if func == -2:
                continue
            used += 1
            row += stack_embedding_matrix[func]
        if used:
            row /= used

    return result

In [143]:
# --- stack-trace embeddings and the vocabulary that indexes them ---
stack_embedding_matrix = np.loadtxt('../results/stack_embeddings_g.txt', dtype=np.float64)
func2id_path = "../outputs/stacktraces_vocabulary_g.txt"
func2id      = load_dict(func2id_path)

# --- issues: keep those that have both tags and a stack trace ---
dir_path     = '../data'
tag_labels   = []
stack_traces = []
load_issues(dir_path,tag_labels,stack_traces)

# --- reduce every raw trace to its list of frame identifiers ---
clean_stack_traces = clean_data(stack_traces)

# positions of the issues whose cleaned trace is not empty; traces and tags
# are filtered with the same positions so they stay aligned
non_empty            = [pos for pos,frames in enumerate(clean_stack_traces) if frames != []]
clean_stack_traces_2 = [clean_stack_traces[pos] for pos in non_empty]
clean_tags_2         = [tag_labels[pos] for pos in non_empty]

del clean_stack_traces
del stack_traces

# identifiers -> vocabulary ids; -2 marks an identifier missing from the vocabulary
arithmetic_stack_traces = [[func2id.get(func,-2) for func in trace] for trace in clean_stack_traces_2]
del clean_stack_traces_2

# one averaged embedding row per remaining issue
issues_embeddings  = compute_embeddings(arithmetic_stack_traces,stack_embedding_matrix)

In [148]:
# sanity check: show the averaged embedding of a single issue (row 450)
print(issues_embeddings[450])

[-0.38540108  0.20484011  0.10255159 -0.47162718  0.27095725  0.4557721
  0.20261681  0.0100261 ]


## Classification

In [144]:
# from here on `tag_labels` names the filtered tag lists; this binds the very
# same list object as clean_tags_2 (no copy) so later cells can keep the short name
tag_labels = clean_tags_2

In [105]:
# tags to classify; each one becomes a binary (0/1) column of df_tags
tags = ['Bug','Google Play or Beta feedback','Prio - High']
# derived from the list so the column count can never get out of sync with it
no_tags = len(tags)

# the rows of np_tags are filled from tag_labels but sized from the stack
# traces, so both must describe the same issues in the same order
assert len(tag_labels) == len(arithmetic_stack_traces), "tags and stack traces are misaligned"

np_tags = np.zeros((len(arithmetic_stack_traces),no_tags))

for row,issue_tags in enumerate(tag_labels):
    for col,tag in enumerate(tags):
        if tag in issue_tags:
            np_tags[row][col] = 1

df_tags = pd.DataFrame(np_tags, columns = tags)

In [106]:
def my_classifier(tags,df_tags,issues_embeddings,cl_label,n_splits):
    """Cross-validate a class-weighted logistic regression for one tag and print the metrics.

    The summed confusion matrix over all folds is printed, followed by the
    accuracy and the geometric mean (GM) of the two per-class recalls.

    Args:
        tags: unused; kept so existing calls keep working.
        df_tags: DataFrame with one 0/1 column per tag.
        issues_embeddings: 2-D array of features, one row per row of ``df_tags``.
        cl_label: name of the ``df_tags`` column to predict.
        n_splits: number of stratified folds.

    Returns:
        None; the results are printed.

    Raises:
        ValueError: if the selected column contains only one class.
    """
    target_label = df_tags[cl_label]
    counter_1    = np.sum(target_label)
    counter_0    = target_label.shape[0]-counter_1

    # the inverse-frequency weights below would otherwise divide by zero
    if counter_0 == 0 or counter_1 == 0:
        raise ValueError("label %r contains a single class" % (cl_label,))

    # NOTE(review): weights are 1/count rather than sklearn's 'balanced'
    # n_samples/(n_classes*count); being this small they presumably make the
    # default L2 penalty dominate the fit -- confirm this is intended
    w               = {0:1/counter_0,1:1/counter_1}
    skf             = StratifiedKFold(n_splits)
    model           = LogisticRegression(solver='lbfgs',class_weight = w)
    total_confusion = np.zeros((2,2))

    for train_index, test_index in skf.split(issues_embeddings,target_label):

        X_train,X_test = issues_embeddings[train_index], issues_embeddings[test_index]
        # the fold indices are positions, so select by position (.iloc) instead
        # of relying on the Series having a default 0..n-1 index
        y_train,y_test = target_label.iloc[train_index], target_label.iloc[test_index]

        #fit model
        model.fit(X_train,y_train)
        predictions = model.predict(X_test)

        # labels=[0,1] forces a 2x2 matrix even if a fold shows a single class;
        # a 1x1 result would silently broadcast into the 2x2 running total
        total_confusion = total_confusion+confusion_matrix(y_test,predictions,labels=[0,1])

    print(total_confusion)
    print("accuracy = TP+TN/(TP+TN+FP+FN)",(total_confusion[0][0]+total_confusion[1][1])/np.sum(total_confusion))
    # GM = sqrt(recall of class 0 * recall of class 1)
    print("GM",np.sqrt((total_confusion[0][0]/(total_confusion[0][0]+total_confusion[0][1]))*
                                  (total_confusion[1][1]/(total_confusion[1][1]+total_confusion[1][0]))))
    print("\n")

In [107]:
# only stack_traces: evaluate the logistic-regression baseline once per tag
for cl_label in ("Bug","Google Play or Beta feedback","Prio - High"):
    my_classifier(tags,df_tags,issues_embeddings,cl_label,10)

[[ 26.  22.]
 [172. 231.]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.5698447893569845
GM 0.5572107958104742


[[152. 131.]
 [ 68. 100.]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.5587583148558758
GM 0.5654236051605539


[[139. 230.]
 [ 33.  49.]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.41685144124168516
GM 0.4744444065192336




## Neural Network Classifier

In [108]:
import time
import math
import random
from random import seed
from random import randint
from datetime import datetime
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()
from sklearn.model_selection import StratifiedShuffleSplit

In [109]:
def split_dataset2(issues_embeddings,target_labels):
    """Make one stratified 90/10 split and separate the training rows by class.

    Args:
        issues_embeddings: 2-D array of features, one row per issue.
        target_labels: pandas Series of 0/1 labels, positionally aligned with
            the rows of ``issues_embeddings``.

    Returns:
        Tuple ``(X_train_0, X_train_1, X_test, Y_test)``: the training rows
        labelled 0 and 1 (two lists of 1-D arrays), the test rows (array) and
        the test labels (Series).  Training rows whose label is neither 0 nor
        1 are dropped.
    """
    # fixed random_state: the same split is produced on every call
    sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.1, random_state = 0)

    X_train_0 = list()
    X_train_1 = list()

    # n_splits = 1, so this loop body runs exactly once
    for train_index, test_index in sss.split(issues_embeddings,target_labels):

        X_test = issues_embeddings[test_index]
        # split indices are positions: use .iloc, like the loop below, instead
        # of label-based [] which only works with a default 0..n-1 index
        Y_test = target_labels.iloc[test_index]

        for index in train_index:
            label = target_labels.iloc[index]
            if label == 0:
                X_train_0.append(issues_embeddings[index])
            elif label == 1:
                X_train_1.append(issues_embeddings[index])

    return X_train_0,X_train_1,X_test,Y_test

In [110]:
def generate_batch(issues_embeddings,target_labels,batch_size):
    """Draw ``batch_size`` distinct issues at random and one-hot encode their labels.

    Args:
        issues_embeddings: 2-D array of features, one row per issue.
        target_labels: pandas Series of 0/1 labels, read by position.
        batch_size: number of issues to draw without replacement.

    Returns:
        Tuple ``(batch, labels)`` of float64 arrays: ``batch`` holds the drawn
        rows, ``labels`` has shape (batch_size, 2) where column 0 is
        ``1 - label`` and column 1 is ``label``.
    """
    n_issues = np.shape(issues_embeddings)[0]
    # sampling without replacement from the row positions
    chosen   = np.array(random.sample(list(range(n_issues)),batch_size), dtype = np.intp)

    # fancy indexing copies the chosen rows in the drawn order
    batch = np.asarray(issues_embeddings, dtype = np.float64)[chosen]

    positive = target_labels.iloc[chosen].to_numpy(dtype = np.float64)
    labels   = np.column_stack((1-positive,positive))

    return batch,labels

In [111]:
def pooling(issues_embeddings_0, issues_embeddings_1, batch_size):
    """Build a class-balanced batch by interleaving random issues of both classes.

    Even rows of the batch hold class-0 issues (label ``[1, 0]``) and odd
    rows hold class-1 issues (label ``[0, 1]``).  Issues are drawn without
    replacement inside each class.  For an odd ``batch_size`` class 0 gets
    the extra row, because there is one more even slot than odd slots.

    Args:
        issues_embeddings_0: 2-D sequence with the embeddings of class 0.
        issues_embeddings_1: 2-D sequence with the embeddings of class 1.
        batch_size: total number of rows of the batch.

    Returns:
        Tuple ``(batch, labels)`` of float64 arrays with shapes
        (batch_size, embedding width) and (batch_size, 2).

    Raises:
        ValueError: (from ``random.sample``) if a class has fewer issues than
            the number of rows it has to fill.
    """
    dim    = np.shape(issues_embeddings_0)[1]
    batch  = np.empty((batch_size,dim), dtype = np.float64)
    labels = np.zeros((batch_size,2), dtype = np.float64)

    # one sample per slot: drawing batch_size//2 for both classes left the last
    # even slot of an odd-sized batch without a sample (IndexError)
    slots_1 = batch_size//2
    slots_0 = batch_size-slots_1

    issues_to_use_0 = random.sample(range(np.shape(issues_embeddings_0)[0]),slots_0)
    issues_to_use_1 = random.sample(range(np.shape(issues_embeddings_1)[0]),slots_1)

    # even indexes for issues belong to class 0
    for slot,issue in zip(range(0,batch_size,2),issues_to_use_0):
        batch[slot] = issues_embeddings_0[issue]
    labels[0::2,0] = 1

    # odd  indexes for issues belong to class 1
    for slot,issue in zip(range(1,batch_size,2),issues_to_use_1):
        batch[slot] = issues_embeddings_1[issue]
    labels[1::2,1] = 1

    return batch,labels

In [117]:
def my_classifier_nn2(issues_embeddings_0,issues_embeddings_1,hidden_layer_dim,
                      learning_rate,batch_size,epochs,v_batch,v_labels):
    """Train a one-hidden-layer network on balanced batches and score it on a validation set.

    The network is input -> tanh hidden layer -> 2-way softmax, trained with
    Adam on the summed softmax cross-entropy.  Weights are initialised
    randomly on every call.

    Args:
        issues_embeddings_0: embeddings of the training issues of class 0.
        issues_embeddings_1: embeddings of the training issues of class 1.
        hidden_layer_dim: number of hidden units.
        learning_rate: learning rate given to the Adam optimizer.
        batch_size: number of rows of each training batch built by ``pooling``.
        epochs: number of training steps; every step draws one new batch.
        v_batch: validation embeddings.
        v_labels: one-hot validation labels, shape (n, 2).

    Returns:
        The confusion matrix computed by ``compute_metrics`` on the validation set.
    """
    input_dim = np.shape(issues_embeddings_0)[1]
    init_std  = 1.0/ math.sqrt(hidden_layer_dim)

    # Build the model in a graph of its own.  This function is called many
    # times; without this every call would add a new set of ops and variables
    # to the process-wide default graph, which then grows without bound.
    with tf.Graph().as_default():

        # input data
        X_train = tf.placeholder(tf.float64, shape=[None,input_dim])
        # input label
        Y_train = tf.placeholder(tf.float64, shape=[None,2])

        # input-hidden layer variables
        W1 = tf.Variable(tf.truncated_normal([input_dim,hidden_layer_dim],
                                             stddev = init_std,
                                             dtype=tf.float64),name='W1')
        b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = init_std,
                                          dtype=tf.float64),name = 'b1')

        # hidden-output layer variables
        W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],
                                             stddev = init_std,
                                             dtype=tf.float64),name = 'W2')
        b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')

        # neural network's functions
        hidden_layer   = tf.nn.tanh(tf.add(tf.matmul(X_train,W1),b1))
        output_layer   = tf.add(tf.matmul(hidden_layer,W2),b2)
        output_layer_2 = tf.nn.softmax(output_layer)

        # the loss takes the raw logits; the softmax output is only used for prediction
        cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))

        optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)

        init = tf.global_variables_initializer()

        with tf.Session() as sess:

            sess.run(init)

            for epoch in range(epochs):

                # generate batch.
                batch_x,batch_y = pooling(issues_embeddings_0,issues_embeddings_1,batch_size)

                # train the model
                sess.run(optimizer,feed_dict={X_train:batch_x,Y_train:batch_y})

            # validation: class probabilities only need the inputs
            y_probs = sess.run(output_layer_2,feed_dict={X_train:v_batch})

    return compute_metrics(y_probs,v_labels)

In [118]:
def compute_metrics(y_probs,v_labels):
    """Turn class probabilities into a 2x2 confusion matrix.

    Args:
        y_probs: array of shape (n, 2) with the scores of class 0 and class 1.
        v_labels: one-hot true labels of shape (n, 2); column 1 is the
            indicator of class 1.

    Returns:
        2x2 confusion matrix with rows = true class and columns = predicted
        class, both in the order 0, 1.
    """
    y_probs  = np.asarray(y_probs)
    v_labels = np.asarray(v_labels)

    y_true_1  = v_labels[:,1]
    # predict 0 only when class 0 is strictly more probable; ties go to class 1
    y_preds_1 = np.where(y_probs[:,0] > y_probs[:,1],0.0,1.0)

    # labels=[0,1] guarantees a 2x2 result even if only one class occurs;
    # callers add the result to a 2x2 total, where a 1x1 matrix would be
    # broadcast silently
    return confusion_matrix(y_true=y_true_1,y_pred=y_preds_1,labels=[0,1])

In [119]:
# Seed Python's `random` module (used by generate_batch and pooling) from the
# wall clock.  random.seed() only accepts None/int/float/str/bytes/bytearray,
# so pass the POSIX timestamp rather than the datetime object itself.
# Note: this does not seed TensorFlow's weight initialisers.
seed(datetime.now().timestamp())

In [None]:
############################################################

In [120]:
# use only stack traces embeddings -- target tag: "Bug"
target_labels = df_tags["Bug"]
(train_issues_0,
 train_issues_1,
 test_issues,
 test_labels) = split_dataset2(issues_embeddings,target_labels)

# size of the smaller training class
batch_size = min(np.shape(train_issues_0)[0],np.shape(train_issues_1)[0])
print(batch_size)

# the validation "batch" is the whole held-out set, shuffled, with one-hot labels
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

43


In [127]:
# sum the validation confusion matrices of 10 independently initialised trainings
total_confusion = np.zeros((2,2))

for i in range(10):
    total_confusion += my_classifier_nn2(train_issues_0,train_issues_1,4,0.01,
                                         2*batch_size,1000,v_batch,v_labels)

# rows = true class, columns = predicted class
(tn,fp),(fn,tp) = total_confusion

acc = (tn+tp)/np.sum(total_confusion)
# geometric mean of the two per-class recalls
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")

accuracy 0.5760869565217391
GM 0.550565119871857




In [None]:
############################################################

In [128]:
# second implementation -- target tag: "Prio - High"
target_labels = df_tags["Prio - High"]
(train_issues_0,
 train_issues_1,
 test_issues,
 test_labels) = split_dataset2(issues_embeddings,target_labels)

# size of the smaller training class
batch_size = min(np.shape(train_issues_0)[0],np.shape(train_issues_1)[0])
print(batch_size)

# the validation "batch" is the whole held-out set, shuffled, with one-hot labels
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

74


In [131]:
# run the neural network using only the stack traces embeddings:
# sum the validation confusion matrices of 10 independently initialised trainings
total_confusion = np.zeros((2,2))

for i in range(10):
    total_confusion += my_classifier_nn2(train_issues_0,train_issues_1,
                                         4,0.01,2*batch_size,50,v_batch,v_labels)

# rows = true class, columns = predicted class
(tn,fp),(fn,tp) = total_confusion

acc = (tn+tp)/np.sum(total_confusion)
# geometric mean of the two per-class recalls
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")

accuracy 0.48695652173913045
GM 0.5349618776383189




In [None]:
############################################################

In [132]:
# second implementation -- target tag: "Google Play or Beta feedback"
target_labels = df_tags["Google Play or Beta feedback"]
(train_issues_0,
 train_issues_1,
 test_issues,
 test_labels) = split_dataset2(issues_embeddings,target_labels)

# size of the smaller training class
batch_size = min(np.shape(train_issues_0)[0],np.shape(train_issues_1)[0])
print(batch_size)

# the validation "batch" is the whole held-out set, shuffled, with one-hot labels
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

151


In [135]:
# run the neural network using only the stack traces embeddings:
# sum the validation confusion matrices of 10 independently initialised trainings
total_confusion = np.zeros((2,2))

for i in range(10):
    total_confusion += my_classifier_nn2(train_issues_0,train_issues_1,
                                         4,0.01,2*batch_size,50,v_batch,v_labels)

# rows = true class, columns = predicted class
(tn,fp),(fn,tp) = total_confusion

acc = (tn+tp)/np.sum(total_confusion)
# geometric mean of the two per-class recalls
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")

accuracy 0.5978260869565217
GM 0.6109215597192633


