# TAG CLASSIFICATION

In the second version of tag classifications we will try to predict the tag labels for issues based on descriptions and stack traces. More precise, we will use our word embeddings and stack traces embeddings which have already been created to compute the arithmetic representation of the issue and then based on that we will try to predict the type of issue. 

Still, we use logistic regression. Logistic regression used for binary classification but using the method one vs rest we can train one logistic regression model for each label.  Maybe one better version will be using the multinomial logistic regression

Moreover, for the arithmetic representation of issues first we will use the average of the word embeddings concatenated with the average of the stack traces embeddings. For those issues missing stack traces we will just zero padding in order to have fixed size. 

Maybe in later stage we will try to improve the formula using a weighted average based on TF-IDF method.

## Pre Processing

First, load the word embeddings and stack traces embedding matrices, the word's and trace's vocabulary and for every issue the corresponding tags and description and stack trace if exists.

In [69]:
import os
import re
import json
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

### Load and Clean Data

In [70]:
def load_dict(path_to_file):
    temp_dict = dict()
    with open(path_to_file) as file:
        lines = file.readlines()
        for line in lines:
            temp   = str(line)
            values = temp.split(',')
            temp_dict[values[0]] = int(values[1].replace("\n",""))
    
    return temp_dict
    

In [71]:
def load_issues(dir_path,tag_labels,descriptions,stack_traces):
    
    for fname in os.listdir(dir_path):
        with open(os.path.join(dir_path,fname)) as json_file:
            
            data = json.load(json_file)
            for issue in data:
                
                tags = issue['tags']
                for i in range(len(tags)):
                    tags[i] = tags[i].strip()
                description = issue['description']
                stack_trace = issue['stack_trace']
                name        = issue['name']
                
                if tags != [] and stack_trace !=[] : #(description != [] or stack_trace != []):
                    tag_labels.append(tags)
                    descriptions.append(description)
                    stack_traces.append(stack_trace)

In [72]:
# copy paste from stack_trace_embedding notebook

def clean_stack_trace(stack_trace):
    
    clean_stack_trace = []
    temp_stack        = stack_trace.split(" at ")[1:]
    
    to_find = re.compile("[|,|<|>]|/|\|=")
    
    #find where each function ends and keep only the path
    for f in temp_stack:
        temp      = f.find(')')
        temp_file = f[0:temp]
        
        # check the punctuations in order to avoid anything else
        match_obj = to_find.search(temp_file)
        if match_obj == None:
            filename = find_filename(temp_file)
            if filename != '':
                clean_stack_trace.append(filename)
                
    return clean_stack_trace

In [73]:
# copy paste from stack_trace_embedding notebook

def find_filename(value):
    filename = ""
    words    = value.split("(")
    if len(words)>=2:
        parts = words[0].split(".")
        filename = ".".join(parts[0:-1])
    return filename


In [74]:
# copy paste from word embeddings notebook

def clean_description(description):
    
    # define stop words
    all_stopwords = set(stopwords.words('english'))
    
    #define translator to translate punctuation to white space
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    
    #join all lines into one sentence
    sentence     = ' '.join(description)
    
    #translate punctuation
    new_sentence = sentence.translate(translator)
    
    #split the sentense in words
    words = new_sentence.split()
    
    words_sw = [w.lower() for w in words if not w.lower() in all_stopwords and len(w)>1]
    
    return words_sw


In [75]:
# copy paste from word embeddings notebook

def stemming_data(descriptions):
    
    stemmer = PorterStemmer()
    
    for desc in descriptions:
        for counter in range(len(desc)):
            if desc[counter].isalpha():
                desc[counter] = stemmer.stem(desc[counter])
            

In [76]:
def clean_data(descriptions,stack_traces,use_stemming):
    
    clean_descriptions = list()
    clean_stack_traces = list()
    
    for i in range(len(descriptions)):
        
        temp_desc   = descriptions[i]
        temp_trace  = stack_traces[i]
        stack_trace = []
        
        
        if temp_trace != []:
            if len(temp_trace)>1:
                stack_trace = clean_stack_trace(''.join(temp_trace))
            else:
                stack_trace = clean_stack_trace(temp_trace[0])
            
        if temp_desc  != []:
            clean_desc = clean_description(temp_desc)
            
        clean_descriptions.append(clean_desc)
        clean_stack_traces.append(stack_trace)
            
    if use_stemming == True:
        stemming_data(clean_descriptions)
        
    return clean_descriptions,clean_stack_traces

### Compute Arithmetic Representations for Issues

In [77]:
def compute_embeddings(arithmetic_descriptions,arithmetic_stack_traces,
                       word_embedding_matrix,stack_embedding_matrix,use_stacks):
    
    descriptions_dim  = np.shape(word_embedding_matrix)[1]
    if use_stacks == True:
        stack_traces_dim  = np.shape(stack_embedding_matrix)[1]
    else:
        stack_traces_dim  = 0
    
    num_issues        = len(arithmetic_descriptions)
    issues_embeddings = np.zeros((num_issues,descriptions_dim+stack_traces_dim))
    
    for counter in range(len(arithmetic_descriptions)):
        
        temp_desc   = arithmetic_descriptions[counter]
        temp_stack  = arithmetic_stack_traces[counter]
        total_words = 0
        total_funcs = 0
        
        for word in temp_desc:
            if word != -2:
                total_words += 1
                issues_embeddings[counter][0:descriptions_dim] = issues_embeddings[counter][0:descriptions_dim] + word_embedding_matrix[word]
        
        if total_words != 0 :
            issues_embeddings[counter]    /= total_words
        
        if use_stacks == True:
            for func in temp_stack:
                if func != -2:
                    issues_embeddings[counter][descriptions_dim:] = issues_embeddings[counter][descriptions_dim:] + stack_embedding_matrix[func]
                    total_funcs += 1
                
            if total_funcs != 0:
                issues_embeddings[counter][descriptions_dim:] = issues_embeddings[counter][descriptions_dim:] / total_funcs 
            
    return issues_embeddings    

In [78]:
use_stemming = False

In [79]:
# load word embeddings
word_embedding_matrix = np.loadtxt('word_embeddings_good.txt', dtype=np.float64)

# load stack traces embeddings 
stack_embedding_matrix = np.loadtxt('stack_embeddings_new.txt', dtype=np.float64)

# load vocabularies
word2id_path = "vocabulary_good.txt"
func2id_path = "stack_traces_vocabulary.txt"

word2id = load_dict(word2id_path)
func2id = load_dict(func2id_path)

#load tags and descriptions
dir_path     = '/home/kostas/Documents/thesis/data_1'
tag_labels   = list()
descriptions = list()
stack_traces = list()

# load issues
load_issues(dir_path,tag_labels,descriptions,stack_traces)

# transform data to arithmetic representation
clean_descriptions,clean_stack_traces = clean_data(descriptions,stack_traces,use_stemming)

del descriptions
del stack_traces

#arithmetic_transformations
arithmetic_descriptions = [[word2id.get(word,-2) for word in desc]   for desc in clean_descriptions]
arithmetic_stack_traces = [[func2id.get(func,-2) for func in trace] for trace in clean_stack_traces]

del clean_descriptions
del clean_stack_traces

issues_embeddings  = compute_embeddings(arithmetic_descriptions,arithmetic_stack_traces,
                                        word_embedding_matrix,stack_embedding_matrix,True)

## Classification

In [80]:
tags = ['Bug','Google Play or Beta feedback','Feedback required','Feature Request','Prio - High','Frontend Design']
no_tags = 6
np_tags = np.zeros((len(arithmetic_descriptions),no_tags))

for counter in range(len(tag_labels)):
    for counter_2,value in enumerate(tags):
        if value in tag_labels[counter]:
            np_tags[counter][counter_2] = 1
            
df_tags = pd.DataFrame(np_tags, columns = tags)

### Dummy Classifier

In [81]:
from sklearn.dummy import DummyClassifier

In [82]:
def my_dummy_classifier(tags,df_tags,issues_embeddings,cl_label,n_splits):
    
    target_label    = df_tags[cl_label]
    dummy_clf       = DummyClassifier(strategy = "uniform",random_state=0)
    total_confusion = np.zeros((2,2))
    
    #fit model 
    dummy_clf.fit(issues_embeddings,target_label)
    predictions = dummy_clf.predict(issues_embeddings)
    total_confusion = confusion_matrix(target_label,predictions)

    print(total_confusion)
    print("accuracy = TP+TN/(TP+TN+FP+FN)",(total_confusion[0][0]+total_confusion[1][1])/np.sum(total_confusion))
    print("custom metric",np.sqrt((total_confusion[0][0]/(total_confusion[0][0]+total_confusion[0][1]))*
                                  (total_confusion[1][1]/(total_confusion[1][1]+total_confusion[1][0]))))

### Logistic Regression Classifier

In [16]:
def my_classifier(tags,df_tags,issues_embeddings,cl_label,n_splits):
    
    target_label    = df_tags[cl_label]
    counter_1       = np.sum(target_label)
    #print(counter_1)
    #weight_0        = counter_1/target_label.shape[0]
    #weight_1        = (1 - weight_0)
    weight_0        = 1/(target_label.shape[0]-counter_1)
    weight_1        = 1/counter_1
    w               = {0:weight_0,1:weight_1}
    skf             = StratifiedKFold(n_splits)
    model           = LogisticRegression(solver='lbfgs',class_weight = w)
    total_confusion = np.zeros((2,2))
    for train_index, test_index in skf.split(issues_embeddings,target_label):
        
        X_train,X_test = issues_embeddings[train_index], issues_embeddings[test_index]
        y_train,y_test = target_label[train_index], target_label[test_index]
        
        #fit model 
        model.fit(X_train,y_train)
        predictions = model.predict(X_test)
        
        #print(confusion_matrix(y_test,predictions))
        total_confusion = total_confusion+confusion_matrix(y_test,predictions)
        
    print(total_confusion)
    print("accuracy = TP+TN/(TP+TN+FP+FN)",(total_confusion[0][0]+total_confusion[1][1])/np.sum(total_confusion))
    print("custom metric",np.sqrt((total_confusion[0][0]/(total_confusion[0][0]+total_confusion[0][1]))*
                                  (total_confusion[1][1]/(total_confusion[1][1]+total_confusion[1][0]))))
    

In [47]:
my_classifier(tags,df_tags,issues_embeddings,"Prio - High",10)

[[211. 176.]
 [ 47.  38.]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.527542372881356
custom metric 0.49370563095634196


In [17]:
my_dummy_classifier(tags,df_tags,issues_embeddings,"Bug",10)

[[ 28  30]
 [198 216]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.5169491525423728
custom metric 0.5018705639589911


## Neural Network Classifier

In [83]:
import time
import random
from random import seed
from random import randint
from datetime import datetime
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()
from sklearn.model_selection import StratifiedShuffleSplit

In [101]:
def split_dataset(issues_embeddings,target_labels):
    
    sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, random_state = 0)
    
    for train_index, test_index in sss.split(issues_embeddings,target_labels):
        X_train,X_test = issues_embeddings[train_index], issues_embeddings[test_index]
        Y_train,Y_test = target_labels[train_index], target_labels[test_index]
    
    return X_train,Y_train,X_test,Y_test

In [102]:
def generate_batch(issues_embeddings,target_labels,batch_size):
    
    batch  = np.ndarray(shape = (batch_size,np.shape(issues_embeddings)[1]), dtype = np.float64)
    labels = np.ndarray(shape = (batch_size,2), dtype = np.float64)
    
    seed(datetime.now())
    
    issues_to_use = random.sample([i for i in range(np.shape(issues_embeddings)[0])],batch_size)
    
    for counter,value in enumerate(issues_to_use):
        batch[counter][:]  = issues_embeddings[value][:]
        # label_0
        labels[counter][0] = 1-target_labels.iloc[value]
        # label_1
        labels[counter][1] =   target_labels.iloc[value]
    return batch,labels

In [113]:
def my_classifier_nn(issues_embeddings,target_labels,hidden_layer_dim,learning_rate,
                     batch_size,epochs,v_batch,v_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings)[1]])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,2])
    
    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings)[1],hidden_layer_dim],stddev = 1.0,dtype=tf.float64),name='W1')
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0,dtype=tf.float64),name = 'b1')
    
    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],stddev = 1.0,dtype=tf.float64),name = 'W2')
    b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')
    
    ##neural network's functions
    hidden_layer = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer = tf.nn.tanh(hidden_layer)
    
    output_layer = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer_2 = tf.nn.softmax(output_layer)
    
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    init = tf.initialize_all_variables()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(epochs):
            
            # generate batch.
            batch_x,batch_y = generate_batch(issues_embeddings,target_labels,batch_size)
            
            # train the model
            _,loss = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
        
        # validation
        y_preds = sess.run(output_layer_2,feed_dict={X_train:v_batch,Y_train:v_labels})
        for i in range(len(y_preds)):
            print(y_preds[i],v_labels[i])

In [117]:
train_issues,train_labels,test_issues,test_labels = split_dataset(issues_embeddings,target_labels)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
my_classifier_nn(train_issues,train_labels,8,0.1,64,500,v_batch,v_labels)

[0.08677736 0.91322264] [1. 0.]
[0.11932975 0.88067025] [0. 1.]
[0.02914446 0.97085554] [0. 1.]
[0.31196683 0.68803317] [0. 1.]
[0.57121026 0.42878974] [0. 1.]
[0.03413461 0.96586539] [0. 1.]
[0.04087443 0.95912557] [0. 1.]
[0.08751638 0.91248362] [0. 1.]
[0.00226097 0.99773903] [0. 1.]
[0.0332156 0.9667844] [1. 0.]
[0.23383637 0.76616363] [0. 1.]
[0.13802276 0.86197724] [0. 1.]
[0.06656157 0.93343843] [0. 1.]
[0.0866391 0.9133609] [0. 1.]
[0.01597245 0.98402755] [0. 1.]
[0.08404197 0.91595803] [0. 1.]
[0.07984018 0.92015982] [0. 1.]
[0.08702431 0.91297569] [0. 1.]
[0.08423335 0.91576665] [0. 1.]
[0.19600063 0.80399937] [0. 1.]
[0.02734662 0.97265338] [0. 1.]
[0.97258412 0.02741588] [0. 1.]
[0.00149977 0.99850023] [1. 0.]
[0.00581731 0.99418269] [0. 1.]
[0.0314267 0.9685733] [0. 1.]
[0.08684509 0.91315491] [0. 1.]
[0.08742098 0.91257902] [0. 1.]
[0.08870444 0.91129556] [0. 1.]
[0.12781829 0.87218171] [0. 1.]
[0.00166635 0.99833365] [0. 1.]
[0.08816083 0.91183917] [0. 1.]
[0.30077481 0.

In [86]:
def generate_batch_v2(issues_embeddings,target_labels,batch_size):
    
    batch  = np.ndarray(shape = (batch_size,np.shape(issues_embeddings)[1]), dtype = np.float64)
    labels = np.ndarray(shape = (batch_size,1), dtype = np.float64)
    
    seed(datetime.now())
    
    issues_to_use = random.sample([i for i in range(np.shape(issues_embeddings)[0])],batch_size)
    
    for counter,value in enumerate(issues_to_use):
        batch[counter][:]  = issues_embeddings[value][:]
        labels[counter,0] = target_labels.iloc[value]
        
    return batch,labels

In [88]:
def my_classifier_nn_v2(issues_embeddings,target_labels,hidden_layer_dim,learning_rate,
                        batch_size,epochs,v_batch,v_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings)[1]])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,1])
    
    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings)[1],hidden_layer_dim],stddev = 1.0,dtype=tf.float64),name='W1')
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0,dtype=tf.float64),name = 'b1')
    
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,1],stddev = 1.0,dtype=tf.float64),name = 'W2')
    b2 = tf.Variable(tf.random_normal([1],dtype=tf.float64),name = 'b2')
    
    # neural network's functions
    hidden_layer = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer = tf.nn.tanh(hidden_layer)
    
    output_layer  = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer2 = tf.nn.sigmoid(output_layer) 
    
    cost_func = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    init = tf.initialize_all_variables()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(epochs):
            # generate batch.
            batch_x,batch_y = generate_batch_v2(issues_embeddings,target_labels,batch_size)
            # train the model
            _,loss = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
            
        
        y_preds = sess.run(output_layer,feed_dict = {X_train:v_batch,Y_train:v_labels})
        for i in range(len(y_preds)):
            print(y_preds[i],v_labels[i])

In [None]:
target_labels = df_tags["Bug"]
train_issues,train_labels,test_issues,test_labels = split_dataset(issues_embeddings,target_labels)
v_batch  = np.reshape(test_issues,(-1,np.shape(issues_embeddings)[1]))
v_labels = test_labels.to_numpy()
v_labels = np.reshape(v_labels,(-1,1))
preds = my_classifier_nn_v2(train_issues,train_labels,8,0.1,64,800,v_batch,v_labels)