# TAG CLASSIFICATION

In this second version of tag classification we try to predict the tag labels of issues from their descriptions and stack traces. More precisely, we use the word embeddings and stack-trace embeddings that have already been created to compute an arithmetic (numerical) representation of each issue, and then use that representation to predict the type of the issue.

We still use logistic regression. Logistic regression is a binary classifier, but with the one-vs-rest method we can train one logistic regression model for each label. A better version might use multinomial logistic regression.

Moreover, for the arithmetic representation of an issue we first use the average of its word embeddings concatenated with the average of its stack-trace embeddings. For issues that are missing stack traces we simply pad with zeros so that every representation has a fixed size.

At a later stage we may try to improve the formula by using a weighted average based on the TF-IDF method.

## Pre Processing

First, load the word-embedding and stack-trace-embedding matrices, the word and stack-trace vocabularies and, for every issue, the corresponding tags, description and stack trace (if one exists).

In [1]:
import os
import re
import json
import nltk
import string
import numpy as np
import pandas as pd
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

### Load and Clean Data

In [2]:
def load_dict(path_to_file):
    """Load a ``key,value`` vocabulary file into a dictionary.

    Every non-blank line is split on commas; the first field becomes the key
    and the second field is parsed as an integer value.

    Args:
        path_to_file: Path of the text file to read.

    Returns:
        dict mapping each key (str) to its integer value.

    Raises:
        IndexError: if a non-blank line contains no comma.
        ValueError: if the second field is not an integer.
    """
    temp_dict = dict()
    with open(path_to_file) as file:
        # Iterate the file lazily instead of reading every line up front.
        for line in file:
            # Blank lines (e.g. a trailing empty line) carry no entry; skip
            # them instead of failing on the missing second field.
            if not line.strip():
                continue
            values = line.split(',')
            # int() ignores surrounding whitespace, including the newline.
            temp_dict[values[0]] = int(values[1])

    return temp_dict

In [3]:
def load_issues(dir_path,tag_labels,descriptions,stack_traces):
    """Read every JSON file in ``dir_path`` and collect the usable issues.

    Each file is expected to hold a list of issue objects with the keys
    ``'tags'``, ``'description'`` and ``'stack_trace'``. An issue is kept only
    when none of those three values equals the empty list. Tags are stripped
    of surrounding whitespace before being stored.

    Args:
        dir_path: Directory whose entries are all opened as JSON files.
        tag_labels: Output list; the tag list of every kept issue is appended.
        descriptions: Output list; the description of every kept issue is appended.
        stack_traces: Output list; the stack trace of every kept issue is appended.

    Returns:
        None. The three output lists are extended in place, in lockstep.
    """
    for fname in os.listdir(dir_path):
        with open(os.path.join(dir_path,fname)) as json_file:
            data = json.load(json_file)

        for issue in data:
            tags        = [tag.strip() for tag in issue['tags']]
            description = issue['description']
            stack_trace = issue['stack_trace']
            # The issue name is never used, so it is not looked up: issues
            # without a 'name' key are still loaded.

            if tags != [] and stack_trace != [] and description != []:
                tag_labels.append(tags)
                descriptions.append(description)
                stack_traces.append(stack_trace)

In [4]:
# copy paste from stack_trace_embedding notebook

def clean_stack_trace(stack_trace):
    """Extract the qualified names of the frames found in a raw stack trace.

    The trace is split on the ``" at "`` marker. When that yields no frames,
    a fallback scans the space-separated tokens for ones that look like
    ``pkg.Class.method(File.java:N)``. Each frame is cut at its first ``)``,
    rejected if it contains ``|``, ``,``, ``<`` or ``>``, and reduced with
    ``find_filename``.

    Args:
        stack_trace: The stack trace as a single string.

    Returns:
        list of str, one entry per accepted frame, in order of appearance.
    """
    cleaned_frames = []
    # Normalise a literal "\tat" and a quote directly before "at" so that
    # the " at " split below sees them.
    normalised     = stack_trace.replace(r'\tat','  at').replace('\"at ',' at ')
    frames         = normalised.split(" at ")[1:]

    if not frames:
        # Fallback: no " at " separators; look for frame-shaped tokens.
        for token in normalised.split(' '):
            if token.count('.')>2 and token.find('(') != -1 and token.find(')') != -1:
                # ".java:" must sit between the parentheses.
                if token.find('(') < token.find('.java:') < token.find(')'):
                    pieces = token.split()
                    if len(pieces)>1:
                        frames.append(pieces[1])
                    else:
                        frames.append(token)

    # Raw string: "\|" is not a valid escape in a normal string literal.
    to_find = re.compile(r"[|,|<|>]|\|=")

    # find where each function ends and keep only the path
    for frame in frames:
        call_site = frame[0:frame.find(')')]

        # check the punctuation in order to avoid anything else
        if to_find.search(call_site) is None:
            filename = find_filename(call_site)
            if filename != '':
                cleaned_frames.append(filename)

    return cleaned_frames

In [5]:
# copy paste from stack_trace_embedding notebook

def find_filename(value):
    """Return the dotted prefix of the text that precedes the first ``(``.

    The part of ``value`` before the first ``(`` is taken and its last
    dot-separated component is dropped. An empty string is returned when
    ``value`` has no ``(`` or when nothing remains after dropping.

    Args:
        value: Frame text such as ``"pkg.Class.method(File.java:10"``.

    Returns:
        str, e.g. ``"pkg.Class"`` for the example above.
    """
    head, paren, _ = value.partition("(")
    if not paren:
        return ""
    # Everything before the last dot; "" when the head has no dot at all.
    return head.rpartition(".")[0]


In [6]:
# copy paste from word embeddings notebook

def clean_description(description):
    """Tokenise an issue description into lower-cased content words.

    The lines are joined with spaces, every punctuation character is
    replaced by a space, and the text is split on whitespace. Tokens of
    length 1 and English stop words are dropped.

    Args:
        description: Iterable of strings (the lines of the description).

    Returns:
        list of lower-cased tokens, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))

    # map every punctuation character to a blank
    punct_to_space = str.maketrans(string.punctuation, ' '*len(string.punctuation))

    tokens = ' '.join(description).translate(punct_to_space).split()

    kept = []
    for token in tokens:
        lowered = token.lower()
        if len(token) > 1 and lowered not in english_stopwords:
            kept.append(lowered)

    return kept


In [7]:
# copy paste from word embeddings notebook

def stemming_data(descriptions):
    """Stem the alphabetic tokens of every description in place.

    Args:
        descriptions: List of token lists. Tokens for which ``str.isalpha``
            is true are replaced by their Porter stem; all other tokens are
            left untouched.

    Returns:
        None. ``descriptions`` is modified in place.
    """
    porter = PorterStemmer()

    for tokens in descriptions:
        for position, token in enumerate(tokens):
            if token.isalpha():
                tokens[position] = porter.stem(token)
            

In [8]:
def clean_data(descriptions,stack_traces,use_stemming):
    """Clean the raw descriptions and stack traces of all issues.

    Args:
        descriptions: List with one raw description (list of lines) per issue.
        stack_traces: List with one raw stack trace (list of strings) per
            issue, aligned with ``descriptions`` by index.
        use_stemming: When equal to ``True`` the cleaned description tokens
            are stemmed with ``stemming_data``.

    Returns:
        Tuple ``(clean_descriptions, clean_stack_traces)``: two lists aligned
        with the inputs. An issue whose input equals ``[]`` gets ``[]``.
    """
    cleaned_descs  = []
    cleaned_traces = []

    for position, raw_desc in enumerate(descriptions):
        raw_trace = stack_traces[position]

        if raw_trace != []:
            # several pieces are joined into one string before cleaning
            trace_text = ' '.join(raw_trace) if len(raw_trace)>1 else raw_trace[0]
            frames     = clean_stack_trace(trace_text)
        else:
            frames = []

        words = clean_description(raw_desc) if raw_desc != [] else []

        cleaned_descs.append(words)
        cleaned_traces.append(frames)

    if use_stemming == True:
        stemming_data(cleaned_descs)

    return cleaned_descs,cleaned_traces

### Compute Arithmetic Representations for Issues

In [9]:
def compute_embeddings(arithmetic_descriptions,arithmetic_stack_traces,
                       word_embedding_matrix,stack_embedding_matrix,use_stacks):
    """Build one embedding row per issue by averaging its token embeddings.

    The first ``word_dim`` columns of a row hold the mean of the word
    embeddings of the issue's known words. When ``use_stacks`` equals
    ``True`` the remaining columns hold the mean of the stack-trace
    embeddings of its known frames. Ids equal to ``-2`` are ignored; a part
    with no known ids stays all zeros.

    Args:
        arithmetic_descriptions: List of word-id lists, one per issue.
        arithmetic_stack_traces: List of frame-id lists, aligned by index.
        word_embedding_matrix: 2-D array indexed by word id.
        stack_embedding_matrix: 2-D array indexed by frame id; only read
            when ``use_stacks`` equals ``True``.
        use_stacks: Whether to append the stack-trace part.

    Returns:
        ``numpy.ndarray`` of shape ``(num_issues, word_dim + stack_dim)``,
        where ``stack_dim`` is 0 when stacks are not used.
    """
    with_stacks = (use_stacks == True)
    word_dim    = np.shape(word_embedding_matrix)[1]
    stack_dim   = np.shape(stack_embedding_matrix)[1] if with_stacks else 0

    issues_embeddings = np.zeros((len(arithmetic_descriptions),word_dim+stack_dim))

    for row, word_ids in enumerate(arithmetic_descriptions):
        frame_ids = arithmetic_stack_traces[row]

        # description part: running sum, then divide by the number of known words
        known_words = 0
        for word_id in word_ids:
            if word_id == -2:
                continue
            known_words += 1
            issues_embeddings[row,:word_dim] += word_embedding_matrix[word_id]
        if known_words:
            # the stack part is still zero here, so scaling the whole row is safe
            issues_embeddings[row] /= known_words

        if not with_stacks:
            continue

        # stack-trace part: same scheme on the trailing columns
        known_frames = 0
        for frame_id in frame_ids:
            if frame_id == -2:
                continue
            issues_embeddings[row,word_dim:] += stack_embedding_matrix[frame_id]
            known_frames += 1
        if known_frames:
            issues_embeddings[row,word_dim:] /= known_frames

    return issues_embeddings

In [10]:
# Switch for Porter stemming of the cleaned description tokens; it is passed
# to clean_data() when the issues are loaded.
use_stemming = True

In [11]:
# load word embeddings
word_embedding_matrix = np.loadtxt('../results_project_3/word_embeddings_g1.txt', dtype=np.float64)

# load stack traces embeddings
stack_embedding_matrix = np.loadtxt('../results_project_3/stack_embeddings_g.txt', dtype=np.float64)

# load vocabularies
word2id_path = "../outputs_project_3/words_vocabulary_g1.txt"
func2id_path = "../outputs_project_3/stacktraces_vocabulary_g.txt"

word2id = load_dict(word2id_path)
func2id = load_dict(func2id_path)

# load tags and descriptions
dir_path     = '../spring'
tag_labels   = list()
descriptions = list()
stack_traces = list()

# load issues
load_issues(dir_path,tag_labels,descriptions,stack_traces)

# clean the raw text (tokenise descriptions, extract stack frames)
clean_descriptions,clean_stack_traces = clean_data(descriptions,stack_traces,use_stemming)

clean_descriptions_2 = list()
clean_stack_traces_2 = list()
clean_tags_2         = list()

# (description, stack trace) pairs already kept; a set makes the duplicate
# check O(1) per issue instead of a scan over everything kept so far
seen_issues = set()

# drop issues with no usable stack trace and duplicate issues
for counter,trace in enumerate(clean_stack_traces):

    # a trace is usable only if at least one frame is in the vocabulary
    # (an empty trace is therefore skipped as well)
    if not any(func2id.get(frame,-2) != -2 for frame in trace):
        continue

    # duplicates: same cleaned description AND same cleaned stack trace
    issue_key = (tuple(clean_descriptions[counter]),tuple(trace))
    if issue_key in seen_issues:
        continue
    seen_issues.add(issue_key)

    clean_stack_traces_2.append(trace)
    clean_descriptions_2.append(clean_descriptions[counter])
    clean_tags_2.append(tag_labels[counter])

del seen_issues

del clean_descriptions
del clean_stack_traces

del descriptions
del stack_traces

# arithmetic transformations: tokens -> vocabulary ids (-2 marks an unknown token)
arithmetic_descriptions = [[word2id.get(word,-2) for word in desc]   for desc in clean_descriptions_2]
arithmetic_stack_traces = [[func2id.get(func,-2) for func in trace] for trace in clean_stack_traces_2]

del clean_descriptions_2
del clean_stack_traces_2

# use_stacks=False: only the description part is embedded here, so the result
# has as many columns as the word embedding matrix
issues_embeddings  = compute_embeddings(arithmetic_descriptions,arithmetic_stack_traces,
                                        word_embedding_matrix,stack_embedding_matrix,False)

In [12]:
# sanity check: (number of kept issues, embedding width)
print(np.shape(issues_embeddings))

(1387, 64)


## Classification

In [13]:
# Rebind tag_labels to the de-duplicated tag lists so the cells below can
# keep using the old name. No copy is made: both names refer to one list.
tag_labels = clean_tags_2

In [14]:
# Tags of interest; exactly one project's list should be active.
#tags = ['Bug','Google Play or Beta feedback','Feedback required','Feature Request','Prio - High','Frontend Design']
#tags = ['>test-failure','Team:Distributed','>bug',':Distributed/Snapshot/Restore']
tags = ['type: bug','for: stackoverflow','status: invalid','for: external-project']
# Derived from the active list so that switching projects cannot leave the
# indicator matrix with the wrong number of columns.
no_tags = len(tags)
np_tags = np.zeros((len(arithmetic_descriptions),no_tags))

# np_tags[i][j] == 1 when issue i carries tags[j]
for counter,issue_tags in enumerate(tag_labels):
    for counter_2,value in enumerate(tags):
        if value in issue_tags:
            np_tags[counter][counter_2] = 1

df_tags = pd.DataFrame(np_tags, columns = tags)

### Dummy Classifier

In [15]:
from sklearn.dummy import DummyClassifier

In [16]:
def my_dummy_classifier(tags,df_tags,issues_embeddings,cl_label,n_splits):
    """Print the scores of a uniform-random baseline for one tag.

    A ``DummyClassifier(strategy="uniform", random_state=0)`` is fitted on
    all issues and evaluated on the same issues. The confusion matrix, the
    accuracy and the geometric mean of the two per-class recalls are printed.

    Args:
        tags: Unused; kept for a signature parallel to ``my_classifier``.
        df_tags: DataFrame with one 0/1 column per tag.
        issues_embeddings: Feature matrix, one row per issue.
        cl_label: Name of the ``df_tags`` column to predict.
        n_splits: Unused; kept for a signature parallel to ``my_classifier``.

    Returns:
        None. Results are printed.
    """
    y_true   = df_tags[cl_label]
    baseline = DummyClassifier(strategy = "uniform",random_state=0)

    # fit and predict on the full data set (no cross-validation here)
    baseline.fit(issues_embeddings,y_true)
    cm = confusion_matrix(y_true,baseline.predict(issues_embeddings))

    print(cm)
    # rows are true classes, columns are predicted classes
    tn, fp = cm[0][0], cm[0][1]
    fn, tp = cm[1][0], cm[1][1]
    print("accuracy = TP+TN/(TP+TN+FP+FN)",(tn+tp)/np.sum(cm))
    print("custom metric",np.sqrt((tn/(tn+fp))*(tp/(tp+fn))))
    print("\n")

### Logistic Regression Classifier

In [17]:
def my_classifier(tags,df_tags,issues_embeddings,cl_label,n_splits):
    """Cross-validate a class-weighted logistic regression for one tag.

    Runs stratified k-fold cross-validation, sums the per-fold confusion
    matrices and prints: the summed matrix, the accuracy, the geometric mean
    ("GM") of the two per-class recalls, the fraction of true class-0 samples
    predicted as class 0 ("Pre"), and the mean per-fold ROC AUC.

    Args:
        tags: Unused.
        df_tags: DataFrame with one 0/1 column per tag.
        issues_embeddings: ``numpy.ndarray`` feature matrix, one row per issue.
        cl_label: Name of the ``df_tags`` column to predict.
        n_splits: Number of folds for ``StratifiedKFold``.

    Returns:
        None. Results are printed.
    """
    target_label    = df_tags[cl_label]
    counter_1       = np.sum(target_label)
    # inverse class frequencies, so the rarer class weighs more
    weight_0        = 1/(target_label.shape[0]-counter_1)
    weight_1        = 1/counter_1
    w               = {0:weight_0,1:weight_1}
    skf             = StratifiedKFold(n_splits)
    model           = LogisticRegression(solver='lbfgs',class_weight = w)
    total_confusion = np.zeros((2,2))
    counter         = 0
    auc             = 0
    for train_index, test_index in skf.split(issues_embeddings,target_label):

        X_train,X_test = issues_embeddings[train_index], issues_embeddings[test_index]
        # The split yields positions, so select labels positionally; plain
        # [] on a Series is label-based and only matches a default index.
        y_train,y_test = target_label.iloc[train_index], target_label.iloc[test_index]

        # fit model
        model.fit(X_train,y_train)
        predictions = model.predict(X_test)

        total_confusion = total_confusion+confusion_matrix(y_test,predictions)

        # probability of class 1 drives the ROC curve
        fpr,tpr,thresholds = metrics.roc_curve(y_test,model.predict_proba(X_test)[:,1])

        auc     = auc + metrics.auc(fpr,tpr)
        counter = counter +1

    print(total_confusion)
    print("accuracy = TP+TN/(TP+TN+FP+FN)",(total_confusion[0][0]+total_confusion[1][1])/np.sum(total_confusion))
    print("GM",np.sqrt((total_confusion[0][0]/(total_confusion[0][0]+total_confusion[0][1]))*
                                  (total_confusion[1][1]/(total_confusion[1][1]+total_confusion[1][0]))))
    print("Pre", total_confusion[0][0]/(total_confusion[0][1]+total_confusion[0][0]))
    print("AUC", auc/counter)
    print("\n")

In [None]:
# project 3: random baseline for every tag of interest
for baseline_label in ("type: bug","for: stackoverflow","status: invalid","for: external-project"):
    my_dummy_classifier(tags,df_tags,issues_embeddings,baseline_label,10)

# project 2
#my_dummy_classifier(tags,df_tags,issues_embeddings,">test-failure",10)
#my_dummy_classifier(tags,df_tags,issues_embeddings,">bug",10)
#my_dummy_classifier(tags,df_tags,issues_embeddings,"Team:Distributed",10)
#my_dummy_classifier(tags,df_tags,issues_embeddings,":Distributed/Snapshot/Restore",10)

# project 1
#my_dummy_classifier(tags,df_tags,issues_embeddings,"Bug",10)
#my_dummy_classifier(tags,df_tags,issues_embeddings,"Google Play or Beta feedback",10)
#my_dummy_classifier(tags,df_tags,issues_embeddings,"Prio - High",10)

In [None]:
# without descriptions: keep only the stack-trace columns of every issue row
# NOTE(review): this needs issues_embeddings to contain the stack-trace part,
# i.e. compute_embeddings(..., use_stacks=True). With use_stacks=False the
# slice below is empty and the assignment cannot fill the row.
num_issues           = np.shape(issues_embeddings)[0]
stack_traces_dim     = np.shape(stack_embedding_matrix)[1]
descriptions_dim     = np.shape(word_embedding_matrix)[1]
issues_embeddings_st = np.zeros((num_issues,stack_traces_dim))
for counter in range(num_issues):
    # columns from descriptions_dim onwards are the stack-trace average
    issues_embeddings_st[counter][:] = issues_embeddings[counter][descriptions_dim:]

In [None]:
# Print the index and full embedding of every issue whose stack-trace part
# sums to zero.
# NOTE(review): a zero sum presumably flags an all-zero (missing) stack part,
# but components that cancel out would also match — confirm.
for i in range(num_issues):
    if np.sum(issues_embeddings_st[i]) == 0:
        print(i)
        print(issues_embeddings[i])

In [None]:
# logistic regression on the stack-trace part only, one run per tag
for stack_only_label in ("type: bug","for: stackoverflow","status: invalid","for: external-project"):
    my_classifier(tags,df_tags,issues_embeddings_st,stack_only_label,10)

In [18]:
# without stack traces: logistic regression on issues_embeddings, one run per tag
for words_only_label in ("type: bug","for: stackoverflow","status: invalid","for: external-project"):
    my_classifier(tags,df_tags,issues_embeddings,words_only_label,10)

[[485. 648.]
 [111. 143.]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.4527757750540735
GM 0.4909158732595112
Pre 0.42806707855251547
AUC 0.5706586889279018


[[577. 610.]
 [ 84. 116.]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.4996395097332372
GM 0.5309780202242338
Pre 0.4860994102780118
AUC 0.586648981626549


[[588. 311.]
 [277. 211.]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.5760634462869503
GM 0.5317899600820661
Pre 0.6540600667408232
AUC 0.5636088552149949


[[944. 245.]
 [160.  38.]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.7080028839221341
GM 0.39035008027904644
Pre 0.7939444911690496
AUC 0.6046179881408407




In [None]:
# descriptions and stack traces: logistic regression on issues_embeddings,
# one run per tag
for combined_label in ("type: bug","for: stackoverflow","status: invalid","for: external-project"):
    my_classifier(tags,df_tags,issues_embeddings,combined_label,10)

## Neural Network Classifier

In [None]:
import time
import math
import random
from random import seed
from random import randint
from datetime import datetime
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
def split_dataset2(issues_embeddings,target_labels):
    """Make one stratified 90/10 split and separate the training rows by class.

    Args:
        issues_embeddings: ``numpy.ndarray`` feature matrix, one row per issue.
        target_labels: pandas Series of 0/1 labels aligned with the rows.

    Returns:
        Tuple ``(X_train_0, X_train_1, X_test, Y_test)``: two lists with the
        training rows labelled 0 and 1, the test rows, and the test labels.
    """
    sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.1, random_state = 0)

    X_train_0 = list()
    X_train_1 = list()

    # n_splits is 1, so this loop body runs once
    for train_index, test_index in sss.split(issues_embeddings,target_labels):

        X_test = issues_embeddings[test_index]
        # positional selection, consistent with the .iloc lookups below
        Y_test = target_labels.iloc[test_index]

        for index in train_index:
            label = target_labels.iloc[index]
            if label == 0:
                X_train_0.append(issues_embeddings[index])
            elif label == 1:
                X_train_1.append(issues_embeddings[index])

    return X_train_0,X_train_1,X_test,Y_test

In [None]:
def split_dataset(issues_embeddings,target_labels):
    """Make one stratified 90/10 train/test split.

    Args:
        issues_embeddings: ``numpy.ndarray`` feature matrix, one row per issue.
        target_labels: pandas Series of labels aligned with the rows.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)``.
    """
    sss = StratifiedShuffleSplit(n_splits = 1, test_size = 0.1, random_state = 0)

    # n_splits is 1, so this loop body runs once
    for train_index, test_index in sss.split(issues_embeddings,target_labels):

        X_train,X_test = issues_embeddings[train_index], issues_embeddings[test_index]
        # The split yields positions, so select labels positionally; plain
        # [] on a Series is label-based and only matches a default index.
        Y_train,Y_test = target_labels.iloc[train_index], target_labels.iloc[test_index]

    return X_train,Y_train,X_test,Y_test

In [None]:
def generate_batch(issues_embeddings,target_labels,batch_size):
    """Draw a random batch of issues without replacement, with one-hot labels.

    Args:
        issues_embeddings: 2-D feature matrix, one row per issue.
        target_labels: pandas Series of 0/1 labels, read by position.
        batch_size: Number of distinct issues to draw.

    Returns:
        Tuple ``(batch, labels)``: ``batch`` has shape
        ``(batch_size, n_features)``; ``labels`` has shape ``(batch_size, 2)``
        where column 0 is ``1 - label`` and column 1 is ``label``.
    """
    n_features = np.shape(issues_embeddings)[1]
    batch      = np.empty((batch_size,n_features), dtype = np.float64)
    labels     = np.empty((batch_size,2), dtype = np.float64)

    chosen = random.sample(range(np.shape(issues_embeddings)[0]),batch_size)

    for row,issue in enumerate(chosen):
        batch[row][:] = issues_embeddings[issue][:]
        positive      = target_labels.iloc[issue]
        labels[row][0] = 1-positive   # class 0 indicator
        labels[row][1] = positive     # class 1 indicator
    return batch,labels

In [None]:
def pooling(issues_embeddings_0, issues_embeddings_1, batch_size):
    """Build a class-balanced batch by interleaving the two classes.

    ``batch_size // 2`` distinct issues are drawn from each class. Even rows
    of the batch hold class-0 issues (label ``[1, 0]``) and odd rows hold
    class-1 issues (label ``[0, 1]``).

    Args:
        issues_embeddings_0: Sequence of feature rows of class 0.
        issues_embeddings_1: Sequence of feature rows of class 1.
        batch_size: Total number of rows of the batch.

    Returns:
        Tuple ``(batch, labels)`` with shapes ``(batch_size, n_features)``
        and ``(batch_size, 2)``.
    """
    n_features = np.shape(issues_embeddings_0)[1]
    batch      = np.empty((batch_size,n_features), dtype = np.float64)
    labels     = np.empty((batch_size,2), dtype = np.float64)

    per_class = batch_size//2
    picks_0   = random.sample(range(np.shape(issues_embeddings_0)[0]),per_class)
    picks_1   = random.sample(range(np.shape(issues_embeddings_1)[0]),per_class)

    for row in range(batch_size):
        # row // 2 is the running index into the picks of the row's class
        nth = row//2
        if row%2 == 0:
            batch[row][:] = issues_embeddings_0[picks_0[nth]][:]
            labels[row]   = (1,0)
        else:
            batch[row][:] = issues_embeddings_1[picks_1[nth]][:]
            labels[row]   = (0,1)

    return batch,labels

In [None]:
def my_classifier_nn(issues_embeddings,target_labels,hidden_layer_dim,learning_rate,
                     batch_size,epochs,v_batch,v_labels):
    """Train a one-hidden-layer softmax network and print validation outputs.

    The network is input -> tanh hidden layer -> 2-way softmax, trained with
    Adam on the summed softmax cross-entropy. Each training step uses a fresh
    random batch from ``generate_batch``. After training, the predicted
    probabilities and the true one-hot label of every validation row are
    printed.

    Args:
        issues_embeddings: Training feature matrix, one row per issue.
        target_labels: pandas Series of 0/1 training labels.
        hidden_layer_dim: Number of hidden units.
        learning_rate: Adam learning rate.
        batch_size: Number of issues per training step.
        epochs: Number of training steps (one batch each).
        v_batch: Validation feature matrix.
        v_labels: Validation one-hot labels, shape ``(n, 2)``.

    Returns:
        None. Results are printed.
    """
    input_dim = np.shape(issues_embeddings)[1]

    # Build the model in its own graph so that repeated calls do not keep
    # adding nodes and variables to the global default graph.
    with tf.Graph().as_default():

        # input data
        X_train = tf.placeholder(tf.float64, shape=[None,input_dim])
        # input label
        Y_train = tf.placeholder(tf.float64, shape=[None,2])

        # input-hidden layer variables
        W1 = tf.Variable(tf.truncated_normal([input_dim,hidden_layer_dim],stddev = 1.0,dtype=tf.float64),name='W1')
        b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0,dtype=tf.float64),name = 'b1')

        # hidden-output layer variables
        W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],stddev = 1.0,dtype=tf.float64),name = 'W2')
        b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')

        # forward pass
        hidden_layer = tf.nn.tanh(tf.add(tf.matmul(X_train,W1),b1))

        output_layer   = tf.add(tf.matmul(hidden_layer,W2),b2)
        output_layer_2 = tf.nn.softmax(output_layer)

        # the loss takes the raw logits; softmax is applied inside it
        cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))

        optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)

        # replaces the deprecated initialize_all_variables()
        init = tf.global_variables_initializer()

        with tf.Session() as sess:

            sess.run(init)

            for epoch in range(epochs):

                # generate batch.
                batch_x,batch_y = generate_batch(issues_embeddings,target_labels,batch_size)

                # train the model
                sess.run(optimizer,feed_dict={X_train:batch_x,Y_train:batch_y})

            # validation
            y_preds = sess.run(output_layer_2,feed_dict={X_train:v_batch,Y_train:v_labels})
            for i in range(len(y_preds)):
                print(y_preds[i],v_labels[i])

In [None]:
def my_classifier_nn2(issues_embeddings_0,issues_embeddings_1,hidden_layer_dim,
                      learning_rate,batch_size,epochs,v_batch,v_labels):
    """Train a one-hidden-layer network on class-balanced batches.

    The network is input -> tanh hidden layer -> dropout (rate 0.5) -> 2-way
    softmax, trained with Adam on the summed softmax cross-entropy. Every
    training step uses a balanced batch from ``pooling``. Validation uses the
    same weights without dropout.

    Args:
        issues_embeddings_0: Training feature rows of class 0.
        issues_embeddings_1: Training feature rows of class 1.
        hidden_layer_dim: Number of hidden units.
        learning_rate: Adam learning rate.
        batch_size: Total number of rows per balanced training batch.
        epochs: Number of training steps (one batch each).
        v_batch: Validation feature matrix.
        v_labels: Validation one-hot labels, shape ``(n, 2)``.

    Returns:
        The confusion matrix computed by ``compute_metrics`` on the
        validation set.
    """
    input_dim   = np.shape(issues_embeddings_0)[1]
    init_stddev = 1.0/ math.sqrt(hidden_layer_dim)

    # Build the model in its own graph: this function is called many times
    # in a row, and each call would otherwise add another copy of the
    # network to the global default graph.
    with tf.Graph().as_default():

        # input data
        X_train = tf.placeholder(tf.float64, shape=[None,input_dim])
        # input label
        Y_train = tf.placeholder(tf.float64, shape=[None,2])

        # input-hidden layer variables
        W1 = tf.Variable(tf.truncated_normal([input_dim,hidden_layer_dim],
                                             stddev = init_stddev,
                                             dtype=tf.float64),name='W1')
        b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = init_stddev,dtype=tf.float64),name = 'b1')

        # hidden-output layer variables
        W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],
                                             stddev = init_stddev,
                                             dtype=tf.float64),name = 'W2')
        b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')

        # forward pass
        hidden_layer   = tf.nn.tanh(tf.add(tf.matmul(X_train,W1),b1))

        # training path: with dropout
        dropout_layer  = tf.nn.dropout(hidden_layer,rate=0.5)
        output_layer   = tf.add(tf.matmul(dropout_layer,W2),b2)

        # evaluation path: same weights, dropout not applied
        output_layer_all = tf.add(tf.matmul(hidden_layer,W2),b2)
        output_layer_2   = tf.nn.softmax(output_layer_all)

        cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))

        optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)

        # replaces the deprecated initialize_all_variables()
        init = tf.global_variables_initializer()

        with tf.Session() as sess:

            sess.run(init)

            for epoch in range(epochs):

                # generate batch.
                batch_x,batch_y = pooling(issues_embeddings_0,issues_embeddings_1,batch_size)

                # train the model
                sess.run(optimizer,feed_dict={X_train:batch_x,Y_train:batch_y})

            # validation
            y_probs = sess.run(output_layer_2,feed_dict={X_train:v_batch,Y_train:v_labels})

    return compute_metrics(y_probs,v_labels)

In [None]:
def compute_metrics(y_probs,v_labels):
    """Turn class probabilities and one-hot labels into a 2x2 confusion matrix.

    Args:
        y_probs: Array of shape ``(n, 2)`` with the predicted probabilities
            of class 0 and class 1.
        v_labels: Array of shape ``(n, 2)`` with one-hot true labels;
            column 1 is the class-1 indicator.

    Returns:
        2x2 confusion matrix (rows: true class, columns: predicted class).
    """
    y_probs  = np.asarray(y_probs)
    v_labels = np.asarray(v_labels)

    # true class = class-1 indicator of the one-hot label
    y_true_1  = v_labels[:,1].astype(int)
    # predict 0 only when class 0 is strictly more probable; ties go to 1
    y_preds_1 = np.where(y_probs[:,0] > y_probs[:,1],0,1)

    # Fix the label set so the result is always 2x2, even when only one
    # class occurs; callers add these matrices up.
    total_confusion = confusion_matrix(y_true=y_true_1,y_pred=y_preds_1,labels=[0,1])
    return total_confusion

In [None]:
#######################################################################

In [None]:
# Seed the global random generator from the wall clock. random.seed() only
# accepts None, int, float, str, bytes or bytearray (a datetime raises
# TypeError on Python 3.11+), so pass the POSIX timestamp.
seed(datetime.now().timestamp())

In [None]:
# second implementation
# use both word embeddings + stack traces embeddings
# NOTE(review): "Bug" is a tag of the project-1 list; df_tags only has this
# column when that tag list is the active one — confirm before running.
target_labels = df_tags["Bug"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels)

# size of the smaller training class; a balanced batch uses this many per class
batch_size = min(np.shape(train_issues_0)[0],np.shape(train_issues_1)[0])
print(batch_size)

# the "batch" here is the whole test set, shuffled, with one-hot labels
n_test           = np.shape(test_issues)[0]
v_batch,v_labels = generate_batch(test_issues,test_labels,n_test)


In [None]:
# run the neural network on word + stack traces embeddings:
# ten independent trainings, confusion matrices summed, total time measured
start_time = time.time()
total_confusion = np.zeros((2,2))

for run in range(10):
    total_confusion += my_classifier_nn2(train_issues_0,train_issues_1,16,0.01,
                                         2*batch_size,100,v_batch,v_labels)

# rows are true classes, columns are predicted classes
(tn,fp),(fn,tp) = total_confusion
acc = (tn+tp)/np.sum(total_confusion)
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")
total_time = time.time() - start_time
print("training time in seconds %s "%(str(total_time)))
# save neural's network weights
#np.savetxt('../results/nn_W1.txt',W1,fmt='%.8f')
#np.savetxt('../results/nn_b1.txt',b1,fmt='%.8f')
#np.savetxt('../results/nn_W2.txt',W2,fmt='%.8f')
#np.savetxt('../results/nn_b2.txt',b2,fmt='%.8f')

In [None]:
# run the neural network using only word embeddings:
# keep the first descriptions_dim columns of every train/test row
descriptions_dim  = np.shape(word_embedding_matrix)[1]
train_issues_w0   = np.empty((np.shape(train_issues_0)[0],descriptions_dim), dtype = np.float64)
train_issues_w1   = np.empty((np.shape(train_issues_1)[0],descriptions_dim), dtype = np.float64)
test_issues_w     = np.empty((np.shape(test_issues)[0],descriptions_dim), dtype = np.float64)

for row,issue in enumerate(train_issues_0):
    train_issues_w0[row] = issue[0:descriptions_dim]

for row,issue in enumerate(train_issues_1):
    train_issues_w1[row] = issue[0:descriptions_dim]

for row,issue in enumerate(test_issues):
    test_issues_w[row] = issue[0:descriptions_dim]

v_batch_w,v_labels_w = generate_batch(test_issues_w,test_labels,np.shape(test_issues_w)[0])

# five independent trainings, confusion matrices summed
total_confusion = np.zeros((2,2))

for run in range(5):
    total_confusion += my_classifier_nn2(train_issues_w0,train_issues_w1,
                                         16,0.01,2*batch_size,500,v_batch_w,v_labels_w)

# rows are true classes, columns are predicted classes
(tn,fp),(fn,tp) = total_confusion
acc = (tn+tp)/np.sum(total_confusion)
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")

In [None]:
# number of training issues in class 0 and in class 1 (word-only features)
print(len(train_issues_w0))
print(len(train_issues_w1))

In [None]:
########################################################################################3

In [None]:
# second implementation
# NOTE(review): "Google Play or Beta feedback" is a tag of the project-1
# list; df_tags only has this column when that list is active — confirm.
target_labels = df_tags["Google Play or Beta feedback"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels)

# size of the smaller training class; a balanced batch uses this many per class
batch_size = min(np.shape(train_issues_0)[0],np.shape(train_issues_1)[0])
print(batch_size)

# the "batch" here is the whole test set, shuffled, with one-hot labels
n_test           = np.shape(test_issues)[0]
v_batch,v_labels = generate_batch(test_issues,test_labels,n_test)

In [None]:
# run the neural network on word + stack traces embeddings:
# ten independent trainings, confusion matrices summed
total_confusion = np.zeros((2,2))

for run in range(10):
    total_confusion += my_classifier_nn2(train_issues_0,train_issues_1,
                                         32,0.01,2*batch_size,100,v_batch,v_labels)

# rows are true classes, columns are predicted classes
(tn,fp),(fn,tp) = total_confusion
acc = (tn+tp)/np.sum(total_confusion)
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")

In [None]:
# run the neural network using only word embeddings:
# keep the first descriptions_dim columns of every train/test row
descriptions_dim  = np.shape(word_embedding_matrix)[1]
train_issues_w0   = np.ndarray(shape = (np.shape(train_issues_0)[0],descriptions_dim),
                               dtype = np.float64)
train_issues_w1   = np.ndarray(shape = (np.shape(train_issues_1)[0],descriptions_dim),
                               dtype = np.float64)
test_issues_w     = np.ndarray(shape = (np.shape(test_issues)[0],descriptions_dim),
                               dtype = np.float64)

for i in range(np.shape(train_issues_0)[0]):
    train_issues_w0[i] = train_issues_0[i][0:descriptions_dim]

for i in range(np.shape(train_issues_1)[0]):
    train_issues_w1[i] = train_issues_1[i][0:descriptions_dim]

for i in range(np.shape(test_issues)[0]):
    test_issues_w[i] = test_issues[i][0:descriptions_dim]

v_batch_w,v_labels_w = generate_batch(test_issues_w,test_labels,np.shape(test_issues_w)[0])

total_confusion = np.zeros((2,2))

for i in range(10):
    # Train and validate on the word-only slices built above; passing the
    # full train_issues_0/1 and v_batch/v_labels here would silently repeat
    # the word + stack-trace experiment.
    total_confusion += my_classifier_nn2(train_issues_w0,train_issues_w1,
                                         32,0.01,2*batch_size,800,v_batch_w,v_labels_w)


acc = (total_confusion[0][0]+total_confusion[1][1])/np.sum(total_confusion)
# geometric mean of the two per-class recalls
gm  = np.sqrt((total_confusion[0][0]/(total_confusion[0][0]+total_confusion[0][1]))*
              (total_confusion[1][1]/(total_confusion[1][1]+total_confusion[1][0])))

print("accuracy",acc)
print("GM",gm)
print("\n")

In [None]:
################################################################################

In [None]:
# second implementation
# NOTE(review): "Prio - High" is a tag of the project-1 list; df_tags only
# has this column when that list is active — confirm.
target_labels = df_tags["Prio - High"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels)

# size of the smaller training class; a balanced batch uses this many per class
batch_size = min(np.shape(train_issues_0)[0],np.shape(train_issues_1)[0])
print(batch_size)

# the "batch" here is the whole test set, shuffled, with one-hot labels
n_test           = np.shape(test_issues)[0]
v_batch,v_labels = generate_batch(test_issues,test_labels,n_test)

In [None]:
# run the neural network on word + stack traces embeddings:
# ten independent trainings, confusion matrices summed
total_confusion = np.zeros((2,2))

for run in range(10):
    total_confusion += my_classifier_nn2(train_issues_0,train_issues_1,
                                         32,0.01,2*batch_size,2000,v_batch,v_labels)

# rows are true classes, columns are predicted classes
(tn,fp),(fn,tp) = total_confusion
acc = (tn+tp)/np.sum(total_confusion)
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")

In [None]:
# single training run: 16 hidden units, 1000 steps
total_confusion = my_classifier_nn2(train_issues_0,train_issues_1,16,0.01,
                                    2*batch_size,1000,v_batch,v_labels)

# rows are true classes, columns are predicted classes
(tn,fp),(fn,tp) = total_confusion
acc = (tn+tp)/np.sum(total_confusion)
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")

In [None]:
# single training run: 32 hidden units, 3000 steps
total_confusion = my_classifier_nn2(train_issues_0,train_issues_1,32,0.01,
                                    2*batch_size,3000,v_batch,v_labels)

# rows are true classes, columns are predicted classes
(tn,fp),(fn,tp) = total_confusion
acc = (tn+tp)/np.sum(total_confusion)
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")

In [None]:
# run the neural network using only word embeddings:
# keep the first descriptions_dim columns of every train/test row
descriptions_dim  = np.shape(word_embedding_matrix)[1]
train_issues_w0   = np.empty((np.shape(train_issues_0)[0],descriptions_dim), dtype = np.float64)
train_issues_w1   = np.empty((np.shape(train_issues_1)[0],descriptions_dim), dtype = np.float64)
test_issues_w     = np.empty((np.shape(test_issues)[0],descriptions_dim), dtype = np.float64)

for row,issue in enumerate(train_issues_0):
    train_issues_w0[row] = issue[0:descriptions_dim]

for row,issue in enumerate(train_issues_1):
    train_issues_w1[row] = issue[0:descriptions_dim]

for row,issue in enumerate(test_issues):
    test_issues_w[row] = issue[0:descriptions_dim]

v_batch_w,v_labels_w = generate_batch(test_issues_w,test_labels,np.shape(test_issues_w)[0])

# ten independent trainings, confusion matrices summed
total_confusion = np.zeros((2,2))

for run in range(10):
    total_confusion += my_classifier_nn2(train_issues_w0,train_issues_w1,
                                         16,0.01,2*batch_size,1000,v_batch_w,v_labels_w)

# rows are true classes, columns are predicted classes
(tn,fp),(fn,tp) = total_confusion
acc = (tn+tp)/np.sum(total_confusion)
gm  = np.sqrt((tn/(tn+fp))*(tp/(tp+fn)))

print("accuracy",acc)
print("GM",gm)
print("\n")

In [None]:
##########################################################################################

In [None]:
# run the neural network using only word embeddings
descriptions_dim  = np.shape(word_embedding_matrix)[1]
# Slice COLUMNS of a 2-D array. `x[:][0:d]` would copy the sequence and then
# take its first d ROWS, which is not the word-embedding part.
train_issues_w0   = np.asarray(train_issues_0)[:,0:descriptions_dim].copy()
train_issues_w1   = np.asarray(train_issues_1)[:,0:descriptions_dim].copy()
test_issues_w     = np.asarray(test_issues)[:,0:descriptions_dim].copy()

v_batch_w,v_labels_w = generate_batch(test_issues_w,test_labels,np.shape(test_issues_w)[0])
# my_classifier_nn2 takes `epochs` between batch_size and the validation
# data; 1000 matches the other word-only run with 16 hidden units.
my_classifier_nn2(train_issues_w0,train_issues_w1,16,0.01,2*batch_size,1000,v_batch_w,v_labels_w)

In [None]:
# first implementation: plain stratified split, unbalanced random batches
# NOTE(review): "Bug" is a tag of the project-1 list; df_tags only has this
# column when that tag list is the active one — confirm before running.
target_labels = df_tags["Bug"]
train_issues,train_labels,test_issues,test_labels = split_dataset(issues_embeddings,target_labels)
# the validation "batch" is the whole test set, shuffled, with one-hot labels
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
# 16 hidden units, learning rate 0.01, batches of 64, 1000 training steps
my_classifier_nn(train_issues,train_labels,16,0.01,64,1000,v_batch,v_labels)

In [None]:
# class balance of the "Bug" tag: sum of the 0/1 labels vs. total issues
target_labels = df_tags["Bug"]
print(np.sum(target_labels))
print(len(target_labels))

In [None]:
#######################################################################################
#######################################################################################

In [None]:
def generate_batch_v2(issues_embeddings,target_labels,batch_size):
    """Draw a random mini-batch of issue vectors together with their labels.

    Parameters
    ----------
    issues_embeddings : 2-D array-like
        One row per issue.
    target_labels : object supporting positional ``.iloc`` indexing
        Labels aligned row-by-row with ``issues_embeddings``
        (e.g. a pandas Series).
    batch_size : int
        Number of distinct issues to sample (sampling is without replacement).

    Returns
    -------
    batch : np.ndarray, float64, shape (batch_size, n_features)
        The sampled rows of ``issues_embeddings``.
    labels : np.ndarray, float64, shape (batch_size, 1)
        The labels of the sampled rows, in the same order as ``batch``.

    Raises
    ------
    ValueError
        If ``batch_size`` exceeds the number of issues (raised by
        ``random.sample``).
    """
    # The global generator is deliberately NOT reseeded here: seeding with a
    # datetime object raises TypeError on Python 3.11+, and reseeding on every
    # call would also override a seed set by the caller for reproducibility.
    issues_to_use = random.sample(range(np.shape(issues_embeddings)[0]), batch_size)

    # Gather the sampled rows / labels in one step instead of a Python loop.
    batch  = np.asarray(issues_embeddings, dtype=np.float64)[issues_to_use]
    labels = np.asarray(target_labels.iloc[issues_to_use], dtype=np.float64).reshape(-1, 1)

    return batch,labels

In [None]:
def my_classifier_nn_v2(issues_embeddings,target_labels,hidden_layer_dim,learning_rate,
                        batch_size,epochs,v_batch,v_labels):
    """Train a one-hidden-layer binary classifier and score a validation batch.

    The network is input -> tanh hidden layer -> single sigmoid output and is
    trained with Adam on the binary cross-entropy.

    Parameters
    ----------
    issues_embeddings : 2-D array-like
        Training vectors, one row per issue.
    target_labels : labels aligned with ``issues_embeddings``
        Forwarded to ``generate_batch_v2``.
    hidden_layer_dim : int
        Number of units in the hidden layer.
    learning_rate : float
        Learning rate of the Adam optimizer.
    batch_size : int
        Number of issues sampled for every training step.
    epochs : int
        Number of training steps; each step uses ONE freshly sampled batch
        (it is not a full pass over the training set).
    v_batch : 2-D array-like
        Validation vectors with the same number of columns as
        ``issues_embeddings``.
    v_labels : array-like
        Validation labels, printed next to the predictions.

    Returns
    -------
    np.ndarray
        Sigmoid outputs for ``v_batch``, one row per validation vector.
    """
    input_dim = np.shape(issues_embeddings)[1]

    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,input_dim])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,1])

    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([input_dim,hidden_layer_dim],stddev = 1.0,dtype=tf.float64),name='W1')
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0,dtype=tf.float64),name = 'b1')

    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,1],stddev = 1.0,dtype=tf.float64),name = 'W2')
    b2 = tf.Variable(tf.random_normal([1],dtype=tf.float64),name = 'b2')

    # neural network's functions
    hidden_layer = tf.nn.tanh(tf.add(tf.matmul(X_train,W1),b1))

    logits        = tf.add(tf.matmul(hidden_layer,W2),b2)
    probabilities = tf.nn.sigmoid(logits)

    # Binary cross-entropy computed from the PRE-sigmoid logits. This is the
    # same loss as -y*log(p) - (1-y)*log(1-p) with p = sigmoid(logits), but it
    # does not produce NaN/inf when the sigmoid saturates at 0 or 1.
    cost_func = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels = Y_train,logits = logits))

    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)

    # tf.initialize_all_variables is deprecated in favour of this initializer.
    init = tf.global_variables_initializer()

    with tf.Session() as sess:

        sess.run(init)

        for _ in range(epochs):
            # generate batch.
            batch_x,batch_y = generate_batch_v2(issues_embeddings,target_labels,batch_size)
            # train the model
            sess.run(optimizer,feed_dict={X_train:batch_x,Y_train:batch_y})

        # The predictions depend on the inputs only, so no labels are fed.
        y_preds = sess.run(probabilities,feed_dict = {X_train:v_batch})

    # Show every prediction next to its expected label.
    for prediction,expected in zip(y_preds,v_labels):
        print(prediction,expected)

    # Hand the predictions back so callers can compute their own metrics.
    return y_preds

In [None]:
# Train the v2 network on the "Bug" tag and score the held-out split.
target_labels = df_tags["Bug"]
(train_issues,
 train_labels,
 test_issues,
 test_labels) = split_dataset(issues_embeddings, target_labels)

# Validation inputs as a 2-D array with one row per issue, and the labels as
# a single column.
v_batch  = np.reshape(test_issues, (-1, np.shape(issues_embeddings)[1]))
v_labels = np.reshape(test_labels.to_numpy(), (-1, 1))

# Positional arguments after the data: 8, 0.1, 64, 500, then the validation pair.
preds = my_classifier_nn_v2(train_issues, train_labels, 8, 0.1, 64, 500, v_batch, v_labels)