# Tag Classification

This notebook contains only implementations based on neural networks.

In [1]:
import os
import re
import json
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

## Pre Processing

In [2]:
def load_dict(path_to_file):
    temp_dict = dict()
    with open(path_to_file) as file:
        lines = file.readlines()
        for line in lines:
            temp   = str(line)
            values = temp.split(',')
            temp_dict[values[0]] = int(values[1].replace("\n",""))
    
    return temp_dict 

In [3]:
def load_issues(dir_path,tag_labels,descriptions,stack_traces):
    
    for fname in os.listdir(dir_path):
        with open(os.path.join(dir_path,fname)) as json_file:
            
            data = json.load(json_file)
            for issue in data:
                
                tags = issue['tags']
                for i in range(len(tags)):
                    tags[i] = tags[i].strip()
                description = issue['description']
                stack_trace = issue['stack_trace']
                name        = issue['name']
                
                if tags != [] and stack_trace !=[] and description != []: #(description != [] or stack_trace != []):
                    tag_labels.append(tags)
                    descriptions.append(description)
                    stack_traces.append(stack_trace)

In [4]:
# copy paste from stack_trace_embedding notebook

def clean_stack_trace(stack_trace):
    
    clean_stack_trace = []
    temp_1            = stack_trace.replace(r'\tat','  at').replace('\"at ',' at ')
    temp_stack        = temp_1.split(" at ")[1:]
    
    if(temp_stack == []):
        temp_stack_2 = temp_1.split(' ')
        for t in temp_stack_2:
            if t.count('.')>2 and t.find('(') != -1 and t.find(')') != -1:
                if t.find('.java:') > t.find('(') and t.find('.java:') < t.find(')'):
                    if len(t.split())>1:
                        temp_stack.append(t.split()[1])
                    else:
                        temp_stack.append(t)
    
    to_find = re.compile("[|,|<|>]|\|=")
        
    #find where each function ends and keep only the path
    for f in temp_stack:
        temp      = f.find(')')
        temp_file = f[0:temp]
        
        # check the punctuations in order to avoid anything else
        match_obj = to_find.search(temp_file)
        if match_obj == None:
            filename = find_filename(temp_file)
            if filename != '':
                clean_stack_trace.append(filename)
                
    return clean_stack_trace

In [5]:
# copy paste from stack_trace_embedding notebook

def find_filename(value):
    filename = ""
    words    = value.split("(")
    if len(words)>=2:
        parts = words[0].split(".")
        filename = ".".join(parts[0:-1])
    return filename


In [6]:
# copy paste from word embeddings notebook

def clean_description(description):
    
    # define stop words
    all_stopwords = set(stopwords.words('english'))
    
    #define translator to translate punctuation to white space
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    
    #join all lines into one sentence
    sentence     = ' '.join(description)
    
    #translate punctuation
    new_sentence = sentence.translate(translator)
    
    #split the sentense in words
    words = new_sentence.split()
    
    words_sw = [w.lower() for w in words if not w.lower() in all_stopwords and len(w)>1]
    
    return words_sw

In [7]:
# copy paste from word embeddings notebook

def stemming_data(descriptions):
    
    stemmer = PorterStemmer()
    
    for desc in descriptions:
        for counter in range(len(desc)):
            if desc[counter].isalpha():
                desc[counter] = stemmer.stem(desc[counter])
            

In [8]:
def clean_data(descriptions,stack_traces,use_stemming):
    
    clean_descriptions = list()
    clean_stack_traces = list()
    
    for i in range(len(descriptions)):
        
        temp_desc   = descriptions[i]
        temp_trace  = stack_traces[i]
        stack_trace = []
        clean_desc  = []
        
        if temp_trace != []:
            if len(temp_trace)>1:
                stack_trace = clean_stack_trace(' '.join(temp_trace))
            else:
                stack_trace = clean_stack_trace(temp_trace[0])
            
        if temp_desc  != []:
            clean_desc = clean_description(temp_desc)
            
        clean_descriptions.append(clean_desc)
        clean_stack_traces.append(stack_trace)
            
    if use_stemming == True:
        stemming_data(clean_descriptions)
        
    return clean_descriptions,clean_stack_traces

## Compute Arithmetic Representations for Issues

In [9]:
def compute_embeddings(arithmetic_descriptions,arithmetic_stack_traces,
                       word_embedding_matrix,stack_embedding_matrix,use_words,use_stacks):
    
    total_embeddings_dim = 0
    descriptions_dim     = 0
    stack_traces_dim     = 0
    
    if use_words == True:
        descriptions_dim     = np.shape(word_embedding_matrix)[1]
        total_embeddings_dim = total_embeddings_dim + descriptions_dim
        
    if use_stacks == True:
        stack_traces_dim     = np.shape(stack_embedding_matrix)[1]
        total_embeddings_dim = total_embeddings_dim + stack_traces_dim
    
    # make sure that in any case there are something to compute
    if total_embeddings_dim ==0:
        return None
    
    num_issues        = len(arithmetic_descriptions)
    issues_embeddings = np.zeros((num_issues,total_embeddings_dim))
    
    for counter in range(len(arithmetic_descriptions)):
        
        temp_desc   = arithmetic_descriptions[counter]
        temp_stack  = arithmetic_stack_traces[counter]
        total_words = 0
        total_funcs = 0
        
        if use_words == True:
            for word in temp_desc:
                if word != -2:
                    total_words += 1
                    issues_embeddings[counter][0:descriptions_dim] = issues_embeddings[counter][0:descriptions_dim] + word_embedding_matrix[word]
            if total_words != 0 :
                issues_embeddings[counter]    /= total_words
        
        
        if use_stacks == True:
            for func in temp_stack:
                if func != -2:
                    issues_embeddings[counter][descriptions_dim:] = issues_embeddings[counter][descriptions_dim:] + stack_embedding_matrix[func]
                    total_funcs += 1
            if total_funcs != 0:
                issues_embeddings[counter][descriptions_dim:] = issues_embeddings[counter][descriptions_dim:] / total_funcs 
            
    return issues_embeddings  

In [10]:
use_stemming = True

In [11]:
# load word embeddings
word_embedding_matrix = np.loadtxt('../results_project_3/word_embeddings_g1.txt', dtype=np.float64)

# load stack traces embeddings 
stack_embedding_matrix = np.loadtxt('../results_project_3/stack_embeddings_g.txt', dtype=np.float64)

# load vocabularies
word2id_path = "../outputs_project_3/words_vocabulary_g1.txt"
func2id_path = "../outputs_project_3/stacktraces_vocabulary_g.txt"

word2id = load_dict(word2id_path)
func2id = load_dict(func2id_path)

#load tags and descriptions
dir_path     = '../spring'
tag_labels   = list()
descriptions = list()
stack_traces = list()

# load issues
load_issues(dir_path,tag_labels,descriptions,stack_traces)

# transform data to arithmetic representation
clean_descriptions,clean_stack_traces = clean_data(descriptions,stack_traces,use_stemming)

clean_descriptions_2 = list()
clean_stack_traces_2 = list()
clean_tags_2         = list()

# remove empty stack traces or dublicate issues
for counter in range(len(clean_stack_traces)):
    
    if clean_stack_traces[counter] != []:
        
        flag   = False
        flag_2 = False 
        
        # remove empty stack traces 
        for i in clean_stack_traces[counter]:
            func = func2id.get(i,-2)
            if func != -2:
                flag_2 = True
                break
        if flag_2 == False:
            continue
        
        # check for dublicates
        for counter_2 in range(len(clean_stack_traces_2)):
            if clean_descriptions[counter] == clean_descriptions_2[counter_2] and \
               clean_stack_traces[counter] == clean_stack_traces_2[counter_2]:
                    flag = True
                    break
        
        if flag == False:
            clean_stack_traces_2.append(clean_stack_traces[counter])
            clean_descriptions_2.append(clean_descriptions[counter])
            clean_tags_2.append(tag_labels[counter])
                    
del clean_descriptions
del clean_stack_traces

del descriptions
del stack_traces

#arithmetic_transformations
arithmetic_descriptions = [[word2id.get(word,-2) for word in desc]   for desc in clean_descriptions_2]
arithmetic_stack_traces = [[func2id.get(func,-2) for func in trace] for trace in clean_stack_traces_2]

del clean_descriptions_2
del clean_stack_traces_2

issues_embeddings  = compute_embeddings(arithmetic_descriptions,arithmetic_stack_traces,
                                        word_embedding_matrix,stack_embedding_matrix,False,True)

In [12]:
tag_labels = list()
# copy by reference in order to avoid to change every where the variable name
tag_labels = clean_tags_2

In [13]:
#tags = ['Bug','Google Play or Beta feedback','Prio - High']
#tags = ['>test-failure','Team:Distributed','>bug',':Distributed/Snapshot/Restore']
tags = ['type: bug','for: stackoverflow','status: invalid','for: external-project']
no_tags = 4
np_tags = np.zeros((len(arithmetic_descriptions),no_tags))

for counter in range(len(tag_labels)):
    for counter_2,value in enumerate(tags):
        if value in tag_labels[counter]:
            np_tags[counter][counter_2] = 1
            
df_tags = pd.DataFrame(np_tags, columns = tags)

In [14]:
print(np.shape(issues_embeddings))

(1387, 8)


## Neural Network Classifiers

In [15]:
import time
import math
import random
from random import seed
from random import randint
from sklearn import metrics
from datetime import datetime
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()
from tensorflow.keras import initializers
from sklearn.model_selection import StratifiedShuffleSplit

In [16]:
def split_dataset2(issues_embeddings,target_labels,t_size =0.1):
    
    sss = StratifiedShuffleSplit(n_splits = 1, test_size = t_size, random_state = 0)
    
    X_train_0 = list()
    X_train_1 = list()
    
    for train_index, test_index in sss.split(issues_embeddings,target_labels):
        #X_train,X_test = issues_embeddings[train_index], issues_embeddings[test_index]
        #Y_train,Y_test = target_labels[train_index], target_labels[test_index]
        
        
        X_test = issues_embeddings[test_index]
        Y_test = target_labels[test_index]
        
        for index in train_index:
            if target_labels.iloc[index] == 0:
                X_train_0.append(issues_embeddings[index])
            elif target_labels.iloc[index] == 1:
                X_train_1.append(issues_embeddings[index])
                
    return X_train_0,X_train_1,X_test,Y_test

In [17]:
def generate_batch(issues_embeddings,target_labels,batch_size):
    
    batch  = np.ndarray(shape = (batch_size,np.shape(issues_embeddings)[1]), dtype = np.float64)
    labels = np.ndarray(shape = (batch_size,2), dtype = np.float64)
    
    issues_to_use = random.sample([i for i in range(np.shape(issues_embeddings)[0])],batch_size)
    
    for counter,value in enumerate(issues_to_use):
        batch[counter][:]  = issues_embeddings[value][:]
        # label_0
        labels[counter][0] = 1-target_labels.iloc[value]
        # label_1
        labels[counter][1] =   target_labels.iloc[value]
    return batch,labels

In [18]:
def pooling(issues_embeddings_0, issues_embeddings_1, batch_size):
    
    batch  = np.ndarray(shape = (batch_size,np.shape(issues_embeddings_0)[1]), dtype = np.float64)
    labels = np.ndarray(shape = (batch_size,2), dtype = np.float64)
    
    issues_to_use_0 = random.sample([i for i in range(np.shape(issues_embeddings_0)[0])],batch_size//2)
    issues_to_use_1 = random.sample([i for i in range(np.shape(issues_embeddings_1)[0])],batch_size//2)
    
    # even indexes for issues belong to class 0
    # odd  indexes for issues belong to class 1
    counter_0 = 0
    counter_1 = 0
    
    for counter in range(batch_size):
        
        # even indexes
        if counter%2 == 0 :
            batch[counter][:]  = issues_embeddings_0[issues_to_use_0[counter_0]][:]
            labels[counter][0] = 1
            labels[counter][1] = 0
            counter_0 += 1
        else:
            batch[counter][:]  = issues_embeddings_1[issues_to_use_1[counter_1]][:]
            labels[counter][0] = 0
            labels[counter][1] = 1
            counter_1 += 1
            
    return batch,labels

In [19]:
def compute_predictions(y_probs,v_labels):
    
    y_probs_1 = np.ndarray(shape = (np.shape(v_labels)[0],1), dtype = np.float64)
    y_preds_1 = np.ndarray(shape = (np.shape(v_labels)[0],1), dtype = np.float64)
    y_true_1  = np.ndarray(shape = (np.shape(v_labels)[0],1), dtype = np.float64) 
    
    for i in range(np.shape(v_labels)[0]):
        y_true_1[i]  = v_labels[i][1]
        y_preds_1[i] = 0 if y_probs[i][0]>y_probs[i][1] else 1
        y_probs_1[i] = y_probs[i][1]
    
    matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds_1)
    
    return y_probs_1, y_preds_1, y_true_1, matrix_confusion

In [20]:
def compute_auc(y_true,y_probs):
    
    fpr,tpr,thresholds = metrics.roc_curve(y_true,y_probs)
    auc                = metrics.auc(fpr,tpr)
    
    return auc

In [21]:
def compute_metrics(total_confusion,aucs):
    
    acc = (total_confusion[0][0]+total_confusion[1][1])/np.sum(total_confusion)
    
    gm  = np.sqrt((total_confusion[0][0]/(total_confusion[0][0]+total_confusion[0][1]))*
              (total_confusion[1][1]/(total_confusion[1][1]+total_confusion[1][0])))
    
    pre = total_confusion[1][1]/(total_confusion[1][1]+total_confusion[1][0])
    
    mean_auc = np.sum(aucs)/np.shape(aucs)[0]

    print("accuracy" , acc)
    print("precision", pre)
    print("GM"       , gm)
    print("mean auc" , mean_auc)
    print(total_confusion)
    print("\n")

#### First and Simpliest Classifier

In [22]:
def my_classifier_nn2(issues_embeddings_0,issues_embeddings_1,hidden_layer_dim,
                      learning_rate,batch_size,epochs,v_batch,v_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings_0)[1]])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,2])
    
    # input-hidden layer variables
    
    #initializer = initializers.GlorotNormal()
    #W1  = tf.Variable(initializer(shape=(np.shape(issues_embeddings_0)[1],hidden_layer_dim),dtype=tf.float64),name='W1')
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings_0)[1],hidden_layer_dim],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name='W1')
    
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name = 'b1')
    
    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim)
                                         ,dtype=tf.float64),name = 'W2')
    
    b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')
    
    # neural network's functions
    hidden_layer   = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer   = tf.nn.tanh(hidden_layer)
     
    output_layer   = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer_2 = tf.nn.softmax(output_layer)
    
    cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(epochs):
            
            # generate batch.
            batch_x,batch_y = pooling(issues_embeddings_0,issues_embeddings_1,batch_size)
            
            # train the model
            _,loss = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
        
        # saving the weights in numpy format
        #W1_np = W1.eval()
        #b1_np = b1.eval()
        #W2_np = W2.eval()
        #b2_np = b2.eval()
        
        
        # validation
        y_probs     = sess.run(output_layer_2,feed_dict={X_train:v_batch,Y_train:v_labels})
    
    return compute_predictions(y_probs,v_labels)

In [23]:
##########################################################################

In [None]:
# first implementation the simpliest neural network
# use both word embeddings and stack traces embeddings.

target_labels = df_tags[">test-failure"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size =0.2)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

In [None]:
total_iterations = 10
total_confusion  = np.zeros((2,2))
conf_matrix      = np.zeros((2,2))
aucs             = np.zeros(total_iterations)

for i in range(total_iterations):
    y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,16,0.01,
                                                            2*batch_size,100,v_batch,v_labels)
    total_confusion = total_confusion + conf_matrix
    aucs[i]         = compute_auc(y_true_1,y_probs_1)

compute_metrics(total_confusion,aucs)

In [None]:
##########################################################################

In [None]:
target_labels = df_tags["Google Play or Beta feedback"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,
                                                                      target_labels)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

In [None]:
total_iterations = 10
total_confusion  = np.zeros((2,2))
conf_matrix      = np.zeros((2,2))
aucs             = np.zeros(total_iterations)

for i in range(total_iterations):
    y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,32,0.01,
                                                            2*batch_size,100,v_batch,v_labels)
    total_confusion = total_confusion + conf_matrix
    aucs[i]         = compute_auc(y_true_1,y_probs_1)

compute_metrics(total_confusion,aucs)

In [None]:
##########################################################################

In [None]:
target_labels = df_tags["Prio - High"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,
                                                                      target_labels)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

In [None]:
total_iterations = 10
total_confusion  = np.zeros((2,2))
conf_matrix      = np.zeros((2,2))
aucs             = np.zeros(total_iterations)

for i in range(total_iterations):
    y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,4,0.01,
                                                            2*batch_size,50,v_batch,v_labels)
    total_confusion = total_confusion + conf_matrix
    aucs[i]         = compute_auc(y_true_1,y_probs_1)
    
compute_metrics(total_confusion,aucs)

#### Voting

Because the first implementation has big variances in scores between sequential trainings in the same training and testing datasets we will implement a ensemble technique in order to make the results more stable.

In [31]:
def compute_predictions_voting(total_ypreds,total_nn):
    
    threshold = total_nn//2
    ypreds    = np.ndarray(shape = (np.shape(total_ypreds)[0],1),dtype = np.float64)
    
    for i in range(np.shape(total_ypreds)[0]):
        ypreds[i] = 0 if total_ypreds[i]<=threshold else 1
    
    return ypreds

In [32]:
######################################################################################

In [33]:
# voting technique based on the first and simpliest neural network.
# Use odd number of nn so majority wins every time.

target_labels = df_tags["type: bug"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size =0.1)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

229


In [34]:
total_nn       = 3
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(v_labels)[0],1), dtype = np.float64)

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,64,0.01,
                                                                    2*batch_size,500,v_batch,v_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1 
    aucs[i]           = compute_auc(y_true_1,y_probs_1)

y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

accuracy 0.5539568345323741
precision 0.56
GM 0.5563035899673184
mean auc 0.6049122807017545
[[63 51]
 [11 14]]




In [None]:
######################################################################################

In [35]:
target_labels = df_tags["for: stackoverflow"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size =0.1)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

180


In [38]:
total_nn       = 3
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(v_labels)[0],1), dtype = np.float64)

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,4,0.01,
                                                                    2*batch_size,800,v_batch,v_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1
    aucs[i]           = compute_auc(y_true_1,y_probs_1)
    
y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

accuracy 0.5107913669064749
precision 0.6
GM 0.5454163470199977
mean auc 0.5004201680672269
[[59 60]
 [ 8 12]]




In [None]:
######################################################################################

In [39]:
target_labels = df_tags["status: invalid"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size =0.15)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

414


In [43]:
total_nn       = 3
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(v_labels)[0],1), dtype = np.float64)
start_time = time.time()

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,4,0.01,
                                                                    2*batch_size,300,v_batch,v_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1
    aucs[i]           = compute_auc(y_true_1,y_probs_1)

#measure total time
total_time = time.time() - start_time
print("training time in seconds %s "%(str(total_time)))

y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

training time in seconds 5.120529890060425 
accuracy 0.5215311004784688
precision 0.5945945945945946
GM 0.5350572738373774
mean auc 0.5601434768101435
[[65 70]
 [30 44]]




In [44]:
target_labels = df_tags["for: external-project"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size =0.1)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

178


In [45]:
total_nn       = 3
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(v_labels)[0],1), dtype = np.float64)
start_time = time.time()

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,4,0.01,
                                                                    2*batch_size,150,v_batch,v_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1
    aucs[i]           = compute_auc(y_true_1,y_probs_1)

#measure total time
total_time = time.time() - start_time
print("training time in seconds %s "%(str(total_time)))

y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

training time in seconds 4.030569553375244 
accuracy 0.6474820143884892
precision 0.5
GM 0.5797710356524485
mean auc 0.5974789915966386
[[80 39]
 [10 10]]




#### Patience remaining

Here the neural network architecture is implemented based on patience remaining technique in order to avoid tuning the hyper parameter epochs and as a consequence avoid overfitting. 

In [53]:
seed(datetime.now())

In [54]:
def create_validation(train_issues_0,train_issues_1,rate=0.2):
    
    new_train_issues_0 = list()
    new_train_issues_1 = list()
    validation_issues  = list()
    validation_labels  = list()
    
    min_size = len(train_issues_0) if len(train_issues_0)<len(train_issues_1) else len(train_issues_1)
    print(min_size)
    validation_size_0 = int(min_size*rate)
    validation_size_1 = int(min_size*rate)
    
    validation_idxs_0 = random.sample([i for i in range(len(train_issues_0))], validation_size_0)
    validation_idxs_1 = random.sample([i for i in range(len(train_issues_1))], validation_size_1)
    
    for i in range(len(train_issues_0)):
        if i in validation_idxs_0:
            validation_issues.append(train_issues_0[i])
            validation_labels.append(0.0)
        else:
            new_train_issues_0.append(train_issues_0[i])
    
    for i in range(len(train_issues_1)):
        if i in validation_idxs_1:
            validation_issues.append(train_issues_1[i])
            validation_labels.append(1.0)
        else:
            new_train_issues_1.append(train_issues_1[i])
    
    # create a pandas series for validation labels in order to be compatible with the rest code
    val_labels_series = pd.Series(validation_labels, index = [i for i in range(len(validation_labels))])
    
    # create a np array for validation issues in order to be compatible with the rest code
    val_issues = np.array(validation_issues)
    
    return new_train_issues_0,new_train_issues_1,val_issues,val_labels_series

In [55]:
def my_classifier_nn3(issues_embeddings_0,issues_embeddings_1,hidden_layer_dim,
                      learning_rate,batch_size,v_batch,v_labels,t_batch,t_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings_0)[1]])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,2])
    
    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings_0)[1],hidden_layer_dim],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name='W1')
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                      dtype=tf.float64),name = 'b1')
    
    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name = 'W2')
    b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')
    
    # neural network's functions
    hidden_layer   = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer   = tf.nn.tanh(hidden_layer)
     
    output_layer   = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer_2 = tf.nn.softmax(output_layer)
    
    cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    # patience mehtod's variables
    min_loss = float('inf')
    
    min_W1             = np.zeros((np.shape(issues_embeddings_0)[1],hidden_layer_dim))
    min_b1             = np.zeros(hidden_layer_dim)
    min_W2             = np.zeros((hidden_layer_dim,2))
    min_b2             = np.zeros(2)
    patience_remaining = 200
    step               = batch_size/(np.shape(issues_embeddings_0)[0] + np.shape(issues_embeddings_1)[0])
    
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(50000):
            
            # generate batch.
            batch_x,batch_y = pooling(issues_embeddings_0,issues_embeddings_1,batch_size)
            
            # train the model
            _,train_loss = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
            # maybe valid loss should not be cross entropy but better the predictions.
            valid_loss   = sess.run(cost_func,feed_dict={X_train:v_batch,Y_train:v_labels})
            
            patience_remaining -= step
            if valid_loss < min_loss:
                min_loss           = valid_loss
                patience_remaining = 200
                min_W1             = W1.eval()
                min_b1             = b1.eval()
                min_W2             = W2.eval()
                min_b2             = b2.eval()
            if patience_remaining<=0:
                print("total epochs",epoch+1)
                break
        
        # restore minimum weights
        W1 = tf.convert_to_tensor(min_W1)
        b1 = tf.convert_to_tensor(min_b1)
        W2 = tf.convert_to_tensor(min_W2)
        b2 = tf.convert_to_tensor(min_b2)
                
        # testing
        y_probs     = sess.run(output_layer_2,feed_dict={X_train:t_batch,Y_train:t_labels})
        
    return compute_predictions(y_probs,t_labels)

In [56]:
#########################################################################################################

In [57]:
# use patience remaining technique

target_labels = df_tags["type: bug"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

# create validation set
train_issues_0,train_issues_1,valid_issues,valid_labels = create_validation(train_issues_0,train_issues_1)


batch_size  = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)

t_batch,t_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
v_batch,v_labels = generate_batch(valid_issues,valid_labels,np.shape(valid_issues)[0])

229
184


In [58]:
aucs = list()
y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn3(train_issues_0,train_issues_1,4,0.01,
                                                                2*batch_size,v_batch,v_labels,t_batch,t_labels)

aucs.append(compute_auc(y_true_1,y_probs_1)) 
compute_metrics(conf_matrix,aucs)

total epochs 1031
accuracy 0.539568345323741
precision 0.56
GM 0.5474021582045667
mean auc 0.5849122807017544
[[61 53]
 [11 14]]




In [68]:
total_nn       = 3
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(t_labels)[0],1), dtype = np.float64)

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn3(train_issues_0,train_issues_1,4,0.01,
                                                                    2*batch_size,v_batch,v_labels,t_batch,t_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1 
    aucs[i]           = compute_auc(y_true_1,y_probs_1)

y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

total epochs 362
total epochs 420
total epochs 405
accuracy 0.539568345323741
precision 0.6122448979591837
GM 0.5532833351724882
mean auc 0.5689342403628118
[[45 45]
 [19 30]]




In [60]:
########################################################################################

In [69]:
# use patience remaining technique

target_labels = df_tags["for: stackoverflow"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

# create validation set
train_issues_0,train_issues_1,valid_issues,valid_labels = create_validation(train_issues_0,train_issues_1,rate=0.2)


batch_size  = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)

t_batch,t_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
v_batch,v_labels = generate_batch(valid_issues,valid_labels,np.shape(valid_issues)[0])

180
144


In [70]:
aucs = list()
y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn3(train_issues_0,train_issues_1,4,0.01,
                                                                2*batch_size,v_batch,v_labels,t_batch,t_labels)

aucs.append(compute_auc(y_true_1,y_probs_1)) 
compute_metrics(conf_matrix,aucs)

total epochs 3632
accuracy 0.539568345323741
precision 0.55
GM 0.543873440542679
mean auc 0.5231092436974789
[[64 55]
 [ 9 11]]




In [63]:
total_nn       = 5
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(t_labels)[0],1), dtype = np.float64)

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn3(train_issues_0,train_issues_1,4,0.01,
                                                                    2*batch_size,v_batch,v_labels,t_batch,t_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1
    aucs[i]           = compute_auc(y_true_1,y_probs_1)
    
y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

total epochs 1218
total epochs 924
total epochs 940
total epochs 914
total epochs 948
accuracy 0.5683453237410072
precision 0.7
GM 0.6183469424008422
mean auc 0.5341176470588235
[[65 54]
 [ 6 14]]




In [64]:
###################################################################################################

In [71]:
# use patience remaining technique

target_labels = df_tags["status: invalid"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

# create validation set
train_issues_0,train_issues_1,valid_issues,valid_labels = create_validation(train_issues_0,train_issues_1,rate=0.25)


batch_size  = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)

t_batch,t_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
v_batch,v_labels = generate_batch(valid_issues,valid_labels,np.shape(valid_issues)[0])

439
330


In [72]:
aucs = list()
y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn3(train_issues_0,train_issues_1,4,0.01,
                                                                2*batch_size,v_batch,v_labels,t_batch,t_labels)

aucs.append(compute_auc(y_true_1,y_probs_1)) 
compute_metrics(conf_matrix,aucs)

total epochs 338
accuracy 0.5827338129496403
precision 0.6530612244897959
GM 0.5962847939999438
mean auc 0.5954648526077098
[[49 41]
 [17 32]]




In [73]:
total_nn       = 3
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(t_labels)[0],1), dtype = np.float64)
start_time = time.time()

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn3(train_issues_0,train_issues_1,4,0.01,
                                                                    2*batch_size,v_batch,v_labels,t_batch,t_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1
    aucs[i]           = compute_auc(y_true_1,y_probs_1)

#measure total time
total_time = time.time() - start_time
print("training time in seconds %s "%(str(total_time)))

y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

total epochs 373
total epochs 340
total epochs 366
training time in seconds 13.134699583053589 
accuracy 0.539568345323741
precision 0.5714285714285714
GM 0.5462716342742852
mean auc 0.5882086167800454
[[47 43]
 [21 28]]




In [None]:
###################################################################################################

In [74]:
# use patience remaining technique

target_labels = df_tags["for: external-project"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

# create validation set
train_issues_0,train_issues_1,valid_issues,valid_labels = create_validation(train_issues_0,train_issues_1,rate=0.1)


batch_size  = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)

t_batch,t_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
v_batch,v_labels = generate_batch(valid_issues,valid_labels,np.shape(valid_issues)[0])

178
161


In [75]:
aucs = list()
y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn3(train_issues_0,train_issues_1,4,0.01,
                                                                2*batch_size,v_batch,v_labels,t_batch,t_labels)

aucs.append(compute_auc(y_true_1,y_probs_1)) 
compute_metrics(conf_matrix,aucs)

total epochs 1735
accuracy 0.6258992805755396
precision 0.65
GM 0.6357685747756561
mean auc 0.661764705882353
[[74 45]
 [ 7 13]]




In [76]:
total_nn       = 5
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(t_labels)[0],1), dtype = np.float64)
start_time = time.time()

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn3(train_issues_0,train_issues_1,8,0.1,
                                                                    2*batch_size,v_batch,v_labels,t_batch,t_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1
    aucs[i]           = compute_auc(y_true_1,y_probs_1)

#measure total time
total_time = time.time() - start_time
print("training time in seconds %s "%(str(total_time)))

y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

total epochs 924
total epochs 1307
total epochs 1028
total epochs 872
total epochs 1005
training time in seconds 27.559181213378906 
accuracy 0.6834532374100719
precision 0.45
GM 0.5702719386692751
mean auc 0.6352941176470589
[[86 33]
 [11  9]]




#### Drop Out layer

In [77]:
def my_classifier_nn4(issues_embeddings_0,issues_embeddings_1,hidden_layer_dim,
                      learning_rate,batch_size,epochs,v_batch,v_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings_0)[1]])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,2])
    
    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings_0)[1],hidden_layer_dim],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name='W1')
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0/ math.sqrt(hidden_layer_dim),dtype=tf.float64),name = 'b1')
    
    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name = 'W2')
    b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')
    
    # neural network's functions
    hidden_layer   = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer   = tf.nn.tanh(hidden_layer)
    
    dropout_layer  = tf.nn.dropout(hidden_layer,rate = 0.5)
    
    output_layer   = tf.add(tf.matmul(dropout_layer,W2),b2)
    
    # for validation and testing dont use dropout
    output_layer_all = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer_2   = tf.nn.softmax(output_layer_all)
    
    cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(epochs):
            
            # generate batch.
            batch_x,batch_y = pooling(issues_embeddings_0,issues_embeddings_1,batch_size)
            
            # train the model
            _,loss = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
        
        # validation
        y_probs     = sess.run(output_layer_2,feed_dict={X_train:v_batch,Y_train:v_labels})
    
    return compute_predictions(y_probs,v_labels)

In [78]:
# first implementation the simpliest neural network
# use both word embeddings and stack traces embeddings.

target_labels = df_tags["type: bug"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

229


In [80]:
aucs = list()

# for drop out neral network => hidden_layer_dim = hidden_layer_dim/rate, epochs are more
y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn4(train_issues_0,train_issues_1,4,0.1,
                                                        2*batch_size,300,v_batch,v_labels)

aucs.append(compute_auc(y_true_1,y_probs_1))
compute_metrics(conf_matrix,aucs)

accuracy 0.60431654676259
precision 0.44
GM 0.5308054125241304
mean auc 0.5957894736842105
[[73 41]
 [14 11]]




In [91]:
# first implementation the simpliest neural network
# use both word embeddings and stack traces embeddings.

target_labels = df_tags["for: stackoverflow"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

180


In [93]:
aucs = list()

# for drop out neral network => hidden_layer_dim = hidden_layer_dim/rate, epochs are more
y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn4(train_issues_0,train_issues_1,16,0.1,
                                                        2*batch_size,200,v_batch,v_labels)

aucs.append(compute_auc(y_true_1,y_probs_1))
compute_metrics(conf_matrix,aucs)

accuracy 0.460431654676259
precision 0.45
GM 0.45605174407879523
mean auc 0.4676470588235294
[[55 64]
 [11  9]]




In [94]:
# first implementation the simpliest neural network
# use both word embeddings and stack traces embeddings.

target_labels = df_tags["status: invalid"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

439


In [95]:
aucs = list()

# for drop out neral network => hidden_layer_dim = hidden_layer_dim/rate, epochs are more
y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn4(train_issues_0,train_issues_1,4,0.1,
                                                        2*batch_size,200,v_batch,v_labels)

aucs.append(compute_auc(y_true_1,y_probs_1))
compute_metrics(conf_matrix,aucs)

accuracy 0.5611510791366906
precision 0.5714285714285714
GM 0.563436169819011
mean auc 0.5829931972789115
[[50 40]
 [21 28]]




In [96]:
# first implementation the simpliest neural network
# use both word embeddings and stack traces embeddings.

target_labels = df_tags["for: external-project"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

178


In [97]:
aucs = list()

# for drop out neral network => hidden_layer_dim = hidden_layer_dim/rate, epochs are more
y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn4(train_issues_0,train_issues_1,16,0.01,
                                                        2*batch_size,200,v_batch,v_labels)

aucs.append(compute_auc(y_true_1,y_probs_1))
compute_metrics(conf_matrix,aucs)

accuracy 0.6618705035971223
precision 0.45
GM 0.5602370446681363
mean auc 0.5873949579831933
[[83 36]
 [11  9]]




#### DropOut Layer + Patience Remaining

In [98]:
def my_classifier_nn5(issues_embeddings_0,issues_embeddings_1,hidden_layer_dim,
                      learning_rate,batch_size,v_batch,v_labels,t_batch,t_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings_0)[1]])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,2])
    
    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings_0)[1],hidden_layer_dim],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name='W1')
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0/ math.sqrt(hidden_layer_dim),dtype=tf.float64),name = 'b1')
    
    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name = 'W2')
    b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')
    
    # neural network's functions
    hidden_layer   = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer   = tf.nn.tanh(hidden_layer)
    
    dropout_layer  = tf.nn.dropout(hidden_layer,rate = 0.5)
    
    output_layer   = tf.add(tf.matmul(dropout_layer,W2),b2)
    
    # for validation and testing dont use dropout
    output_layer_all = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer_2   = tf.nn.softmax(output_layer_all)
    
    cost_func  = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    valid_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer_all))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    # patience mehtod's variables
    min_loss = float('inf')
    
    min_W1             = np.zeros((np.shape(issues_embeddings_0)[1],hidden_layer_dim))
    min_b1             = np.zeros(hidden_layer_dim)
    min_W2             = np.zeros((hidden_layer_dim,2))
    min_b2             = np.zeros(2)
    
    patience_remaining = 100
    step               = batch_size/(np.shape(issues_embeddings_0)[0] + np.shape(issues_embeddings_1)[0])
    
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(50000):
            
            # generate batch.
            batch_x,batch_y = pooling(issues_embeddings_0,issues_embeddings_1,batch_size)
            
            # train the model
            _,loss     = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
            valid_loss = sess.run(valid_func,feed_dict={X_train:v_batch,Y_train:v_labels}) 
            
            patience_remaining -= step
            if valid_loss < min_loss:
                min_loss           = valid_loss
                patience_remaining = 100
                min_W1             = W1.eval()
                min_b1             = b1.eval()
                min_W2             = W2.eval()
                min_b2             = b2.eval()
            if patience_remaining<=0:
                print("total epochs",epoch+1)
                break
        
        # restore minimum weights
        W1 = tf.convert_to_tensor(min_W1)
        b1 = tf.convert_to_tensor(min_b1)
        W2 = tf.convert_to_tensor(min_W2)
        b2 = tf.convert_to_tensor(min_b2)
        
        # testing
        y_probs     = sess.run(output_layer_2,feed_dict={X_train:t_batch,Y_train:t_labels})
    
    return compute_predictions(y_probs,t_labels)

In [99]:
seed(datetime.now())

In [100]:
# use patience remaining technique

target_labels = df_tags["type: bug"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

# create validation set
train_issues_0,train_issues_1,valid_issues,valid_labels = create_validation(train_issues_0,train_issues_1,rate=0.2)


batch_size  = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)

t_batch,t_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
v_batch,v_labels = generate_batch(valid_issues,valid_labels,np.shape(valid_issues)[0])

229
184


In [102]:
aucs = list()
y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn5(train_issues_0,train_issues_1,4,0.01,
                                                                2*batch_size,v_batch,v_labels,t_batch,t_labels)

aucs.append(compute_auc(y_true_1,y_probs_1)) 
compute_metrics(conf_matrix,aucs)

total epochs 688
accuracy 0.6115107913669064
precision 0.56
GM 0.5905691575290412
mean auc 0.6256140350877193
[[71 43]
 [11 14]]




In [None]:
##############################################################################################################

In [103]:
# use patience remaining technique

target_labels = df_tags["for: stackoverflow"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

# create validation set
train_issues_0,train_issues_1,valid_issues,valid_labels = create_validation(train_issues_0,train_issues_1,rate=0.2)


batch_size  = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)

t_batch,t_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
v_batch,v_labels = generate_batch(valid_issues,valid_labels,np.shape(valid_issues)[0])

180
144


In [104]:
aucs = list()
y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn5(train_issues_0,train_issues_1,32,0.01,
                                                                2*batch_size,v_batch,v_labels,t_batch,t_labels)

aucs.append(compute_auc(y_true_1,y_probs_1)) 
compute_metrics(conf_matrix,aucs)

total epochs 444
accuracy 0.460431654676259
precision 0.5
GM 0.4763305116224668
mean auc 0.47100840336134453
[[54 65]
 [10 10]]




In [None]:
##############################################################################################################

In [105]:
target_labels = df_tags["status: invalid"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.10)

# create validation set
train_issues_0,train_issues_1,valid_issues,valid_labels = create_validation(train_issues_0,train_issues_1,rate=0.1)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)

t_batch,t_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
v_batch,v_labels = generate_batch(valid_issues,valid_labels,np.shape(valid_issues)[0])

439
396


In [106]:
aucs = list()
y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn5(train_issues_0,train_issues_1,4,0.01,
                                                                2*batch_size,v_batch,v_labels,t_batch,t_labels)

aucs.append(compute_auc(y_true_1,y_probs_1)) 
compute_metrics(conf_matrix,aucs)

total epochs 311
accuracy 0.5611510791366906
precision 0.46938775510204084
GM 0.535581994247714
mean auc 0.5877551020408164
[[55 35]
 [26 23]]




In [107]:
target_labels = df_tags["for: external-project"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.10)

# create validation set
train_issues_0,train_issues_1,valid_issues,valid_labels = create_validation(train_issues_0,train_issues_1,rate=0.1)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)

t_batch,t_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
v_batch,v_labels = generate_batch(valid_issues,valid_labels,np.shape(valid_issues)[0])

178
161


In [108]:
aucs = list()
y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn5(train_issues_0,train_issues_1,4,0.01,
                                                                2*batch_size,v_batch,v_labels,t_batch,t_labels)

aucs.append(compute_auc(y_true_1,y_probs_1)) 
compute_metrics(conf_matrix,aucs)

total epochs 694
accuracy 0.6115107913669064
precision 0.45
GM 0.5360923036037668
mean auc 0.5626050420168067
[[76 43]
 [11  9]]




### Multiclassification

Create a neural network for the moulticlassification problem.<br>
The model will try to predict 3 classes Bug, Google play or Beta feedback, Prio-High.<br>
To begin with every issue belongs only in one category. 

In [None]:
def multi_classifier_nn(issues_embeddings,issues_labels,hidden_layer_dim,
                        learning_rate,batch_size,epochs,v_batch,v_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings)[1]])
    
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,3])
    
    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings)[1],hidden_layer_dim],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name='W1')
    
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                      dtype=tf.float64),name = 'b1')
    
    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,3],
                                         stddev = 1.0/ math.sqrt(3),
                                         dtype=tf.float64),name = 'W2')
    
    b2 = tf.Variable(tf.random_normal([3],dtype=tf.float64),name = 'b2')
    
    # neural network's functions
    hidden_layer   = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer   = tf.nn.tanh(hidden_layer)
     
    output_layer   = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer_2 = tf.nn.softmax(output_layer)
    
    cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(epochs):
            
            # generate batch.
            batch_x,batch_y = generate_batch_multi(issues_embeddings,issues_labels,batch_size)
            
            # train the model
            _,loss = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
        
        
        # validation
        y_probs     = sess.run(output_layer_2,feed_dict={X_train:valid_embeddings,Y_train:valid_labels})
        
        compute_metrics_multi(y_probs,valid_labels)

In [None]:
def compute_metrics_multi(y_probs,valid_labels):
    
    y_preds = np.zeros(shape = (np.shape(valid_labels)[0],np.shape(valid_labels)[1]),dtype = np.float64)
    
    for i in range(np.shape(valid_labels)[0]):
        max_prob    = y_probs[i][0]
        max_pointer = 0
        
        for j in range(1,np.shape(valid_labels)[1]):
            if y_probs[i][j] > max_prob:
                max_prob    = y_probs[i][j]
                max_pointer = j
        
        y_preds[i][max_pointer] = 1
    
    print(y_preds.argmax(axis=1)) 
    print(valid_labels.argmax(axis=1))
    
    for counter,value in enumerate(y_preds):
        print(value,y_probs[counter],valid_labels[counter])
    
    conf_matrix = metrics.confusion_matrix(y_true=valid_labels.argmax(axis=1),y_pred=y_preds.argmax(axis=1))
    
    accuracy    = np.sum(np.diagonal(conf_matrix))/np.sum(conf_matrix)

    # precisions
    rows_sums  = np.sum(conf_matrix,axis=1).tolist()
    precisions = list()
    gm         = 1
    
    for i in range(np.shape(conf_matrix)[0]):
        precisions.append(conf_matrix[i][i]/rows_sums[i])
        gm    = gm * np.sqrt(precisions[i])
    
    print("confusion matrix \n", conf_matrix)
    print("accuracy:"          , accuracy)
    print("gm:"                , gm)
    
    for counter,value in enumerate(precisions):
        print("precision ", counter, ":", value)

In [None]:
def generate_batch_multi(issues_embeddings,issues_labels,batch_size):
    
    batch_embeddings = np.ndarray(shape = (batch_size,np.shape(issues_embeddings)[1]),dtype = np.float64)
    batch_labels     = np.ndarray(shape = (batch_size,3),dtype = np.float64)
    
    issues_to_use = random.sample([i for i in range(np.shape(issues_embeddings)[0])],batch_size)
    
    for counter,value in enumerate(issues_to_use):
        batch_embeddings[counter][:]  = issues_embeddings[value][:]
        batch_labels[counter][:]      = issues_labels[value][:]
        
    return batch_embeddings,batch_labels

In [None]:
def create_dataset(issues_embeddings,tags_order,df_tags,min_train_size,min_valid_size):
    
    # train issues
    train_embeddings = np.ndarray(shape = (len(tags_order)*min_train_size,np.shape(issues_embeddings)[1]),
                                  dtype = np.float64)
    train_labels     = np.zeros(shape = (len(tags_order)*min_train_size,3),dtype = np.float64)
    
    # valid issues
    valid_embeddings = np.ndarray(shape = (len(tags_order)*min_valid_size,np.shape(issues_embeddings)[1]),
                                  dtype = np.float64)
    valid_labels     = np.zeros(shape = (len(tags_order)*min_valid_size,3),dtype = np.float64)
    
    idxs             = list()
    
    for tag in tags_order:
        temp = df_tags[tag]
        idxs.append([i for i in range(len(temp)) if temp.loc[i] == 1])
        
    # make sure every index(=>issue) belongs only in one category.
    for counter,value in enumerate(idxs):
        if counter == len(idxs)-1:
            break
        for idx in value:
            for counter_2 in range(counter+1,len(idxs)):
                if idx in idxs[counter_2]:
                    idxs[counter_2].remove(idx)
    
    # choose random indexes.
    for counter,value in enumerate(idxs):
        
        random_idx_train = random.sample(value,min_train_size)
        random_idx_valid = random.sample([i for i in value if i not in random_idx_train],min_valid_size)
        
        # training split
        temp_2 = 0
        for temp in range(counter*min_train_size,(counter+1)*min_train_size):
            
            train_embeddings[temp]          = issues_embeddings[random_idx_train[temp_2]]
            train_labels[temp][counter]     = 1.0
            temp_2                         += 1
        
        # validation split
        temp_2 = 0
        for temp in range(counter*min_valid_size,(counter+1)*min_valid_size):
            valid_embeddings[temp]      = issues_embeddings[random_idx_valid[temp_2]]
            valid_labels[temp][counter] = 1.0
            temp_2                     += 1
            
    return train_embeddings,train_labels,valid_embeddings,valid_labels

In [None]:
# tags must be in asceding order.
tags_order     = ['Prio - High','Google Play or Beta feedback','Bug']
min_train_size = 50
min_valid_size = 10

train_embeddings,train_labels,valid_embeddings,valid_labels = create_dataset(issues_embeddings,tags_order,df_tags,
                                                                            min_train_size,min_valid_size)


In [None]:
multi_classifier_nn(train_embeddings,train_labels,32,0.01,50,500,valid_embeddings,valid_labels)