# Tag Classification

This notebook contains only implementations based on neural networks.

In [27]:
import os
import re
import json
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

## Pre Processing

In [28]:
def load_dict(path_to_file):
    temp_dict = dict()
    with open(path_to_file) as file:
        lines = file.readlines()
        for line in lines:
            temp   = str(line)
            values = temp.split(',')
            temp_dict[values[0]] = int(values[1].replace("\n",""))
    
    return temp_dict 

In [29]:
def load_issues(dir_path,tag_labels,descriptions,stack_traces):
    
    for fname in os.listdir(dir_path):
        with open(os.path.join(dir_path,fname)) as json_file:
            
            data = json.load(json_file)
            for issue in data:
                
                tags = issue['tags']
                for i in range(len(tags)):
                    tags[i] = tags[i].strip()
                description = issue['description']
                stack_trace = issue['stack_trace']
                name        = issue['name']
                
                if tags != [] and stack_trace !=[] : #(description != [] or stack_trace != []):
                    tag_labels.append(tags)
                    descriptions.append(description)
                    stack_traces.append(stack_trace)

In [30]:
# copy paste from stack_trace_embedding notebook

def clean_stack_trace(stack_trace):
    
    clean_stack_trace = []
    temp_stack        = stack_trace.split(" at ")[1:]
    
    to_find = re.compile("[|,|<|>]|/|\|=")
    
    #find where each function ends and keep only the path
    for f in temp_stack:
        temp      = f.find(')')
        temp_file = f[0:temp]
        
        # check the punctuations in order to avoid anything else
        match_obj = to_find.search(temp_file)
        if match_obj == None:
            filename = find_filename(temp_file)
            if filename != '':
                clean_stack_trace.append(filename)
                
    return clean_stack_trace

In [31]:
# copy paste from stack_trace_embedding notebook

def find_filename(value):
    filename = ""
    words    = value.split("(")
    if len(words)>=2:
        parts = words[0].split(".")
        filename = ".".join(parts[0:-1])
    return filename


In [32]:
# copy paste from word embeddings notebook

def clean_description(description):
    
    # define stop words
    all_stopwords = set(stopwords.words('english'))
    
    #define translator to translate punctuation to white space
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    
    #join all lines into one sentence
    sentence     = ' '.join(description)
    
    #translate punctuation
    new_sentence = sentence.translate(translator)
    
    #split the sentense in words
    words = new_sentence.split()
    
    words_sw = [w.lower() for w in words if not w.lower() in all_stopwords and len(w)>1]
    
    return words_sw

In [33]:
# copy paste from word embeddings notebook

def stemming_data(descriptions):
    
    stemmer = PorterStemmer()
    
    for desc in descriptions:
        for counter in range(len(desc)):
            if desc[counter].isalpha():
                desc[counter] = stemmer.stem(desc[counter])
            

In [34]:
def clean_data(descriptions,stack_traces,use_stemming):
    
    clean_descriptions = list()
    clean_stack_traces = list()
    
    for i in range(len(descriptions)):
        
        temp_desc   = descriptions[i]
        temp_trace  = stack_traces[i]
        stack_trace = []
        clean_desc  = []
        
        if temp_trace != []:
            if len(temp_trace)>1:
                stack_trace = clean_stack_trace(' '.join(temp_trace))
            else:
                stack_trace = clean_stack_trace(temp_trace[0])
            
        if temp_desc  != []:
            clean_desc = clean_description(temp_desc)
            
        clean_descriptions.append(clean_desc)
        clean_stack_traces.append(stack_trace)
            
    if use_stemming == True:
        stemming_data(clean_descriptions)
        
    return clean_descriptions,clean_stack_traces

## Compute Arithmetic Representations for Issues

In [35]:
def compute_embeddings(arithmetic_descriptions,arithmetic_stack_traces,
                       word_embedding_matrix,stack_embedding_matrix,use_words,use_stacks):
    
    total_embeddings_dim = 0
    descriptions_dim     = 0
    stack_traces_dim     = 0
    
    if use_words == True:
        descriptions_dim     = np.shape(word_embedding_matrix)[1]
        total_embeddings_dim = total_embeddings_dim + descriptions_dim
        
    if use_stacks == True:
        stack_traces_dim     = np.shape(stack_embedding_matrix)[1]
        total_embeddings_dim = total_embeddings_dim + stack_traces_dim
    
    # make sure that in any case there are something to compute
    if total_embeddings_dim ==0:
        return None
    
    num_issues        = len(arithmetic_descriptions)
    issues_embeddings = np.zeros((num_issues,total_embeddings_dim))
    
    for counter in range(len(arithmetic_descriptions)):
        
        temp_desc   = arithmetic_descriptions[counter]
        temp_stack  = arithmetic_stack_traces[counter]
        total_words = 0
        total_funcs = 0
        
        if use_words == True:
            for word in temp_desc:
                if word != -2:
                    total_words += 1
                    issues_embeddings[counter][0:descriptions_dim] = issues_embeddings[counter][0:descriptions_dim] + word_embedding_matrix[word]
            if total_words != 0 :
                issues_embeddings[counter]    /= total_words
        
        
        if use_stacks == True:
            for func in temp_stack:
                if func != -2:
                    issues_embeddings[counter][descriptions_dim:] = issues_embeddings[counter][descriptions_dim:] + stack_embedding_matrix[func]
                    total_funcs += 1
            if total_funcs != 0:
                issues_embeddings[counter][descriptions_dim:] = issues_embeddings[counter][descriptions_dim:] / total_funcs 
            
    return issues_embeddings  

In [36]:
use_stemming = True

In [37]:
# load word embeddings
word_embedding_matrix = np.loadtxt('../results/word_embeddings_g.txt', dtype=np.float64)

# load stack traces embeddings 
stack_embedding_matrix = np.loadtxt('../results/stack_embeddings_g.txt', dtype=np.float64)

# load vocabularies
word2id_path = "../outputs/words_vocabulary_g.txt"
func2id_path = "../outputs/stacktraces_vocabulary_g.txt"

word2id = load_dict(word2id_path)
func2id = load_dict(func2id_path)

#load tags and descriptions
dir_path     = '../data'
tag_labels   = list()
descriptions = list()
stack_traces = list()

# load issues
load_issues(dir_path,tag_labels,descriptions,stack_traces)

# transform data to arithmetic representation
clean_descriptions,clean_stack_traces = clean_data(descriptions,stack_traces,use_stemming)

clean_descriptions_2 = list()
clean_stack_traces_2 = list()
clean_tags_2         = list()

# remove empty stack traces
for counter,value in enumerate(clean_stack_traces):
    if value != []:
        clean_stack_traces_2.append(value)
        clean_descriptions_2.append(clean_descriptions[counter])
        clean_tags_2.append(tag_labels[counter])

del clean_descriptions
del clean_stack_traces

del descriptions
del stack_traces

#arithmetic_transformations
arithmetic_descriptions = [[word2id.get(word,-2) for word in desc]   for desc in clean_descriptions_2]
arithmetic_stack_traces = [[func2id.get(func,-2) for func in trace] for trace in clean_stack_traces_2]

del clean_descriptions_2
del clean_stack_traces_2

issues_embeddings  = compute_embeddings(arithmetic_descriptions,arithmetic_stack_traces,
                                        word_embedding_matrix,stack_embedding_matrix,True,True)

In [38]:
tag_labels = list()
# copy by reference in order to avoid to change every where the variable name
tag_labels = clean_tags_2

In [39]:
tags = ['Bug','Google Play or Beta feedback','Prio - High']
no_tags = 3
np_tags = np.zeros((len(arithmetic_descriptions),no_tags))

for counter in range(len(tag_labels)):
    for counter_2,value in enumerate(tags):
        if value in tag_labels[counter]:
            np_tags[counter][counter_2] = 1
            
df_tags = pd.DataFrame(np_tags, columns = tags)

## Neural Network Classifiers

In [40]:
import time
import math
import random
from random import seed
from random import randint
from sklearn import metrics
from datetime import datetime
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()
from sklearn.model_selection import StratifiedShuffleSplit

In [107]:
def split_dataset2(issues_embeddings,target_labels,t_size =0.1):
    
    sss = StratifiedShuffleSplit(n_splits = 1, test_size = t_size, random_state = 0)
    
    X_train_0 = list()
    X_train_1 = list()
    
    for train_index, test_index in sss.split(issues_embeddings,target_labels):
        #X_train,X_test = issues_embeddings[train_index], issues_embeddings[test_index]
        #Y_train,Y_test = target_labels[train_index], target_labels[test_index]
        
        
        X_test = issues_embeddings[test_index]
        Y_test = target_labels[test_index]
        
        for index in train_index:
            if target_labels.iloc[index] == 0:
                X_train_0.append(issues_embeddings[index])
            elif target_labels.iloc[index] == 1:
                X_train_1.append(issues_embeddings[index])
                
    return X_train_0,X_train_1,X_test,Y_test

In [42]:
def generate_batch(issues_embeddings,target_labels,batch_size):
    
    batch  = np.ndarray(shape = (batch_size,np.shape(issues_embeddings)[1]), dtype = np.float64)
    labels = np.ndarray(shape = (batch_size,2), dtype = np.float64)
    
    issues_to_use = random.sample([i for i in range(np.shape(issues_embeddings)[0])],batch_size)
    
    for counter,value in enumerate(issues_to_use):
        batch[counter][:]  = issues_embeddings[value][:]
        # label_0
        labels[counter][0] = 1-target_labels.iloc[value]
        # label_1
        labels[counter][1] =   target_labels.iloc[value]
    return batch,labels

In [43]:
def pooling(issues_embeddings_0, issues_embeddings_1, batch_size):
    
    batch  = np.ndarray(shape = (batch_size,np.shape(issues_embeddings_0)[1]), dtype = np.float64)
    labels = np.ndarray(shape = (batch_size,2), dtype = np.float64)
    
    issues_to_use_0 = random.sample([i for i in range(np.shape(issues_embeddings_0)[0])],batch_size//2)
    issues_to_use_1 = random.sample([i for i in range(np.shape(issues_embeddings_1)[0])],batch_size//2)
    
    # even indexes for issues belong to class 0
    # odd  indexes for issues belong to class 1
    counter_0 = 0
    counter_1 = 0
    
    for counter in range(batch_size):
        
        # even indexes
        if counter%2 == 0 :
            batch[counter][:]  = issues_embeddings_0[issues_to_use_0[counter_0]][:]
            labels[counter][0] = 1
            labels[counter][1] = 0
            counter_0 += 1
        else:
            batch[counter][:]  = issues_embeddings_1[issues_to_use_1[counter_1]][:]
            labels[counter][0] = 0
            labels[counter][1] = 1
            counter_1 += 1
            
    return batch,labels

In [44]:
def compute_predictions(y_probs,v_labels):
    
    y_probs_1 = np.ndarray(shape = (np.shape(v_labels)[0],1), dtype = np.float64)
    y_preds_1 = np.ndarray(shape = (np.shape(v_labels)[0],1), dtype = np.float64)
    y_true_1  = np.ndarray(shape = (np.shape(v_labels)[0],1), dtype = np.float64) 
    
    for i in range(np.shape(v_labels)[0]):
        y_true_1[i]  = v_labels[i][1]
        y_preds_1[i] = 0 if y_probs[i][0]>y_probs[i][1] else 1
        y_probs_1[i] = y_probs[i][1]
    
    matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds_1)
    
    return y_probs_1, y_preds_1, y_true_1, matrix_confusion

In [45]:
def compute_auc(y_true,y_probs):
    
    fpr,tpr,thresholds = metrics.roc_curve(y_true,y_probs)
    auc                = metrics.auc(fpr,tpr)
    
    return auc

In [46]:
def compute_metrics(total_confusion,aucs):
    
    acc = (total_confusion[0][0]+total_confusion[1][1])/np.sum(total_confusion)
    
    gm  = np.sqrt((total_confusion[0][0]/(total_confusion[0][0]+total_confusion[0][1]))*
              (total_confusion[1][1]/(total_confusion[1][1]+total_confusion[1][0])))
    
    pre = total_confusion[1][1]/(total_confusion[1][1]+total_confusion[1][0])
    
    mean_auc = np.sum(aucs)/np.shape(aucs)[0]

    print("accuracy" , acc)
    print("precision", pre)
    print("GM"       , gm)
    print("mean auc" , mean_auc)
    print("\n")

#### First and Simpliest Classifier

In [47]:
def my_classifier_nn2(issues_embeddings_0,issues_embeddings_1,hidden_layer_dim,
                      learning_rate,batch_size,epochs,v_batch,v_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings_0)[1]])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,2])
    
    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings_0)[1],hidden_layer_dim],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name='W1')
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0/ math.sqrt(hidden_layer_dim),dtype=tf.float64),name = 'b1')
    
    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name = 'W2')
    b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')
    
    # neural network's functions
    hidden_layer   = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer   = tf.nn.tanh(hidden_layer)
     
    output_layer   = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer_2 = tf.nn.softmax(output_layer)
    
    cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(epochs):
            
            # generate batch.
            batch_x,batch_y = pooling(issues_embeddings_0,issues_embeddings_1,batch_size)
            
            # train the model
            _,loss = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
        
        # saving the weights in numpy format
        #W1_np = W1.eval()
        #b1_np = b1.eval()
        #W2_np = W2.eval()
        #b2_np = b2.eval()
        
        
        # validation
        y_probs     = sess.run(output_layer_2,feed_dict={X_train:v_batch,Y_train:v_labels})
    
    return compute_predictions(y_probs,v_labels)

In [48]:
##########################################################################

In [49]:
# first implementation the simpliest neural network
# use both word embeddings and stack traces embeddings.

target_labels = df_tags["Bug"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

43


In [51]:
total_iterations = 10
total_confusion  = np.zeros((2,2))
conf_matrix      = np.zeros((2,2))
aucs             = np.zeros(total_iterations)

for i in range(total_iterations):
    y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,16,0.01,
                                                            2*batch_size,100,v_batch,v_labels)
    total_confusion = total_confusion + conf_matrix
    aucs[i]         = compute_auc(y_true_1,y_probs_1)

compute_metrics(total_confusion,aucs)

accuracy 0.7195652173913043
precision 0.7073170731707317
GM 0.7615773105863908
mean auc 0.8131707317073171




In [52]:
##########################################################################

In [53]:
target_labels = df_tags["Google Play or Beta feedback"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,
                                                                      target_labels)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

151


In [54]:
total_iterations = 10
total_confusion  = np.zeros((2,2))
conf_matrix      = np.zeros((2,2))
aucs             = np.zeros(total_iterations)

for i in range(total_iterations):
    y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,32,0.01,
                                                            2*batch_size,100,v_batch,v_labels)
    total_confusion = total_confusion + conf_matrix
    aucs[i]         = compute_auc(y_true_1,y_probs_1)

compute_metrics(total_confusion,aucs)

accuracy 0.8695652173913043
precision 0.8235294117647058
GM 0.8592652174945423
mean auc 0.8993914807302232




In [None]:
##########################################################################

In [56]:
target_labels = df_tags["Prio - High"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,
                                                                      target_labels)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

74


In [57]:
total_iterations = 10
total_confusion  = np.zeros((2,2))
conf_matrix      = np.zeros((2,2))
aucs             = np.zeros(total_iterations)

for i in range(total_iterations):
    y_probs_1, _, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,4,0.01,
                                                            2*batch_size,50,v_batch,v_labels)
    total_confusion = total_confusion + conf_matrix
    aucs[i]         = compute_auc(y_true_1,y_probs_1)
    
compute_metrics(total_confusion,aucs)

accuracy 0.5478260869565217
precision 0.475
GM 0.51720402163943
mean auc 0.537828947368421




#### Voting

Because the first implementation has big variances in scores between sequential trainings in the same training and testing datasets we will implement a ensemble technique in order to make the results more stable.

In [78]:
def compute_predictions_voting(total_ypreds,total_nn):
    
    threshold = total_nn//2
    ypreds    = np.ndarray(shape = (np.shape(total_ypreds)[0],1),dtype = np.float64)
    
    for i in range(np.shape(total_ypreds)[0]):
        ypreds[i] = 0 if total_ypreds[i]<=threshold else 1
    
    return ypreds

In [79]:
######################################################################################

In [118]:
# voting technique based on the first and simpliest neural network.
# Use odd number of nn so majority wins every time.

target_labels = df_tags["Bug"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

43


In [95]:
total_nn       = 9
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(v_labels)[0],1), dtype = np.float64)

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,16,0.01,
                                                                    2*batch_size,100,v_batch,v_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1 
    aucs[i]           = compute_auc(y_true_1,y_probs_1)

y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

accuracy 0.7608695652173914
precision 0.7560975609756098
GM 0.7777390621413379
mean auc 0.810840108401084




In [None]:
######################################################################################

In [97]:
target_labels = df_tags["Google Play or Beta feedback"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,
                                                                      target_labels)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

151


In [98]:
total_nn       = 9
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(v_labels)[0],1), dtype = np.float64)

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,32,0.01,
                                                                    2*batch_size,100,v_batch,v_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1
    aucs[i]           = compute_auc(y_true_1,y_probs_1)
    
y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

accuracy 0.8913043478260869
precision 0.8823529411764706
GM 0.8894239994007015
mean auc 0.9003831417624522




In [99]:
######################################################################################

In [100]:
target_labels = df_tags["Prio - High"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,
                                                                      target_labels)

batch_size = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)
v_batch,v_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])

74


In [106]:
total_nn       = 9
aucs           = np.zeros(total_nn)
total_ypreds_1 = np.zeros(shape = (np.shape(v_labels)[0],1), dtype = np.float64)

for i in range(total_nn):
    y_probs_1, y_preds_1, y_true_1, conf_matrix = my_classifier_nn2(train_issues_0,train_issues_1,4,0.01,
                                                                    2*batch_size,100,v_batch,v_labels)
    total_ypreds_1    = total_ypreds_1 + y_preds_1
    aucs[i]           = compute_auc(y_true_1,y_probs_1)
    
y_preds1         = compute_predictions_voting(total_ypreds_1,total_nn)
matrix_confusion = metrics.confusion_matrix(y_true=y_true_1,y_pred=y_preds1)
compute_metrics(matrix_confusion,aucs)

accuracy 0.5
precision 0.625
GM 0.5441071875825088
mean auc 0.5303362573099416




#### Patience remaining

Here the neural network architecture is implemented based on patience remaining technique in order to avoid tuning the hyper parameter epochs and as a consequence avoid overfitting. 

In [164]:
seed(datetime.now())

In [165]:
def create_validation(train_issues_0,train_issues_1,rate=0.1):
    
    new_train_issues_0 = list()
    new_train_issues_1 = list()
    validation_issues  = list()
    validation_labels  = list()
    
    validation_size_0 = int(len(train_issues_0)*rate)
    validation_size_1 = int(len(train_issues_1)*rate)
    
    validation_idxs_0 = random.sample([i for i in range(len(train_issues_0))], validation_size_0)
    validation_idxs_1 = random.sample([i for i in range(len(train_issues_1))], validation_size_1)
    
    for i in range(len(train_issues_0)):
        if i in validation_idxs_0:
            validation_issues.append(train_issues_0[i])
            validation_labels.append(0.0)
        else:
            new_train_issues_0.append(train_issues_0[i])
    
    for i in range(len(train_issues_1)):
        if i in validation_idxs_1:
            validation_issues.append(train_issues_1[i])
            validation_labels.append(1.0)
        else:
            new_train_issues_1.append(train_issues_1[i])
    
    # create a pandas series for validation labels in order to be compatible with the rest code
    val_labels_series = pd.Series(validation_labels, index = [i for i in range(len(validation_labels))])
    
    # create a np array for validation issues in order to be compatible with the rest code
    val_issues = np.array(validation_issues)
    return new_train_issues_0,new_train_issues_1,val_issues,val_labels_series

In [None]:
def my_classifier_nn3(issues_embeddings_0,issues_embeddings_1,hidden_layer_dim,
                      learning_rate,batch_size,v_batch,v_labels,t_batch,t_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings_0)[1]])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,2])
    
    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings_0)[1],hidden_layer_dim],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name='W1')
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                      dtype=tf.float64),name = 'b1')
    
    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name = 'W2')
    b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')
    
    # neural network's functions
    hidden_layer   = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer   = tf.nn.tanh(hidden_layer)
     
    output_layer   = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer_2 = tf.nn.softmax(output_layer)
    
    cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    # patience mehtod's variables
    min_loss = float('inf')
    
    min_W1             = np.zeros((np.shape(issues_embeddings_0)[1],hidden_layer_dim))
    min_b1             = np.zeros(hidden_layrer_dim)
    min_W2             = np.zeros((hidden_layer_dim,2))
    min_b2             = np.zeros(2)
    patience_remaining = 100
    step               = batch_size/(np.shape(issues_embeddings_0)[0] + np.shape(issues_embeddings_1)[0])
    
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(50000):
            
            # generate batch.
            batch_x,batch_y = pooling(issues_embeddings_0,issues_embeddings_1,batch_size)
            
            # train the model
            _,train_loss = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
            valid_loss   = sess.run(cost_func,feed_dict={X_train:v_batch,Y_train:v_labels})
            
            patience_remaining -= step
            if valid_loss < min_loss:
                min_loss           = valid_loss
                patience_remaining = 100
                min_W1             = W1.eval()
                min_b1             = b1.eval()
                min_W2             = W2.eval()
                min_b2             = b2.eval()
            if patience_remaining<=0:
                break
        
        # restore minimum weights
        W1 = tf.convert_to_tensor(min_W1)
        b1 = tf.convert_to_tensor(min_b1)
        W2 = tf.convert_to_tensor(min_W2)
        b2 = tf.convert_to_tensor(min_b2)
                
        # testing
        y_probs     = sess.run(output_layer_2,feed_dict={X_train:t_batch,Y_train:t_labels})
    
    return compute_predictions(y_probs,v_labels)

In [171]:
# use patience remaining technique

target_labels = df_tags["Bug"]
train_issues_0,train_issues_1,test_issues,test_labels = split_dataset2(issues_embeddings,target_labels,t_size=0.1)

# create validation set
train_issues_0,train_issues_1,valid_issues,valid_labels = create_validation(train_issues_0,train_issues_1)


batch_size  = np.shape(train_issues_0)[0] if np.shape(train_issues_0)[0]<np.shape(train_issues_1)[0] else np.shape(train_issues_1)[0]  
print(batch_size)

t_batch,t_labels = generate_batch(test_issues,test_labels,np.shape(test_issues)[0])
v_batch,v_labels = generate_batch(valid_issues,valid_labels,np.shape(valid_issues)[0])

39


#### Drop Out layer

In [None]:
def my_classifier_nn4(issues_embeddings_0,issues_embeddings_1,hidden_layer_dim,
                      learning_rate,batch_size,epochs,v_batch,v_labels):
    
    # input data
    X_train = tf.placeholder(tf.float64, shape=[None,np.shape(issues_embeddings_0)[1]])
    # input label
    Y_train = tf.placeholder(tf.float64, shape=[None,2])
    
    # input-hidden layer variables
    W1 = tf.Variable(tf.truncated_normal([np.shape(issues_embeddings_0)[1],hidden_layer_dim],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name='W1')
    b1 = tf.Variable(tf.random_normal([hidden_layer_dim],stddev = 1.0/ math.sqrt(hidden_layer_dim),dtype=tf.float64),name = 'b1')
    
    # hidden-output layer variables
    W2 = tf.Variable(tf.truncated_normal([hidden_layer_dim,2],
                                         stddev = 1.0/ math.sqrt(hidden_layer_dim),
                                         dtype=tf.float64),name = 'W2')
    b2 = tf.Variable(tf.random_normal([2],dtype=tf.float64),name = 'b2')
    
    # neural network's functions
    hidden_layer   = tf.add(tf.matmul(X_train,W1),b1)
    hidden_layer   = tf.nn.tanh(hidden_layer)
    
    dropout_layer  = tf.nn.dropout(hidden_layer,rate = 0.5)
    
    output_layer   = tf.add(tf.matmul(dropout_layer,W2),b2)
    
    # for validation and testing dont use dropout
    output_layer_all = tf.add(tf.matmul(hidden_layer,W2),b2)
    output_layer_2   = tf.nn.softmax(output_layer_all)
    
    cost_func = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels = Y_train,logits = output_layer))
    
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost_func)
    
    init = tf.initialize_all_variables()
    
    with tf.Session() as sess:
        
        sess.run(init)
        
        for epoch in range(epochs):
            
            # generate batch.
            batch_x,batch_y = pooling(issues_embeddings_0,issues_embeddings_1,batch_size)
            
            # train the model
            _,loss = sess.run([optimizer,cost_func],feed_dict={X_train:batch_x,Y_train:batch_y})
        
        # to save the weights in numpy format
        #W1_np = W1.eval()
        #b1_np = b1.eval()
        #W2_np = W2.eval()
        #b2_np = b2.eval()
        
        
        # validation
        y_probs     = sess.run(output_layer_2,feed_dict={X_train:v_batch,Y_train:v_labels})
    
    return compute_predictions(y_probs,v_labels)

#### DropOut Layer + Voting

#### DropOut Layer + Voting + Patience Remaining???

### Multiclassification