In [4]:
# Loading all required libraries
import pandas as pd
import nltk
import gensim
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
import pickle
import gensim
from gensim.utils import simple_preprocess
from util import make_w2v_embeddings
from util import split_and_zero_padding
from util import ManDist
from time import time

import keras
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, LSTM, GRU, Conv1D, Conv2D, GlobalMaxPool1D, Dense, Dropout
from sklearn.model_selection import train_test_split


# This will prompt for authorization.
# drive.mount('/content/drive')

In [5]:
#Tokeniztion and stop word removal function
def tokenization(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS:
            result.append(token)
    return result

preprocessed = lambda x: transform(x)

In [6]:
# Loading all required files, Note: Only Eclipse in this baseline scenario

Eclipse_dups = pd.read_csv("EP_dup.csv",sep=";", engine='python')
Eclipse_nondups = pd.read_csv("EP_nondup.csv",sep=";", engine='python')
Eclipse_combined = pd.concat([Eclipse_dups, Eclipse_nondups], ignore_index=True, sort=False)

# Mozilla_dups = pd.read_csv("/content/drive/My Drive/Duplicate Bug Report/Mozilla/M_Duplicate BRs.csv",sep=";", engine='python')
# Mozilla_nondups = pd.read_csv("/content/drive/My Drive/Duplicate Bug Report/Mozilla/M_NonDuplicate BRs.csv",sep=";", engine='python')
# Mozilla_combined = pd.concat([Mozilla_dups, Mozilla_nondups], ignore_index=True, sort=False)

# ThunderBird_dups = pd.read_csv("/content/drive/My Drive/Duplicate Bug Report/ThunderBird/dup_TB.csv",sep=";", engine='python')
# ThunderBird_nondups = pd.read_csv("/content/drive/My Drive/Duplicate Bug Report/ThunderBird/Nondup_TB.csv",sep=";", engine='python')
# ThunderBird_combined = pd.concat([ThunderBird_dups, ThunderBird_nondups], ignore_index=True, sort=False)

In [7]:
#Checking distribution of Label column
Eclipse_combined.Label.value_counts()

0    34222
1    12686
Name: Label, dtype: int64

In [8]:
# Creating new column to concat Title and Description into one column called Report for the pair of bug reports. 
Eclipse_combined['Report1'] = Eclipse_combined['Title1'] +" "+ Eclipse_combined['Description1']
Eclipse_combined['Report2'] = Eclipse_combined['Title2'] +" "+ Eclipse_combined['Description2']

In [20]:
#View the records in the X_train dataframe
X_train.head(3)

Unnamed: 0,Report1_n,Report2_n
11048,npe prevents from applying a patch team apply...,npe in label.setbackground choosing show in br...
20231,dav site explorer should update lazily as you ...,keybindings copy action in view. binding crtlc...
26892,preferences invalid preference pages are still...,rundebug hoverbehavior incorrect on workspace ...


In [27]:
# Split to train test
Train, Test = train_test_split(Eclipse_combined, test_size=0.33, random_state=42)

# Load training set and resetting index
train_df = Train.reset_index()
for q in ['Report1', 'Report2']:
    train_df[q + '_n'] = train_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
use_w2v = True

#trigerring make_w2v_embeddings support function from 'util.py' for the word embedding and vectorization process
#detailed in the final Paper submission
train_df, embeddings = make_w2v_embeddings(train_df, embedding_dim=embedding_dim, empty_w2v=use_w2v)

# Split to train validation, assigning size
validation_size = int(len(train_df) * 0.1)
training_size = len(train_df) - validation_size

#Selecting X and y
X = train_df[['Report1_n','Report2_n']]
Y = train_df['Label']

#Train and validation split
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size)

#trigerring split_and_zero_padding support function from 'util.py' for both train and validation dataframes
X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)

# Convert labels to their numpy representations
Y_train = Y_train.values
Y_validation = Y_validation.values

# Make sure shape of dataframe is as expected
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

# -- BEGIN MODEL ----

# Model variables
gpus = 2
batch_size = 1024 * gpus
n_epoch = 50
n_hidden = 50

# Define the shared sequential model
x = Sequential()
#using word2vec generated embeddings
x.add(Embedding(len(embeddings), embedding_dim,
                weights=[embeddings], input_shape=(max_seq_length,), 
                trainable=False))
#Adding the LSTM layer for the Siamese Signal Subnet Compression
x.add(LSTM(n_hidden))
shared_model = x

# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# Rolled up into a Manhattan Distance model
malstm_distance = ManDist()([shared_model(left_input), shared_model(right_input)])
model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

# Build model with MSE loss, Adam optimizer and optimizing for accuracy 
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

#Display Model Summary
model.summary()

# Start training over the epochs
training_start_time = time()
malstm_trained = model.fit([X_train['left'], X_train['right']], Y_train,
                           batch_size=batch_size, epochs=n_epoch,
                           validation_data=([X_validation['left'], X_validation['right']], Y_validation))
training_end_time = time()

print("Training time finished.\n%d epochs in %12.2f" % (n_epoch,training_end_time - training_start_time))


#Saving the model to a file for inference 
model.save('./data/BugClassifierLSTM.h5')


Loading word2vec model(it may takes 2-3 mins) ...
1,000 sentences embedded.
2,000 sentences embedded.
3,000 sentences embedded.
4,000 sentences embedded.
5,000 sentences embedded.
6,000 sentences embedded.
7,000 sentences embedded.
8,000 sentences embedded.
9,000 sentences embedded.
10,000 sentences embedded.
11,000 sentences embedded.
12,000 sentences embedded.
13,000 sentences embedded.
14,000 sentences embedded.
15,000 sentences embedded.
16,000 sentences embedded.
17,000 sentences embedded.
18,000 sentences embedded.
19,000 sentences embedded.
20,000 sentences embedded.
21,000 sentences embedded.
22,000 sentences embedded.
23,000 sentences embedded.
24,000 sentences embedded.
25,000 sentences embedded.
26,000 sentences embedded.
27,000 sentences embedded.
28,000 sentences embedded.
29,000 sentences embedded.
30,000 sentences embedded.
31,000 sentences embedded.


2022-03-06 17:52:25.257056: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 50)           51270300    ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 man_dist (ManDist)             (None, 1)            0           ['sequential[0][0]',         

In [47]:
import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt


# Plot accuracy
plt.subplot(211)
plt.plot(malstm_trained.history['accuracy'])
plt.plot(malstm_trained.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot loss
plt.subplot(212)
plt.plot(malstm_trained.history['loss'])
plt.plot(malstm_trained.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.tight_layout(h_pad=1.0)
plt.savefig('./data/history-graph.png')

print(str(malstm_trained.history['val_accuracy'][-1])[:6] +
      "(max: " + str(max(malstm_trained.history['val_accuracy']))[:6] + ")")
print("Done.")

0.7351(max: 0.7373)
Done.


In [49]:
#Viewing the first few data points of the Test dataset
X_test.head(5)

Unnamed: 0,Report1_n,Report2_n
465,"[311, 1067, 8974, 15, 616, 81, 8974, 7, 199, 1...","[1067, 576, 999, 8975, 8975, 1979, 8976, 1598,..."
39333,"[859, 40, 115, 7697, 1501, 45211, 8198, 613, 2...","[63, 65421, 3052, 1621, 980, 4746, 2389, 4538,..."
13524,"[2, 920, 1615, 914, 180, 341, 1451, 87057, 912...","[87058, 1120, 278, 87059, 2407, 100, 3565, 376..."
32453,"[7472, 28229, 165315, 563, 319, 2123, 2796, 64...","[8935, 3456, 4114, 1026, 189, 4344, 3968, 340,..."
37068,"[7472, 4999, 4994, 89, 1454, 76, 964, 81, 89, ...","[405, 313, 7322, 68, 751, 1548, 10718, 986, 11..."


In [60]:
#Loading libraries for test run
import pandas as pd
from util import make_w2v_embeddings
from util import split_and_zero_padding
from util import ManDist

# Load testing set
test_df = X_test

# Make sure shape is ok
assert X_test['left'].shape == X_test['right'].shape

# --  INFERENCE FROM MODEL ---

#Loading previously saved model
model = keras.models.load_model('./data/BugClassifierLSTM.h5', custom_objects={'ManDist': ManDist})
model.summary()

#making predictions on test set
y_test_pred = model.predict([X_test['left'], X_test['right']])

#Setting threshold for Manhattan distance cut-off
threshold = 0.5

#Predicting Label for each value in the test set
Y_pred_final = np.where(y_test_pred > threshold, 1,0)

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_17 (InputLayer)          [(None, 20)]         0           []                               
                                                                                                  
 input_18 (InputLayer)          [(None, 20)]         0           []                               
                                                                                                  
 sequential_8 (Sequential)      (None, 50)           66271200    ['input_17[0][0]',               
                                                                  'input_18[0][0]']               
                                                                                                  
 man_dist_8 (ManDist)           (None, 1)            0           ['sequential_8[0][0]',     

In [61]:
print(Y_pred_final)

[[0]
 [1]
 [0]
 ...
 [0]
 [0]
 [0]]


In [62]:
#Printing the performance of results obtained from model on the test set
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report,confusion_matrix

print("ROC_AUC_score of model : ",roc_auc_score(y_test, Y_pred_final))
print(classification_report(y_test, Y_pred_final))
print(confusion_matrix(y_test, Y_pred_final))

ROC_AUC_score of model :  0.5335104576737897
              precision    recall  f1-score   support

           0       0.74      0.97      0.84     11238
           1       0.55      0.10      0.16      4242

    accuracy                           0.73     15480
   macro avg       0.65      0.53      0.50     15480
weighted avg       0.69      0.73      0.65     15480

[[10905   333]
 [ 3832   410]]
