In [2]:
# Loading all required libraries
import pandas as pd
import nltk
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
import pickle
import gensim
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split

import keras
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, LSTM, GRU, Conv1D, Conv2D, GlobalMaxPool1D, Dense, Dropout
from sklearn.model_selection import train_test_split


# This will prompt for authorization.
# drive.mount('/content/drive')

In [3]:
#Tokeniztion and stop word removal function
def tokenization(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS:
            result.append(token)
    return result

preprocessed = lambda x: transform(x)

In [4]:
#All the data files from the various projects

#Eclipse BugReports
Eclip_Non_Dup = './dataBR/Eclipse/EP_nondup.csv'
Eclip_Dup = './dataBR/Eclipse/EP_dup.csv'


#Mozilla BugReports
Mozilla_Non_Dup = './dataBR/Mozilla/M_NonDuplicate.csv'
Mozilla_Dup = './dataBR/Mozilla/M_Duplicate.csv'

#ThunderBird Bug Reports
ThunderBird_Non_Dup = './dataBR/ThunderBird/Nondup_TB.csv'
ThunderBird_Dup = './dataBR/ThunderBird/dup_TB.csv'


#Eclipse dataframes
Eclipse_dups = pd.read_csv(Eclip_Dup,sep=";", engine='python')
Eclipse_nondups = pd.read_csv(Eclip_Non_Dup,sep=";", engine='python')
Eclipse_combined = pd.concat([Eclipse_dups, Eclipse_nondups], ignore_index=True, sort=False)
Eclipse_combined['Report1'] = Eclipse_combined['Title1'] +" "+ Eclipse_combined['Description1']
Eclipse_combined['Report2'] = Eclipse_combined['Title2'] +" "+ Eclipse_combined['Description2']


#Mozilla dataframes
Mozilla_dups = pd.read_csv(Mozilla_Dup,sep=";", engine='python')
Mozilla_nondups = pd.read_csv(Mozilla_Non_Dup,sep=";", engine='python')
Mozilla_combined = pd.concat([Mozilla_dups, Mozilla_nondups], ignore_index=True, sort=False)
Mozilla_combined['Report1'] = Mozilla_combined['Title1'] +" "+ Mozilla_combined['Description1']
Mozilla_combined['Report2'] = Mozilla_combined['Title2'] +" "+ Mozilla_combined['Description2']


#ThunderBird dataframes
ThunderBird_dups = pd.read_csv(ThunderBird_Dup,sep=";", engine='python')
ThunderBird_nondups = pd.read_csv(ThunderBird_Non_Dup,sep=";", engine='python')
ThunderBird_combined = pd.concat([ThunderBird_dups, ThunderBird_nondups], ignore_index=True, sort=False)
ThunderBird_combined['Report1'] = ThunderBird_combined['Title1'] +" "+ ThunderBird_combined['Description1']
ThunderBird_combined['Report2'] = ThunderBird_combined['Title2'] +" "+ ThunderBird_combined['Description2']

In [5]:
#Checking distribution of Label column
Eclipse_combined.Label.value_counts()

0    34222
1    12686
Name: Label, dtype: int64

In [6]:
#Train, test and validation splits for all 3 projects
#Validation and Test frames are of equal size : 20% of overall dataset each

Train_Eclipse_intermediate, Test_Eclipse = train_test_split(Eclipse_combined, test_size=0.2, random_state=42)
Train_Eclipse, Validation_Eclipse = train_test_split(Train_Eclipse_intermediate, test_size=0.25, random_state=56)

Train_Mozilla_intermediate, Test_Mozilla = train_test_split(Mozilla_combined, test_size=0.2, random_state=78)
Train_Mozilla, Validation_Mozilla = train_test_split(Train_Mozilla_intermediate, test_size=0.25, random_state=56)

Train_ThunderBird_intermediate, Test_intermediate = train_test_split(ThunderBird_combined, test_size=0.20, random_state=98)
Train_ThunderBird, Validation_ThunderBird = train_test_split(Train_ThunderBird_intermediate, test_size=0.25, random_state=56)

In [8]:
#Running word2vec.py to build vocabulary with Google News dataset and all bug reports in project
#Run only once


#%run -i 'word2vec.py'

In [9]:
from sklearn.model_selection import train_test_split

# Load training set and resetting index
train_df = Train_Eclipse_intermediate.reset_index()
for q in ['Report1', 'Report2']:
    train_df[q + '_n'] = train_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
use_w2v = False

#trigerring make_w2v_embeddings support function from 'util.py' for the word embedding and vectorization process
#detailed in the final Paper submission
train_df, embeddings = make_w2v_embeddings(train_df, embedding_dim=embedding_dim, empty_w2v=use_w2v)

# Split to train validation, assigning size
X = train_df[['Report1_n','Report2_n']]
Y = train_df['Label']

#Train and validation split
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.25)

#trigerring split_and_zero_padding support function from 'util.py' for both train and validation dataframes
X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)

# Convert labels to their numpy representations
Y_train = Y_train.values
Y_validation = Y_validation.values

# Make sure shape of dataframe is as expected
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

# -- BEGIN MODEL ----

# Model variables
gpus = 2
batch_size = 1024 * gpus
n_epoch = 50
n_hidden = 50

# Define the shared model
x = Sequential()
#using word2vec generated embeddings
x.add(Embedding(len(embeddings), embedding_dim,
                weights=[embeddings], input_shape=(max_seq_length,), trainable=False))
#Adding the LSTM layer for the Siamese Signal Subnet Compression
x.add(LSTM(n_hidden))
shared_model = x

# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# Rolled up into a Manhattan Distance model
malstm_distance = ManDist()([shared_model(left_input), shared_model(right_input)])
model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

# Build model with MSE loss, Adam optimizer and optimizing for accuracy 
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

#Display Model Summary
model.summary()

# Start training over the epochs
training_start_time = time()
malstm_trained = model.fit([X_train['left'], X_train['right']], Y_train,
                           batch_size=batch_size, epochs=n_epoch,
                           validation_data=([X_validation['left'], X_validation['right']], Y_validation))
training_end_time = time()

print("Training time finished.\n%d epochs in %12.2f" % (n_epoch,
                                                        training_end_time - training_start_time))
#Saving the model to a file for inference 
model.save('./data/BugClassifierLSTM.h5')



Loading word2vec model(it may takes 2-3 mins) ...
WORD2VEC loaded
1,000 sentences embedded.
2,000 sentences embedded.
3,000 sentences embedded.
4,000 sentences embedded.
5,000 sentences embedded.
6,000 sentences embedded.
7,000 sentences embedded.
8,000 sentences embedded.
9,000 sentences embedded.
10,000 sentences embedded.
11,000 sentences embedded.
12,000 sentences embedded.
13,000 sentences embedded.
14,000 sentences embedded.
15,000 sentences embedded.
16,000 sentences embedded.
17,000 sentences embedded.
18,000 sentences embedded.
19,000 sentences embedded.
20,000 sentences embedded.
21,000 sentences embedded.
22,000 sentences embedded.
23,000 sentences embedded.
24,000 sentences embedded.
25,000 sentences embedded.
26,000 sentences embedded.
27,000 sentences embedded.
28,000 sentences embedded.
29,000 sentences embedded.
30,000 sentences embedded.
31,000 sentences embedded.
32,000 sentences embedded.
33,000 sentences embedded.
34,000 sentences embedded.
35,000 sentences embedded

2022-04-24 19:43:59.771390: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 50)           57562500    ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 man_dist (ManDist)             (None, 1)            0           ['sequential[0][0]',         

In [15]:
#Loading libraries for test run
from util import make_w2v_embeddings
from util import split_and_zero_padding
from util import ManDist

# Load testing set
y_test = ThunderBird_combined['Label'].values
test_df = ThunderBird_combined
for q in ['Report1', 'Report2']:
    test_df[q + '_n'] = test_df[q]

# Make word2vec embeddings on test
embedding_dim = 300
max_seq_length = 20
test_df, embeddings = make_w2v_embeddings(test_df, embedding_dim=embedding_dim, empty_w2v=False)

# Split to dicts and append zero padding.
X_test = split_and_zero_padding(test_df, max_seq_length)

# Make sure everything is ok
assert X_test['left'].shape == X_test['right'].shape

# --

model = keras.models.load_model('./data/BugClassifierLSTM.h5', custom_objects={'ManDist': ManDist})
model.summary()

y_test_pred = model.predict([X_test['left'], X_test['right']])




Loading word2vec model(it may takes 2-3 mins) ...
WORD2VEC loaded
1,000 sentences embedded.
2,000 sentences embedded.
3,000 sentences embedded.
4,000 sentences embedded.
5,000 sentences embedded.
6,000 sentences embedded.
7,000 sentences embedded.
8,000 sentences embedded.
9,000 sentences embedded.
10,000 sentences embedded.
11,000 sentences embedded.
12,000 sentences embedded.
13,000 sentences embedded.
14,000 sentences embedded.
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                       

In [18]:
#Making a prediction on across different project dataset (testdataset)
threshold = 0.1
Y_pred_final = np.where(y_test_pred > threshold, 1,0)
print(Y_pred_final)

[[0]
 [1]
 [0]
 ...
 [0]
 [0]
 [0]]


In [19]:
#Printing the performance of results obtained from model on the test set
#For the dataset selected for evaluation above and at specified threshold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report,confusion_matrix

print("ROC_AUC_score of model : ",roc_auc_score(y_test, Y_pred_final))
print(classification_report(y_test, Y_pred_final))
print(confusion_matrix(y_test, Y_pred_final))

ROC_AUC_score of model :  0.519858168433065
              precision    recall  f1-score   support

           0       0.71      0.58      0.64      9905
           1       0.33      0.46      0.38      4358

    accuracy                           0.54     14263
   macro avg       0.52      0.52      0.51     14263
weighted avg       0.59      0.54      0.56     14263

[[5755 4150]
 [2359 1999]]
