In [1]:
#importing the different packages to be used.
import pandas as pd
import nltk
import gensim
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.parsing.preprocessing import STOPWORDS
import numpy as np
import pickle
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swarn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\swarn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#Eclipse duplicate bug pairs data loaded into a dataframe
df_duplicate = pd.read_csv('EP_dup.csv', sep=';'  , engine='python')
df_duplicate.head()

Unnamed: 0,Issue_id,Duplicated_issue,Title1,Description1,Title2,Description2,Label
0,25,28126,cvs ui need vcm prefs default repo connection gc,it would be helpful if there was a notion of d...,wizards patch standard public cvs repositories,this patch adds a convenient way to check thin...,1
1,40,20,need connect to team stream gcqpkw,i would like to be able to connect to a team s...,workspace files,thought it would be useful if the set of repo ...,1
2,48,22,make sure can future store other project refer...,project references come in three flavours . p...,persist sharing recommendations and project ve...,project descriptions dont store sharing recomm...,1
3,61,60,.vcmmeta showing as change gdqtgw,useruser install drop into declipse user ...,need custom .vcmignore comparemerge gdqt,useruser install drop into declipse user ...,1
4,94,2,repositories view all file types open to the t...,when browsing files in the repositories view i...,opening repository resources doesnt honor type...,opening repository resource open the default ...,1


In [3]:
#checking the no of records with Label 1(duplicate bug pairs)
df_duplicate['Label'].value_counts()

1    12686
Name: Label, dtype: int64

In [4]:
#Eclipse non duplicate bug pairs data loaded into a dataframe
df_nonduplicate = pd.read_csv('EP_nondup.csv', sep=';'  , engine='python')
df_nonduplicate.head()

Unnamed: 0,Issue_id,Duplicated_issue,Title1,Description1,Title2,Description2,Label
0,1,92840,usability issue with external editors geirl,setup a project that contains a .gif resource ...,api deleted field idebuguiconstants.imgactsynced,. head idebuguiconstants public static final ...,0
1,2,92844,opening repository resources doesnt honor type...,opening repository resource open the default ...,increase scrolling speed based on distance fro...,currently the speed of selection scrolling whe...,0
2,3,92850,sync does not indicate deletion gien,kmpm \tthis pr about the deletion indicator in...,jface api public api nonconstant field columnp...,. head columnpixeldata public boolean addtrim...,0
3,4,92851,need better error message if catching up over ...,become synchronized with some project in a rep...,viewers api protected field structuredviewer.c...,. head colorandfontcollector protected colora...,0
4,5,92852,isharingmanager sharing api inconsistent gaulh,for gettingsetting the managed state of a reso...,. ma nullpointerexception when install spindle...,steps . menu help software updates find and...,0


In [5]:
#checking the no of records with Label 0(non duplicate bug pairs)
df_nonduplicate['Label'].value_counts()

0    34222
Name: Label, dtype: int64

In [6]:
#Checking if any missing values are present.
df_nonduplicate['Description2'].isnull().sum() #no null values detected

0

In [7]:
#Combining eclipse duplicate data and non duplicate data into one dataframe
df_eclipse = pd.concat([df_duplicate, df_nonduplicate], ignore_index=True, sort=False)
df_eclipse.tail()

Unnamed: 0,Issue_id,Duplicated_issue,Title1,Description1,Title2,Description2,Label
46903,92835,424602,viewers retrieving projects from cvs creates d...,observed in i and i did not happen in m happen...,learn how to report a bug report,a invalid bug report just have a try. i am .,0
46904,92836,424655,api missing since . tag on abstractdebugview.s...,. head abstractdebugview.setviewerviewer priv...,projectdescription name properties dosent work,i have a project place in folder core here is ...,0
46905,92838,424656,api add missing since . tags on api added in ....,. head my first random probe of a field added...,m failed related to rcp.config,httpdownload.eclipse.orgeclipsedownloadsdropsm...,0
46906,424672,424658,menu bar does not display,when i click on one of the buttons on the menu...,eclipse kepler often crash on ubuntu . and jro...,eclipse kepler often crash on ubuntu . and ora...,0
46907,424764,424714,crash macos getivar,processeclipse pathapplicationseclipse.appecl...,gtklinux blank windows with gtk,i started testing . and noticed the editor win...,0


In [8]:
#Checking if combined correctly. Seems okay.
df_eclipse['Label'].value_counts()

0    34222
1    12686
Name: Label, dtype: int64

In [9]:
#Combining columns Title1 and Description1 to form Report1 (indicating the first bug report). 
#Combining columns Title2 and Description2 to form column Report2(indicating the second bug report)
df_eclipse['Report1'] = df_eclipse['Title1'] +" "+ df_eclipse['Description1']
df_eclipse['Report2'] = df_eclipse['Title2'] +" "+ df_eclipse['Description2']
df_eclipse.head()

Unnamed: 0,Issue_id,Duplicated_issue,Title1,Description1,Title2,Description2,Label,Report1,Report2
0,25,28126,cvs ui need vcm prefs default repo connection gc,it would be helpful if there was a notion of d...,wizards patch standard public cvs repositories,this patch adds a convenient way to check thin...,1,cvs ui need vcm prefs default repo connection...,wizards patch standard public cvs repositories...
1,40,20,need connect to team stream gcqpkw,i would like to be able to connect to a team s...,workspace files,thought it would be useful if the set of repo ...,1,need connect to team stream gcqpkw i would lik...,workspace files thought it would be useful if ...
2,48,22,make sure can future store other project refer...,project references come in three flavours . p...,persist sharing recommendations and project ve...,project descriptions dont store sharing recomm...,1,make sure can future store other project refer...,persist sharing recommendations and project ve...
3,61,60,.vcmmeta showing as change gdqtgw,useruser install drop into declipse user ...,need custom .vcmignore comparemerge gdqt,useruser install drop into declipse user ...,1,.vcmmeta showing as change gdqtgw useruser i...,need custom .vcmignore comparemerge gdqt useru...
4,94,2,repositories view all file types open to the t...,when browsing files in the repositories view i...,opening repository resources doesnt honor type...,opening repository resource open the default ...,1,repositories view all file types open to the t...,opening repository resources doesnt honor type...


In [10]:
import re
import string

def transform(text):
    """
    A Preprocessing method to remove punctuation, remove words containing numbers, make string lowercase.
    """
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\w*\f\w*', '', text)
    text = re.sub('\(.*?\)', '', text)
    text = re.sub('\[.*]\)', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\t', '', text)
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

preprocessed = lambda x: transform(x)

In [11]:
# preprocessing the Report1 and Report2 columns to remove the punctuation etc.
df_eclipse['Report1'] = df_eclipse['Report1'].apply(transform)
df_eclipse['Report2'] = df_eclipse['Report2'].apply(transform)


In [12]:
#checking if everything okay.
df_eclipse.head()

Unnamed: 0,Issue_id,Duplicated_issue,Title1,Description1,Title2,Description2,Label,Report1,Report2
0,25,28126,cvs ui need vcm prefs default repo connection gc,it would be helpful if there was a notion of d...,wizards patch standard public cvs repositories,this patch adds a convenient way to check thin...,1,cvs ui need vcm prefs default repo connection...,wizards patch standard public cvs repositories...
1,40,20,need connect to team stream gcqpkw,i would like to be able to connect to a team s...,workspace files,thought it would be useful if the set of repo ...,1,need connect to team stream gcqpkw i would lik...,workspace files thought it would be useful if ...
2,48,22,make sure can future store other project refer...,project references come in three flavours . p...,persist sharing recommendations and project ve...,project descriptions dont store sharing recomm...,1,make sure can future store other project refer...,persist sharing recommendations and project ve...
3,61,60,.vcmmeta showing as change gdqtgw,useruser install drop into declipse user ...,need custom .vcmignore comparemerge gdqt,useruser install drop into declipse user ...,1,vcmmeta showing as change gdqtgw useruser in...,need custom vcmignore comparemerge gdqt userus...
4,94,2,repositories view all file types open to the t...,when browsing files in the repositories view i...,opening repository resources doesnt honor type...,opening repository resource open the default ...,1,repositories view all file types open to the t...,opening repository resources doesnt honor type...


In [13]:
#we can drop issue_id, Duplicated_issue, title1, description1, title2, descriprtion2 from our dataframe.
df_eclipse.drop(columns = ['Issue_id', 'Duplicated_issue','Title1','Description1', 'Title2', 'Description2'], inplace=True)

In [14]:
#dataframe after dropping the columns in the above step.

df_eclipse.head()

Unnamed: 0,Label,Report1,Report2
0,1,cvs ui need vcm prefs default repo connection...,wizards patch standard public cvs repositories...
1,1,need connect to team stream gcqpkw i would lik...,workspace files thought it would be useful if ...
2,1,make sure can future store other project refer...,persist sharing recommendations and project ve...
3,1,vcmmeta showing as change gdqtgw useruser in...,need custom vcmignore comparemerge gdqt userus...
4,1,repositories view all file types open to the t...,opening repository resources doesnt honor type...


In [15]:
# used tf-idf vectorizer. TF: frequency of word in a bug report. IDF: how important a word is in context to different
# bug reports. 
tfidf_eclipse = TfidfVectorizer(analyzer = 'word',
                        stop_words = 'english',
                        lowercase = True,
                        max_features = 300,
                        norm = 'l1')

In [16]:
# training the tfidf model in the whole bug report words corpus
bag_of_words_eclipse = pd.concat([df_eclipse.Report1,df_eclipse.Report2], axis = 0)
tfidf_eclipse.fit(bag_of_words_eclipse)

TfidfVectorizer(max_features=300, norm='l1', stop_words='english')

In [17]:
# vectorizing both the bug reports individually
duplicate_report1 = tfidf_eclipse.transform(df_eclipse.Report1)
duplicate_report2 = tfidf_eclipse.transform(df_eclipse.Report2)

In [18]:
duplicate_report1

<46908x300 sparse matrix of type '<class 'numpy.float64'>'
	with 645204 stored elements in Compressed Sparse Row format>

In [19]:
duplicate_report2

<46908x300 sparse matrix of type '<class 'numpy.float64'>'
	with 643235 stored elements in Compressed Sparse Row format>

In [20]:
# Since we are looking at pairs of data, we will be taking the difference of all bug reports 1 
#  and bug report 2 pairs with this. Resulting matrix has same no of rows and 
# a vector describing the relationship between the two bug reports
X = abs(duplicate_report1 - duplicate_report2)
Y = df_eclipse['Label']

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [22]:
#Train and test dataset split (80% train data and 20% test data)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Random Forest

In [23]:
#Defining a random forest model with default hyperparameters.
rf = RandomForestClassifier()

In [24]:
#training the model on the training data set
rf.fit(X_train, y_train)

RandomForestClassifier()

In [25]:
#predicting on the test data set.
y_pred_test = rf.predict(X_test)


In [26]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score


In [27]:
# confusion matrix for the rf model.
confusion_matrix(y_test, y_pred_test)

array([[6778,   90],
       [1924,  590]], dtype=int64)

In [28]:
#classification report to evaluate the performance of the rf model.
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.78      0.99      0.87      6868
           1       0.87      0.23      0.37      2514

    accuracy                           0.79      9382
   macro avg       0.82      0.61      0.62      9382
weighted avg       0.80      0.79      0.74      9382



In [31]:
#calculating the ROC AUC score for the rf model.
rf_probs = rf.predict_proba(X_test)
rf_probs = rf_probs[:, 1]
# calculate scores
rf_auc = roc_auc_score(y_test, rf_probs)
print('Random forest: ROC AUC=%.3f' % (rf_auc))

Random forest: ROC AUC=0.751


#### Another TF-IDF model

In [29]:
# used tf-idf vectorizer. TF: frequency of word in a bug report. IDF: how important a word is in context to different
# bug reports. Different parameters than the tf-idf model used above. 

tfidf_eclipse = TfidfVectorizer(analyzer = 'word',
                        stop_words = 'english',
                        lowercase = True,
                        max_features = 500,
                        norm = 'l2')

In [30]:
# training the tfidf model in the whole bug report words corpus
bag_of_words_eclipse = pd.concat([df_eclipse.Report1,df_eclipse.Report2], axis = 0)
tfidf_eclipse.fit(bag_of_words_eclipse)

TfidfVectorizer(max_features=500, stop_words='english')

In [31]:
# vectorizing both the bug reports individually
duplicate_report1 = tfidf_eclipse.transform(df_eclipse.Report1)
duplicate_report2 = tfidf_eclipse.transform(df_eclipse.Report2)

In [32]:
# Since we are looking at pairs of data, we will be taking the difference of all bug reports 1 
# one and bug report 2 pairs with this. Resulting matrix has same no of rows and 
# a vector describing the relationship between the two bug reports
X = abs(duplicate_report1 - duplicate_report2)
Y = df_eclipse['Label']

In [33]:
#Train and test dataset split (80% train data and 20% test data)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [34]:
#Defining a random forest model with custom hyperparameters.
rf = RandomForestClassifier(n_estimators = 200,
                            min_samples_leaf = 10,
                            n_jobs = -1)

In [35]:
#training the model on the training data set
rf.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=10, n_estimators=200, n_jobs=-1)

In [36]:
#predicting on the test data set.
y_pred_test = rf.predict(X_test)


In [37]:
# confusion matrix for the above rf model
confusion_matrix(y_test, y_pred_test)

array([[6818,   50],
       [2078,  436]], dtype=int64)

In [38]:
#classification report to evaluate the performance of the rf model.
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.77      0.99      0.87      6868
           1       0.90      0.17      0.29      2514

    accuracy                           0.77      9382
   macro avg       0.83      0.58      0.58      9382
weighted avg       0.80      0.77      0.71      9382



In [39]:
#calculating the ROC AUC score for the rf model.
rf_probs = rf.predict_proba(X_test)
rf_probs = rf_probs[:, 1]
# calculate scores
rf_auc = roc_auc_score(y_test, rf_probs)
print('Random forest: ROC AUC=%.3f' % (rf_auc))

Random forest: ROC AUC=0.791
