# Embeddings
This notebook imports the specified dataset (mozilla-firefox by default), samples a small subset of it, and creates a new dataframe containing the document-level embeddings together with their issue ids. The result is saved in the data folder as a pickle file named after the specified model.

In [1]:
import pandas as pd
import numpy as np
import sys
import os
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


### Define Model

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the sentence-embedding model onto the selected device.
# NOTE: the same checkpoint is instantiated again in the "Specify the Model" cell further down.
model = SentenceTransformer(
    "sentence-transformers/multi-qa-mpnet-base-dot-v1",
    device=device,
)
device

'cuda'

### Import Data
- Specify the dataset
- Create "Content" column by concatenating Title and Description for document-level embeddings
- Typecast Duplicated_issue entries to integer

In [3]:
# Base name of the dataset; the cells below use it to build the pickle paths under ../data/.
dataset_name = 'mozilla_firefox'

In [4]:
# Load the pickled issue DataFrame for the selected dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
source_pickle = f"../data/{dataset_name}.pkl"
df = pd.read_pickle(source_pickle)

In [5]:
df.head()

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
33591,335190,--,General,Ctrl+C does not copy,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-04-23 18:45:42 -0700,2006-04-23 19:50:22 -0700,Ctrl+C does not copy User-Agent: Mozilla...,"[335186, 334862]",2
21703,294616,--,Menus,Open URL in context menu,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-05-18 02:59:14 -0700,2006-01-19 00:20:44 -0800,Open URL in context menu User-Agent: Moz...,"[236336, 227922, 454518]",3
104453,787029,--,Untriaged,movement was canceled in web page Navigation...,User Agent: Mozilla/5.0 (Windows NT 6.1; WOW64...,RESOLVED,DUPLICATE,14 Branch,2012-08-30 06:12:03 -0700,2012-08-30 06:30:58 -0700,movement was canceled in web page Navigation...,"[651803, 787021, 787022]",3
1900,203901,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:21:59 -0700,2006-11-13 07:16:57 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1898,203898,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:53 -0700,2006-11-13 07:23:00 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3


### Data Sampling
Because compute power is limited, we work with a subset of the data.

In [6]:
# Look up the row(s) for a single issue id.
# This is how individual duplicates are located by hand.
def find_row_by_issue_id(df, issue_id):
    """Return the rows of ``df`` whose 'Issue_id' equals ``issue_id`` (possibly none)."""
    matches = df['Issue_id'].eq(issue_id)
    return df.loc[matches]

def find_rows_by_issue_ids(dataframe, issue_ids):
    """
    Select every row whose 'Issue_id' appears in ``issue_ids``.

    :param dataframe: pandas DataFrame containing an 'Issue_id' column
    :param issue_ids: List of issue IDs to keep
    :return: DataFrame restricted to the matching rows, in their original order
             and with their original index labels
    """
    wanted = dataframe['Issue_id'].isin(issue_ids)
    return dataframe.loc[wanted]


In [7]:
# Use a subset of the data for testing.
# Set SUBSET_SIZE to None to use the entire dataset. Do not comment the assignment
# out: `df_subset_without_duplicates` is read by the duplicate-insertion cell below.
SUBSET_SIZE = 100

df_subset_without_duplicates = df if SUBSET_SIZE is None else df.head(SUBSET_SIZE)

df_subset_without_duplicates.head()

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
33591,335190,--,General,Ctrl+C does not copy,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-04-23 18:45:42 -0700,2006-04-23 19:50:22 -0700,Ctrl+C does not copy User-Agent: Mozilla...,"[335186, 334862]",2
21703,294616,--,Menus,Open URL in context menu,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-05-18 02:59:14 -0700,2006-01-19 00:20:44 -0800,Open URL in context menu User-Agent: Moz...,"[236336, 227922, 454518]",3
104453,787029,--,Untriaged,movement was canceled in web page Navigation...,User Agent: Mozilla/5.0 (Windows NT 6.1; WOW64...,RESOLVED,DUPLICATE,14 Branch,2012-08-30 06:12:03 -0700,2012-08-30 06:30:58 -0700,movement was canceled in web page Navigation...,"[651803, 787021, 787022]",3
1900,203901,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:21:59 -0700,2006-11-13 07:16:57 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1898,203898,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:53 -0700,2006-11-13 07:23:00 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3


### Adding Duplicates

- We want to keep approximately the same ratio of duplicates as in the original data (21%).
- So let's add around 25 duplicates to our 100-row dataframe.
- We also pick them from different sections of the dataframe so that the duplicates are more spread out.
 

In [8]:
df.iloc[0:10]

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
33591,335190,--,General,Ctrl+C does not copy,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-04-23 18:45:42 -0700,2006-04-23 19:50:22 -0700,Ctrl+C does not copy User-Agent: Mozilla...,"[335186, 334862]",2
21703,294616,--,Menus,Open URL in context menu,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-05-18 02:59:14 -0700,2006-01-19 00:20:44 -0800,Open URL in context menu User-Agent: Moz...,"[236336, 227922, 454518]",3
104453,787029,--,Untriaged,movement was canceled in web page Navigation...,User Agent: Mozilla/5.0 (Windows NT 6.1; WOW64...,RESOLVED,DUPLICATE,14 Branch,2012-08-30 06:12:03 -0700,2012-08-30 06:30:58 -0700,movement was canceled in web page Navigation...,"[651803, 787021, 787022]",3
1900,203901,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:21:59 -0700,2006-11-13 07:16:57 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1898,203898,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:53 -0700,2006-11-13 07:23:00 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1897,203897,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:52 -0700,2006-11-13 07:22:55 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1896,203896,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:52 -0700,2006-11-13 07:22:49 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1895,203893,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:48 -0700,2006-11-13 07:22:37 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
14073,268255,--,General,Constant shaking / twitching of displayed web ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,1.0 Branch,2004-11-07 08:11:36 -0800,2005-03-10 05:21:33 -0800,Constant shaking / twitching of displayed web ...,"[175699, 267261, 200270]",3
22510,297306,--,Search,addEngine does not work on Firefox OS/X,User-Agent: Mozilla/5.0 (Macintosh; U; P...,VERIFIED,DUPLICATE,unspecified,2005-06-10 06:40:33 -0700,2005-06-14 08:39:16 -0700,addEngine does not work on Firefox OS/X User-A...,"[259072, 232638]",2


In [9]:
df.iloc[20:30]

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
106302,809129,--,Toolbars and Customization,add on bar is not working,,RESOLVED,DUPLICATE,unspecified,2012-11-06 10:59:21 -0800,2012-11-06 14:45:32 -0800,,"[804537, 813763, 809127]",3
2055,205901,--,Bookmarks & History,Bookmark toolbar menus remain open if submenu ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-05-15 23:29:06 -0700,2006-08-27 06:41:23 -0700,Bookmark toolbar menus remain open if submenu ...,"[197227, 203899, 210910]",3
11304,259265,--,Tabbed Browser,keyboard focus go to background tab if it has ...,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2004-09-14 04:30:40 -0700,2004-09-14 23:11:11 -0700,keyboard focus go to background tab if it has ...,"[259264, 245502]",2
50881,409563,--,Tabbed Browser,Hide tab in tabbar while dragging it; just kee...,User-Agent: Opera/9.25 (X11; Linux i686;...,RESOLVED,DUPLICATE,unspecified,2007-12-22 10:36:29 -0800,2008-01-28 22:43:24 -0800,Hide tab in tabbar while dragging it; just kee...,"[410972, 455694]",2
41426,364834,--,Tabbed Browser,Z-Order is not maintained beyond one tab,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2006-12-23 12:31:02 -0800,2006-12-23 12:46:07 -0800,Z-Order is not maintained beyond one tab User-...,"[364824, 514796, 364797]",3
46182,389939,--,Bookmarks & History,bookmarks take favicon of previously visited s...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2007-07-28 02:55:18 -0700,2007-09-11 04:57:50 -0700,bookmarks take favicon of previously visited s...,"[350440, 356777, 383453]",3
23777,301599,--,General,When a single word is typed in the Location Ba...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-07-21 10:59:36 -0700,2005-07-21 11:32:45 -0700,When a single word is typed in the Location Ba...,"[301600, 269519]",2
10640,256660,--,General,Location bar is a different size on secure sites,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2004-08-23 18:10:27 -0700,2004-10-21 12:09:59 -0700,Location bar is a different size on secure si...,"[337427, 263754, 344635]",3
22301,296625,--,General,Open Highlighted Links in Tabs,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-06-04 08:39:27 -0700,2005-06-04 12:31:41 -0700,Open Highlighted Links in Tabs User-Agent: ...,"[236336, 227922, 454518]",3
101580,751000,--,Untriaged,bookmarks icon wont show on toolbar,User Agent: Mozilla/5.0 (X11; Ubuntu; Linux i6...,RESOLVED,DUPLICATE,13 Branch,2012-05-01 20:00:48 -0700,2012-05-28 05:55:44 -0700,bookmarks icon wont show on toolbar User Agent...,"[751417, 631330, 581238]",3


In [10]:
# display df rows at positions 30 through 39
df.iloc[30:40]

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
99982,729936,--,Untriaged,Mouseover event handler failure on nearly hori...,User Agent: Mozilla/5.0 (Windows NT 6.0; rv:10...,RESOLVED,DUPLICATE,10 Branch,2012-02-23 07:21:24 -0800,2012-02-23 07:23:23 -0800,Mouseover event handler failure on nearly hori...,"[729932, 729935]",2
30994,325346,--,Tabbed Browser,waiting for FTP Transaction while have a link ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,1.5.0.x Branch,2006-01-31 10:43:15 -0800,2008-04-13 06:34:42 -0700,waiting for FTP Transaction while have a link ...,"[325345, 246801]",2
6675,238967,--,General,ctrl-w does not close the active tab if system...,User-Agent: Mozilla/5.0 (X11; U; FreeBSD...,RESOLVED,DUPLICATE,unspecified,2004-03-28 04:20:34 -0800,2004-04-04 14:05:26 -0700,ctrl-w does not close the active tab if system...,"[223675, 189615]",2
39159,356160,--,General,links cannot be clicked on but html appears valid,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2006-10-10 07:26:18 -0700,2006-10-10 07:28:24 -0700,links cannot be clicked on but html appears va...,"[356157, 356159]",2
2820,214458,--,General,Cannot initialize firebird from behind firewall,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-07-30 07:54:25 -0700,2004-02-06 17:17:44 -0800,Cannot initialize firebird from behind firewal...,"[205129, 215031, 213375]",3
24333,303418,--,Toolbars and Customization,URL drop down list immediately loses focus,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-08-04 09:00:20 -0700,2005-08-04 16:18:38 -0700,URL drop down list immediately loses focus Use...,"[273592, 295056]",2
35981,344533,--,Bookmarks & History,Dragging bookmarks in the menu causes the menu...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-07-13 07:57:43 -0700,2006-07-13 08:33:28 -0700,Dragging bookmarks in the menu causes the menu...,"[333602, 380301]",2
13091,265121,--,General,Latest Firefox wont start,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2004-10-19 11:44:04 -0700,2004-10-19 11:48:48 -0700,Latest Firefox wont start User-Agent: Mo...,"[254967, 265118, 265103]",3
13092,265122,--,General,Latest Firefox wont start,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2004-10-19 11:44:13 -0700,2004-10-19 11:49:07 -0700,Latest Firefox wont start User-Agent: Mo...,"[254967, 265118, 265103]",3
13116,265224,--,General,http://www.elektronik-kompendium.de/sites/net/...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2004-10-20 05:44:58 -0700,2004-10-20 06:02:50 -0700,http://www.elektronik-kompendium.de/sites/net/...,"[264154, 273358]",2


In [11]:
# display df row from 60 to 70 
df.iloc[60:70]

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
93039,650383,--,Extension Compatibility,http://www.mozilla.org/extension/manager;1,User-Agent: Mozilla/5.0 (Windows NT 6.1;...,RESOLVED,DUPLICATE,4.0 Branch,2011-04-15 14:05:14 -0700,2011-04-16 09:12:16 -0700,http://www.mozilla.org/extension/manager;1 Use...,"[578099, 650093]",2
97212,696126,--,Theme,Visual indication of unloaded tabs,Bug 648683 allows users to skip tabs from load...,RESOLVED,DUPLICATE,Trunk,2011-10-20 09:56:51 -0700,2011-10-20 13:23:54 -0700,Visual indication of unloaded tabs Bug 648683 ...,[675541],1
30841,324826,--,General,HP pavillion zv6000 synaptics touchpad does n...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-01-26 10:27:06 -0800,2006-01-26 11:30:34 -0800,HP pavillion zv6000 synaptics touchpad does n...,[324827],1
25166,306205,--,Bookmarks & History,Manage Bookmarks shows bookmarks in proper ord...,User-Agent: Mozilla/5.0 (Macintosh; U; P...,RESOLVED,DUPLICATE,unspecified,2005-08-27 18:12:38 -0700,2005-08-28 13:24:50 -0700,Manage Bookmarks shows bookmarks in proper ord...,"[295372, 280509]",2
97648,702239,--,General,textarea . 4.,User Agent: Mozilla/5.0 (Windows NT 6.1; WOW64...,RESOLVED,DUPLICATE,8 Branch,2011-11-14 05:13:10 -0800,2011-11-15 01:15:39 -0800,textarea . 4. User...,[702274],1
58682,438887,--,Bookmarks & History,Selecting a fair number of bookmarks and movin...,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2008-06-12 11:01:38 -0700,2009-11-26 07:13:51 -0800,Selecting a fair number of bookmarks and movin...,"[472343, 420199]",2
2795,214338,--,General,Immidiately Freezes Upon Load,User-Agent: Mozilla/4.0 (compatible; MSI...,VERIFIED,DUPLICATE,unspecified,2003-07-29 12:29:01 -0700,2003-08-18 00:19:51 -0700,Immidiately Freezes Upon Load User-Agent: ...,"[205129, 215031]",2
30912,325078,--,General,Restoring window from dock disables typing,User-Agent: Mozilla/5.0 (Macintosh; U; P...,RESOLVED,DUPLICATE,unspecified,2006-01-28 15:09:26 -0800,2006-01-29 13:20:14 -0800,Restoring window from dock disables typing Use...,"[269207, 274631]",2
8568,248869,--,File Handling,Clicking on PDF file freezes Firefox temporari...,User-Agent: Mozilla/4.0 (compatible; MSI...,RESOLVED,DUPLICATE,unspecified,2004-06-27 23:49:57 -0700,2005-03-22 20:32:53 -0800,Clicking on PDF file freezes Firefox temporari...,[253565],1
24299,303308,--,Bookmarks & History,Bottom of bookmark icons cutoff in Bookmarks s...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-08-03 15:08:34 -0700,2005-08-03 15:26:55 -0700,Bottom of bookmark icons cutoff in Bookmarks s...,"[301776, 302946]",2


In [12]:
df.iloc[80:90]


Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
2813,214426,--,General,does not launch a second time within the same ...,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2003-07-30 02:59:28 -0700,2003-08-03 19:06:08 -0700,does not launch a second time within the same ...,[212604],1
77517,539017,--,General,Firefox doesnt appropriately adjust scroll pos...,Note: Im using XMonad; in case the window mana...,RESOLVED,DUPLICATE,unspecified,2010-01-11 10:30:32 -0800,2010-01-11 10:37:21 -0800,Firefox doesnt appropriately adjust scroll pos...,"[252803, 452508]",2
18347,283087,--,Toolbars and Customization,unchecked (unselected) Toolbars reappear in ne...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-02-21 16:10:44 -0800,2006-12-12 00:14:42 -0800,unchecked (unselected) Toolbars reappear in ne...,[215489],1
65764,469282,--,Location Bar,Autocomplete function changed from Firefox 2 i...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2008-12-12 00:18:12 -0800,2008-12-13 06:54:30 -0800,Autocomplete function changed from Firefox 2 i...,"[413211, 409895]",2
101170,746494,--,Session Restore,sessionstore.js wiped/overwritten on private b...,User Agent: Mozilla/5.0 (Windows NT 6.1; WOW64...,RESOLVED,DUPLICATE,11 Branch,2012-04-18 00:03:20 -0700,2012-06-23 18:52:49 -0700,sessionstore.js wiped/overwritten on private b...,"[757056, 647655]",2
8367,248086,--,Migration,Firefox does not import IE favorites with Chin...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2004-06-22 00:55:35 -0700,2004-09-06 17:44:11 -0700,Firefox does not import IE favorites with Chin...,"[174734, 235495]",2
30838,324814,--,General,Firefox 1.5 is appending a full URL to relativ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-01-26 09:20:09 -0800,2006-01-26 09:22:29 -0800,Firefox 1.5 is appending a full URL to relativ...,[324801],1
65800,469460,--,General,After closing browser and trying to open in ag...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2008-12-13 03:20:05 -0800,2008-12-13 07:08:52 -0800,After closing browser and trying to open in ag...,[407981],1
2964,215180,--,Toolbars and Customization,Home button does not take you to web site save...,User-Agent: Mozilla/5.0 (Macintosh; U; P...,VERIFIED,DUPLICATE,unspecified,2003-08-05 13:15:52 -0700,2006-11-13 07:25:35 -0800,Home button does not take you to web site save...,"[206456, 223331]",2
2966,215191,--,General,Firebird will not start while Thunderbird (Moz...,User-Agent: Mozilla/5.0 (X11; U; Linux i...,VERIFIED,DUPLICATE,unspecified,2003-08-05 14:52:55 -0700,2003-08-08 07:00:57 -0700,Firebird will not start while Thunderbird (Moz...,[212604],1


In [13]:
df.iloc[90:100]


Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
30717,324379,--,General,Program forgets last opened sites afther crash...,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-01-23 00:23:52 -0800,2006-01-23 00:27:39 -0800,Program forgets last opened sites afther crash...,"[310261, 328159]",2
2979,215285,--,General,Can not run Firebird 0.6.1 on Windows XP,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-08-06 10:44:07 -0700,2003-08-18 00:10:07 -0700,Can not run Firebird 0.6.1 on Windows XP User-...,"[205129, 215031]",2
61485,449450,--,Bookmarks & History,Bookmark properties cause Firefox to hang,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2008-08-06 12:44:32 -0700,2009-11-26 05:25:08 -0800,Bookmark properties cause Firefox to hang User...,"[449385, 449221]",2
58587,438364,--,Menus,should be able to select search engine when se...,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2008-06-10 09:48:46 -0700,2008-06-10 10:10:00 -0700,should be able to select search engine when se...,"[348570, 248173]",2
70573,496616,--,General,Security icon in status bar too difficult to open,Using Mozilla/5.0 (Macintosh; U; Intel Mac OS ...,RESOLVED,DUPLICATE,3.5 Branch,2009-06-05 11:59:15 -0700,2009-06-19 14:46:35 -0700,Security icon in status bar too difficult to o...,"[251307, 432741]",2
8389,248180,--,Tabbed Browser,no option to disable new windows unless explic...,User-Agent: Mozilla/5.0 (X11; U; Linux i...,RESOLVED,DUPLICATE,unspecified,2004-06-22 10:36:32 -0700,2004-06-22 10:39:30 -0700,no option to disable new windows unless explic...,"[227241, 172962]",2
88281,614817,--,Panorama,Windows 7s Aero peek taskbar tabs preview has ...,User-Agent: Mozilla/5.0 (Windows NT 6.1;...,RESOLVED,DUPLICATE,Trunk,2010-11-25 07:22:10 -0800,2010-12-03 15:22:46 -0800,Windows 7s Aero peek taskbar tabs preview has ...,[587440],1
13617,266748,--,Keyboard Navigation,ctrl-u no longer clears a line of text; but in...,User-Agent: Mozilla/5.0 (X11; U; Linux i...,VERIFIED,DUPLICATE,unspecified,2004-10-29 12:20:00 -0700,2004-10-29 12:29:47 -0700,ctrl-u no longer clears a line of text; but in...,[260188],1
18177,282475,--,Bookmarks & History,Bookmark Icons,User-Agent: Mozilla/5.0 (Macintosh; U; P...,RESOLVED,DUPLICATE,unspecified,2005-02-16 09:24:52 -0800,2006-08-27 05:38:29 -0700,Bookmark Icons User-Agent: Mozilla/5.0 (...,[219846],1
30691,324290,--,Shell Integration,Sometimes; FF losts keyboard focus; mouse work...,User-Agent: Mozilla/5.0 (Macintosh; U; P...,VERIFIED,DUPLICATE,unspecified,2006-01-21 16:24:47 -0800,2006-04-21 07:22:22 -0700,Sometimes; FF losts keyboard focus; mouse work...,"[269207, 274631]",2


In [14]:

def insert_randomly(main_df, insert_dfs, seed=None):
    """
    Inserts the rows from the list of DataFrames (insert_dfs) into the main DataFrame (main_df) at random positions.

    Rows are inserted one after another. Each insertion point is drawn uniformly from
    ``[0, len(main_df))`` (the length of the *original* main_df) and is interpreted as a
    position in the frame as it stands at that moment, so the relative order of the
    original rows is preserved.

    :param main_df: The main DataFrame where the other DataFrames are inserted. It is not modified.
    :param insert_dfs: A list of DataFrames to insert into main_df.
    :param seed: Optional integer seed for reproducible insertion points. When None
                 (the default), NumPy's global random state is used, so a prior
                 ``np.random.seed(...)`` call is honoured.
    :return: A new DataFrame with the inserted rows at random positions and a fresh
             0..n-1 index. If there is nothing to insert, a copy of main_df is
             returned with its index untouched.
    """
    insert_dfs = list(insert_dfs)
    if not insert_dfs:
        # pd.concat([]) raises; with nothing to insert the frame is simply unchanged.
        return main_df.copy()

    # Concatenate all the duplicates DataFrames into one for easier manipulation
    df_to_insert = pd.concat(insert_dfs).reset_index(drop=True)
    if df_to_insert.empty:
        return main_df.copy()

    n_main = len(main_df)
    rng = np.random if seed is None else np.random.RandomState(seed)
    # randint requires high > low, so an empty main_df gets the single valid position 0.
    insertion_points = rng.randint(0, max(n_main, 1), len(df_to_insert))

    # Perform the sequential insertions on a cheap list of row positions instead of
    # re-concatenating the whole DataFrame once per inserted row.
    order = list(range(n_main))
    for offset, insertion_point in enumerate(insertion_points):
        order.insert(int(insertion_point), n_main + offset)

    # Materialise the final frame once: positions >= n_main refer to the inserted rows.
    combined = pd.concat([main_df, df_to_insert], ignore_index=True)
    return combined.iloc[order].reset_index(drop=True)

In [15]:

# The issue ids below were picked by hand from the 'Duplicated_issues' column of the
# rows displayed above, taking a few from each displayed slice so that the added
# duplicates are spread across the subset. Update them if the subset changes.

duplicates1 = find_rows_by_issue_ids(df, [335186, 334862, 175699, 267261, 200270])
duplicates2 = find_rows_by_issue_ids(df, [197227, 203899, 210910, 364824, 514796, 364797])
duplicates3 = find_rows_by_issue_ids(df, [254967, 265118, 265103, 205129, 215031, 213375])
duplicates4 = find_rows_by_issue_ids(df,[301776, 302946, 269207, 274631])
duplicates5 = find_rows_by_issue_ids(df,[757056, 647655, 174734, 235495, 227241, 172962])

# Seed NumPy's global RNG (which insert_randomly draws its positions from) so that the
# sampled subset -- and the pickle saved from it -- is identical on every run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Insert the duplicates at random positions of df_subset_without_duplicates so that
# their placement looks more natural than appending them at the end.
df_subset = insert_randomly(df_subset_without_duplicates, [duplicates1, duplicates2, duplicates3, duplicates4, duplicates5])

df_subset


Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
0,335190,--,General,Ctrl+C does not copy,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-04-23 18:45:42 -0700,2006-04-23 19:50:22 -0700,Ctrl+C does not copy User-Agent: Mozilla...,"[335186, 334862]",2
1,294616,--,Menus,Open URL in context menu,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-05-18 02:59:14 -0700,2006-01-19 00:20:44 -0800,Open URL in context menu User-Agent: Moz...,"[236336, 227922, 454518]",3
2,787029,--,Untriaged,movement was canceled in web page Navigation...,User Agent: Mozilla/5.0 (Windows NT 6.1; WOW64...,RESOLVED,DUPLICATE,14 Branch,2012-08-30 06:12:03 -0700,2012-08-30 06:30:58 -0700,movement was canceled in web page Navigation...,"[651803, 787021, 787022]",3
3,203901,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:21:59 -0700,2006-11-13 07:16:57 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
4,203898,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:53 -0700,2006-11-13 07:23:00 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,614817,--,Panorama,Windows 7s Aero peek taskbar tabs preview has ...,User-Agent: Mozilla/5.0 (Windows NT 6.1;...,RESOLVED,DUPLICATE,Trunk,2010-11-25 07:22:10 -0800,2010-12-03 15:22:46 -0800,Windows 7s Aero peek taskbar tabs preview has ...,[587440],1
123,266748,--,Keyboard Navigation,ctrl-u no longer clears a line of text; but in...,User-Agent: Mozilla/5.0 (X11; U; Linux i...,VERIFIED,DUPLICATE,unspecified,2004-10-29 12:20:00 -0700,2004-10-29 12:29:47 -0700,ctrl-u no longer clears a line of text; but in...,[260188],1
124,335186,--,General,Copy and paste stopped working for no reason,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-04-23 17:40:21 -0700,2006-09-24 01:20:16 -0700,Copy and paste stopped working for no reason U...,[334862],1
125,282475,--,Bookmarks & History,Bookmark Icons,User-Agent: Mozilla/5.0 (Macintosh; U; P...,RESOLVED,DUPLICATE,unspecified,2005-02-16 09:24:52 -0800,2006-08-27 05:38:29 -0700,Bookmark Icons User-Agent: Mozilla/5.0 (...,[219846],1


Save the sampled data set

In [16]:
# Persist the sampled subset (original rows plus inserted duplicates) in the data folder.
subset_pickle = f"../data/{dataset_name}_subset.pkl"
df_subset.to_pickle(subset_pickle)

#### Some data inconsistencies found

This will cause false positives: the content is the same, but the ids are different and the issues are not listed in each other's duplicated issues.

In [17]:
test = find_rows_by_issue_ids(df, [203898, 203897, 203896,203893])
test

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
1898,203898,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:53 -0700,2006-11-13 07:23:00 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1897,203897,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:52 -0700,2006-11-13 07:22:55 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1896,203896,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:52 -0700,2006-11-13 07:22:49 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1895,203893,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:48 -0700,2006-11-13 07:22:37 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3


Some of the data had empty descriptions.

In [18]:
test1 = find_row_by_issue_id(df,809129)
test1

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
106302,809129,--,Toolbars and Customization,add on bar is not working,,RESOLVED,DUPLICATE,unspecified,2012-11-06 10:59:21 -0800,2012-11-06 14:45:32 -0800,,"[804537, 813763, 809127]",3


Checking if its duplicates have empty content as well

In [19]:
test2 = find_rows_by_issue_ids(df, [804537, 813763, 809127]	)
test2

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
106301,809127,--,Untriaged,add on bar is not working,User Agent: Mozilla/5.0 (Macintosh; Intel Mac ...,RESOLVED,DUPLICATE,unspecified,2012-11-06 10:58:21 -0800,2012-11-21 07:53:56 -0800,add on bar is not working User Agent: Mozilla/...,"[804537, 813763]",2
105911,804537,--,Extension Compatibility,Tab Mix Plus: Cant open Add-ons Manager,User Agent: Mozilla/5.0 (Windows NT 6.2; WOW64...,RESOLVED,DUPLICATE,17 Branch,2012-10-23 03:26:36 -0700,2012-11-22 01:01:47 -0800,Tab Mix Plus: Cant open Add-ons Manager User A...,[813763],1
106622,813763,--,Extension Compatibility,Menu items like Addons Manager not working in ...,We are getting lots of SUMO reports of menu it...,RESOLVED,FIXED,17 Branch,2012-11-20 14:58:24 -0800,2012-12-07 09:11:50 -0800,Menu items like Addons Manager not working in ...,[],0


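For this reason, the title is also passed in together with the content when generating embeddings, so that an issue with empty content still gets an embedding based on its title.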

#### Specify the Model

In [20]:
# Name of the sentence-transformers checkpoint. It is also embedded in the output
# filename in the save cell below, so the "/" in it ends up as a path separator there.
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# Re-creates `model` (the same checkpoint as in the "Define Model" cell) on the chosen device.
model = SentenceTransformer(model_name, device=device)

### Create Embeddings

#### Create the Embeddings as New Dataframe

In [21]:
import math

# `issue_id` is accepted only so the issue being embedded can be identified while debugging.
def generate_embeddings(content, model, issue_id):
    """Encode ``content`` with ``model`` and return the embedding as a NumPy array.

    :param content: Text passed to ``model.encode``.
    :param model: Object exposing ``encode(content, convert_to_tensor=True)`` that returns a tensor.
    :param issue_id: Not used by the computation; kept for debugging purposes.
    :return: The encoded tensor moved to the CPU and converted with ``.numpy()``.
    """
    encoded = model.encode(content, convert_to_tensor=True)
    on_host = encoded.cpu()
    return on_host.numpy()

In [22]:
# Text to embed for each issue: the title followed by the content, or the title alone
# when the content is missing.
# NOTE(review): the displayed 'Content' values already appear to start with the title,
# so the title likely occurs twice in most inputs -- confirm this is intended.
texts = [
    f"{title} {content}" if pd.notna(content) else title
    for title, content in zip(df_subset['Title'], df_subset['Content'])
]

# Encode every text in a single call so the model can batch the work, instead of
# issuing one model call per row through DataFrame.apply. Passing a list yields one
# embedding row per text. Values may differ from per-row encoding by floating-point
# rounding only -- TODO confirm if exact reproducibility against old pickles matters.
embedding_matrix = generate_embeddings(texts, model=model, issue_id=None)

# Keep df_subset's index so the three columns stay aligned row by row.
embeddings_df = pd.DataFrame(index=df_subset.index)
embeddings_df['Embedding'] = pd.Series(list(embedding_matrix), index=df_subset.index)
embeddings_df['Issue_id'] = df_subset['Issue_id']
embeddings_df['Duplicated_issues'] = df_subset['Duplicated_issues']



In [23]:
def typecast_df(df):
    """Normalise the id columns of ``df`` in place and return the same DataFrame.

    Every entry of 'Duplicated_issues' is replaced by a list of Python ints, and
    'Issue_id' is converted to the nullable 'Int64' dtype.
    """
    def _as_int_list(issue_ids):
        return list(map(int, issue_ids))

    df['Duplicated_issues'] = df['Duplicated_issues'].apply(_as_int_list)
    df["Issue_id"] = df["Issue_id"].astype('Int64')
    return df
embeddings_df = typecast_df(embeddings_df)


embeddings_df.head()

Unnamed: 0,Embedding,Issue_id,Duplicated_issues
0,"[0.012851728, -0.33054906, -0.006574236, -0.07...",335190,"[335186, 334862]"
1,"[-0.20026441, -0.37414283, -0.072336294, -0.23...",294616,"[236336, 227922, 454518]"
2,"[0.15873507, -0.33249518, -0.105194904, -0.123...",787029,"[651803, 787021, 787022]"
3,"[-0.24302617, -0.090266466, -0.040903233, 0.29...",203901,"[197227, 203899, 210910]"
4,"[-0.23883097, -0.09577434, -0.041675787, 0.298...",203898,"[197227, 203899, 210910]"


In [24]:
# Target pickle: ../data/<dataset_name>_embeddings_<model_name>.pkl
filename = f"../data/{dataset_name}_embeddings_{model_name}.pkl"
directory = os.path.dirname(filename)

# Make sure the parent directory exists before writing; model_name may itself
# contain a "/" and therefore introduce an extra directory level.
os.makedirs(directory, exist_ok=True)
embeddings_df.to_pickle(filename)

In [25]:
embeddings_df.head()

Unnamed: 0,Embedding,Issue_id,Duplicated_issues
0,"[0.012851728, -0.33054906, -0.006574236, -0.07...",335190,"[335186, 334862]"
1,"[-0.20026441, -0.37414283, -0.072336294, -0.23...",294616,"[236336, 227922, 454518]"
2,"[0.15873507, -0.33249518, -0.105194904, -0.123...",787029,"[651803, 787021, 787022]"
3,"[-0.24302617, -0.090266466, -0.040903233, 0.29...",203901,"[197227, 203899, 210910]"
4,"[-0.23883097, -0.09577434, -0.041675787, 0.298...",203898,"[197227, 203899, 210910]"
