# Embeddings
This notebook loads the specified dataset (`mozilla_firefox` by default), samples a subset of it, and creates a new dataframe containing the document-level embeddings and their issue ids. The result is saved as a pickle file, named after the specified model, under the data folder.

In [1]:
import pandas as pd
import numpy as np
import sys
import os
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


### Define Model

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the sentence-transformers checkpoint onto the selected device.
model = SentenceTransformer(
    "sentence-transformers/multi-qa-mpnet-base-dot-v1",
    device=device,
)

# Echo the selected device as the cell output.
device

'cuda'

### Import Data
- Specify the dataset
- Create "Content" column by concatenating Title and Description for document-level embeddings
- Typecast Duplicated_issue entries to integer

In [3]:
# Base name of the dataset; the input and output pickle paths under ../data/ are built from it.
dataset_name = 'mozilla_firefox'

In [4]:
# Load the dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle('../data/' + dataset_name + '.pkl')

In [5]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
33591,335190,--,General,Ctrl+C does not copy,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-04-23 18:45:42 -0700,2006-04-23 19:50:22 -0700,Ctrl+C does not copy User-Agent: Mozilla...,"[335186, 334862]",2
21703,294616,--,Menus,Open URL in context menu,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-05-18 02:59:14 -0700,2006-01-19 00:20:44 -0800,Open URL in context menu User-Agent: Moz...,"[236336, 227922, 454518]",3
104453,787029,--,Untriaged,movement was canceled in web page Navigation...,User Agent: Mozilla/5.0 (Windows NT 6.1; WOW64...,RESOLVED,DUPLICATE,14 Branch,2012-08-30 06:12:03 -0700,2012-08-30 06:30:58 -0700,movement was canceled in web page Navigation...,"[651803, 787021, 787022]",3
1900,203901,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:21:59 -0700,2006-11-13 07:16:57 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1898,203898,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:53 -0700,2006-11-13 07:23:00 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3


### Data Sampling
Because compute power is limited, we work with a subset of the data.

In [6]:
# Function to find the row with a specific issue_id
# we are finding the duplicates like this
def find_row_by_issue_id(df, issue_id):
    """Return the rows of df whose 'Issue_id' value equals issue_id."""
    matches_id = df['Issue_id'] == issue_id
    return df.loc[matches_id]

def find_rows_by_issue_ids(dataframe, issue_ids):
    """
    Select every row whose 'Issue_id' is one of the requested ids.

    Rows come back in the order they appear in dataframe, not in the order of issue_ids;
    ids that are not present are ignored.

    :param dataframe: pandas DataFrame containing an 'Issue_id' column
    :param issue_ids: List of issue IDs to find
    :return: DataFrame with only the rows that have a matching Issue_id
    """
    id_mask = dataframe['Issue_id'].isin(issue_ids)
    return dataframe[id_mask]


In [7]:
# Use a small subset of the data (the first 100 rows) to keep the compute cost manageable.
# To use the entire dataset, assign `df` itself below instead of `df.head(100)`;
# do not comment the line out, because later cells reference df_subset_without_duplicates.


df_subset_without_duplicates = df.head(100)

df_subset_without_duplicates.head()

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
33591,335190,--,General,Ctrl+C does not copy,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-04-23 18:45:42 -0700,2006-04-23 19:50:22 -0700,Ctrl+C does not copy User-Agent: Mozilla...,"[335186, 334862]",2
21703,294616,--,Menus,Open URL in context menu,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-05-18 02:59:14 -0700,2006-01-19 00:20:44 -0800,Open URL in context menu User-Agent: Moz...,"[236336, 227922, 454518]",3
104453,787029,--,Untriaged,movement was canceled in web page Navigation...,User Agent: Mozilla/5.0 (Windows NT 6.1; WOW64...,RESOLVED,DUPLICATE,14 Branch,2012-08-30 06:12:03 -0700,2012-08-30 06:30:58 -0700,movement was canceled in web page Navigation...,"[651803, 787021, 787022]",3
1900,203901,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:21:59 -0700,2006-11-13 07:16:57 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1898,203898,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:53 -0700,2006-11-13 07:23:00 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3


### Adding Duplicates

- We want to keep approximately the same ratio of duplicates as in the original data (21%).
- So let's add around 25 duplicates to our 100-row dataframe.
- We also pull the duplicates from different sections of the dataframe so that they are more spread out.
 

In [14]:

def insert_randomly(main_df, insert_dfs, random_state=None):
    """
    Inserts the rows from the list of DataFrames (insert_dfs) into the main DataFrame (main_df) at random positions.

    The rows of main_df keep their relative order. Neither main_df nor the frames in
    insert_dfs are modified.

    :param main_df: The main DataFrame where the other DataFrames are inserted.
    :param insert_dfs: A list of DataFrames to insert into main_df.
    :param random_state: Optional seed (or numpy.random.Generator) used to draw the insertion
        positions reproducibly. When None (the default), positions are drawn from NumPy's
        global random state, so a prior np.random.seed(...) call still applies.
    :return: A new DataFrame, indexed 0..n-1, with the inserted rows at random positions.
    """
    frames_to_insert = list(insert_dfs)
    if not frames_to_insert:
        # pd.concat raises on an empty list; there is nothing to insert.
        return main_df.reset_index(drop=True)

    # Concatenate all the DataFrames to insert into one for easier manipulation
    df_to_insert = pd.concat(frames_to_insert).reset_index(drop=True)
    n_main = len(main_df)
    n_insert = len(df_to_insert)

    if n_insert == 0:
        return main_df.reset_index(drop=True)
    if n_main == 0:
        # No existing rows to insert between (drawing from the empty range [0, 0) would raise),
        # so the result is simply the rows to insert, in their given order.
        return pd.concat([main_df, df_to_insert], ignore_index=True)

    # Draw one insertion point per row; each lies in [0, n_main).
    if random_state is None:
        insertion_points = np.random.randint(0, n_main, n_insert)
    else:
        rng = np.random.default_rng(random_state)
        insertion_points = rng.integers(0, n_main, size=n_insert)

    # Work out the final row order on a plain list of positions, then reorder the combined
    # frame once. This avoids rebuilding the whole DataFrame for every inserted row.
    combined = pd.concat([main_df, df_to_insert], ignore_index=True)
    row_order = list(range(n_main))
    for offset, insertion_point in enumerate(insertion_points):
        # Rows of df_to_insert sit after the main rows in `combined`.
        row_order.insert(int(insertion_point), n_main + offset)

    return combined.iloc[row_order].reset_index(drop=True)

In [15]:

# The issue ids below were picked by hand from the Duplicated_issues lists of issues in
# different parts of the dataset (e.g. 335186 and 334862 are listed as duplicates of 335190).
# If the subset changes, pick a new set of ids the same way, spread evenly across the data.
duplicates1 = find_rows_by_issue_ids(df, [335186, 334862, 175699, 267261, 200270])
duplicates2 = find_rows_by_issue_ids(df, [197227, 203899, 210910, 364824, 514796, 364797])
duplicates3 = find_rows_by_issue_ids(df, [254967, 265118, 265103, 205129, 215031, 213375])
duplicates4 = find_rows_by_issue_ids(df, [301776, 302946, 269207, 274631])
duplicates5 = find_rows_by_issue_ids(df, [757056, 647655, 174734, 235495, 227241, 172962])

# Fix the seed of NumPy's global random state so the insertion positions, and therefore the
# subset that gets saved, are the same on every run. The value 42 is arbitrary.
np.random.seed(42)

# Insert the duplicates into df_subset_without_duplicates at random positions so that their
# order looks more natural.
df_subset = insert_randomly(
    df_subset_without_duplicates,
    [duplicates1, duplicates2, duplicates3, duplicates4, duplicates5],
)

df_subset


Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
0,335190,--,General,Ctrl+C does not copy,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-04-23 18:45:42 -0700,2006-04-23 19:50:22 -0700,Ctrl+C does not copy User-Agent: Mozilla...,"[335186, 334862]",2
1,294616,--,Menus,Open URL in context menu,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2005-05-18 02:59:14 -0700,2006-01-19 00:20:44 -0800,Open URL in context menu User-Agent: Moz...,"[236336, 227922, 454518]",3
2,787029,--,Untriaged,movement was canceled in web page Navigation...,User Agent: Mozilla/5.0 (Windows NT 6.1; WOW64...,RESOLVED,DUPLICATE,14 Branch,2012-08-30 06:12:03 -0700,2012-08-30 06:30:58 -0700,movement was canceled in web page Navigation...,"[651803, 787021, 787022]",3
3,203901,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:21:59 -0700,2006-11-13 07:16:57 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
4,203898,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:53 -0700,2006-11-13 07:23:00 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,614817,--,Panorama,Windows 7s Aero peek taskbar tabs preview has ...,User-Agent: Mozilla/5.0 (Windows NT 6.1;...,RESOLVED,DUPLICATE,Trunk,2010-11-25 07:22:10 -0800,2010-12-03 15:22:46 -0800,Windows 7s Aero peek taskbar tabs preview has ...,[587440],1
123,266748,--,Keyboard Navigation,ctrl-u no longer clears a line of text; but in...,User-Agent: Mozilla/5.0 (X11; U; Linux i...,VERIFIED,DUPLICATE,unspecified,2004-10-29 12:20:00 -0700,2004-10-29 12:29:47 -0700,ctrl-u no longer clears a line of text; but in...,[260188],1
124,335186,--,General,Copy and paste stopped working for no reason,User-Agent: Mozilla/5.0 (Windows; U; Win...,RESOLVED,DUPLICATE,unspecified,2006-04-23 17:40:21 -0700,2006-09-24 01:20:16 -0700,Copy and paste stopped working for no reason U...,[334862],1
125,282475,--,Bookmarks & History,Bookmark Icons,User-Agent: Mozilla/5.0 (Macintosh; U; P...,RESOLVED,DUPLICATE,unspecified,2005-02-16 09:24:52 -0800,2006-08-27 05:38:29 -0700,Bookmark Icons User-Agent: Mozilla/5.0 (...,[219846],1


Save the sampled data set

In [16]:
# Persist the sampled subset (including the inserted duplicates) as a pickle file under ../data/.
df_subset.to_pickle('../data/' + dataset_name + '_subset.pkl')

#### Some data inconsistencies found

This will cause false positives: the content is the same, but the ids are different and the issues are not part of each other's duplicated issues.

In [17]:
# Look up four issues that share the same title and the same Duplicated_issues list but do not
# reference one another as duplicates.
test = find_rows_by_issue_ids(df, [203898, 203897, 203896,203893])
test

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
1898,203898,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:53 -0700,2006-11-13 07:23:00 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1897,203897,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:52 -0700,2006-11-13 07:22:55 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1896,203896,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:52 -0700,2006-11-13 07:22:49 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3
1895,203893,--,Toolbars and Customization,Folders in Bookmarks Toolbar stay Expanded if ...,User-Agent: Mozilla/5.0 (Windows; U; Win...,VERIFIED,DUPLICATE,unspecified,2003-04-30 03:10:48 -0700,2006-11-13 07:22:37 -0800,Folders in Bookmarks Toolbar stay Expanded if ...,"[197227, 203899, 210910]",3


Some of the data has empty descriptions.

In [18]:
# Example of an issue whose Description and Content are empty while its Title is populated.
test1 = find_row_by_issue_id(df,809129)
test1

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
106302,809129,--,Toolbars and Customization,add on bar is not working,,RESOLVED,DUPLICATE,unspecified,2012-11-06 10:59:21 -0800,2012-11-06 14:45:32 -0800,,"[804537, 813763, 809127]",3


Checking if its duplicates have empty content as well

In [19]:
# Fetch the issues that 809129 lists as its duplicates to see whether they lack content too.
duplicate_ids_of_809129 = [804537, 813763, 809127]
test2 = find_rows_by_issue_ids(df, duplicate_ids_of_809129)
test2

Unnamed: 0,Issue_id,Priority,Component,Title,Description,Status,Resolution,Version,Created_time,Resolved_time,Content,Duplicated_issues,Duplicates_count
106301,809127,--,Untriaged,add on bar is not working,User Agent: Mozilla/5.0 (Macintosh; Intel Mac ...,RESOLVED,DUPLICATE,unspecified,2012-11-06 10:58:21 -0800,2012-11-21 07:53:56 -0800,add on bar is not working User Agent: Mozilla/...,"[804537, 813763]",2
105911,804537,--,Extension Compatibility,Tab Mix Plus: Cant open Add-ons Manager,User Agent: Mozilla/5.0 (Windows NT 6.2; WOW64...,RESOLVED,DUPLICATE,17 Branch,2012-10-23 03:26:36 -0700,2012-11-22 01:01:47 -0800,Tab Mix Plus: Cant open Add-ons Manager User A...,[813763],1
106622,813763,--,Extension Compatibility,Menu items like Addons Manager not working in ...,We are getting lots of SUMO reports of menu it...,RESOLVED,FIXED,17 Branch,2012-11-20 14:58:24 -0800,2012-12-07 09:11:50 -0800,Menu items like Addons Manager not working in ...,[],0


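Because of this, the title is passed in together with the content when generating the embeddings, so that issues with empty content still produce a meaningful embedding.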

#### Specify the Model

In [20]:
# Identifier of the sentence-transformers model to use; it is also embedded in the output
# file path when the embeddings are saved.
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# Instantiate the encoder on the device selected earlier in the notebook.
# NOTE(review): the "Define Model" cell already loads this same checkpoint into `model`;
# this line loads it again.
model = SentenceTransformer(model_name, device=device)

### Create Embeddings

#### Create the Embeddings as New Dataframe

In [21]:
import math

# added id issue for debugging purposes 
def generate_embeddings(content, model, issue_id):
    """
    Encode a piece of text with the given model and return the embedding as a NumPy array.

    :param content: Text to embed.
    :param model: Encoder exposing encode(text, convert_to_tensor=True).
    :param issue_id: Id of the issue being embedded; accepted for debugging and not used here.
    :return: The embedding, moved to the CPU and converted with .numpy().
    """
    encoded = model.encode(content, convert_to_tensor=True)
    # The tensor may live on the GPU; bring it to host memory before converting.
    host_tensor = encoded.cpu()
    return host_tensor.numpy()

In [22]:
def _embedding_for_row(row):
    """Embed one issue row: title plus content, or the title alone when Content is missing."""
    # NOTE(review): in the previews above, Content already appears to begin with the Title,
    # so the title may be embedded twice for rows with content. Confirm this is intended.
    if pd.notna(row['Content']):
        text = f"{row['Title']} {row['Content']}"
    else:
        text = row['Title']
    return generate_embeddings(text, model=model, issue_id=row['Issue_id'])


embeddings_df = pd.DataFrame()

# One embedding per row of the sampled subset.
embeddings_df['Embedding'] = df_subset.apply(_embedding_for_row, axis=1)

# Carry over the id columns alongside each embedding.
for column in ('Issue_id', 'Duplicated_issues'):
    embeddings_df[column] = df_subset[column]



In [23]:
def typecast_df(df):
    """
    Normalise the id columns of df in place and return the same DataFrame.

    'Duplicated_issues' entries become lists of Python ints, and 'Issue_id' is cast to the
    nullable 'Int64' dtype.
    """
    def _as_int_list(issue_ids):
        return [int(issue) for issue in issue_ids]

    duplicated_as_ints = df['Duplicated_issues'].apply(_as_int_list)
    df['Duplicated_issues'] = duplicated_as_ints
    df["Issue_id"] = df["Issue_id"].astype('Int64')
    return df
# Apply the casts; typecast_df modifies the frame it is given and returns that same object.
embeddings_df = typecast_df(embeddings_df)


embeddings_df.head()

Unnamed: 0,Embedding,Issue_id,Duplicated_issues
0,"[0.012851728, -0.33054906, -0.006574236, -0.07...",335190,"[335186, 334862]"
1,"[-0.20026441, -0.37414283, -0.072336294, -0.23...",294616,"[236336, 227922, 454518]"
2,"[0.15873507, -0.33249518, -0.105194904, -0.123...",787029,"[651803, 787021, 787022]"
3,"[-0.24302617, -0.090266466, -0.040903233, 0.29...",203901,"[197227, 203899, 210910]"
4,"[-0.23883097, -0.09577434, -0.041675787, 0.298...",203898,"[197227, 203899, 210910]"


In [24]:
# Build the output path. With the model_name set above, which contains a "/", the text before
# the slash becomes part of a sub-directory name under ../data/ and the text after it becomes
# the file name.
filename = '../data/' + dataset_name + '_embeddings_' + model_name + '.pkl'
directory = os.path.dirname(filename)

os.makedirs(directory, exist_ok=True) # Create the directory if it doesn't exist
embeddings_df.to_pickle(filename)

In [25]:
# Display the first rows of the saved embeddings frame as a final check.
embeddings_df.head()

Unnamed: 0,Embedding,Issue_id,Duplicated_issues
0,"[0.012851728, -0.33054906, -0.006574236, -0.07...",335190,"[335186, 334862]"
1,"[-0.20026441, -0.37414283, -0.072336294, -0.23...",294616,"[236336, 227922, 454518]"
2,"[0.15873507, -0.33249518, -0.105194904, -0.123...",787029,"[651803, 787021, 787022]"
3,"[-0.24302617, -0.090266466, -0.040903233, 0.29...",203901,"[197227, 203899, 210910]"
4,"[-0.23883097, -0.09577434, -0.041675787, 0.298...",203898,"[197227, 203899, 210910]"
