In [15]:
import pandas as pd
import numpy as np
import sys
import os
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers.util import cos_sim
import math

In [10]:
# Location of the dataset pickles, relative to the notebook's working directory.
_DATA_DIR = '../data/'
_DATASET_NAME = 'mozilla_firefox'

# Number of leading rows of the full dataset that form the base of the subset.
_SUBSET_SIZE = 10000

# Groups of issue ids that are looked up in the full dataset and mixed into the
# base subset at random positions. The original notebook named these
# `duplicates1` .. `duplicates5`; presumably each group is a set of mutually
# duplicated reports -- TODO confirm against the dataset.
# 587440 is listed twice in the last group; `isin` ignores the repeat.
_DUPLICATE_ID_GROUPS = (
    [335186, 334862],
    [254967, 265118, 265103],
    [324801, 205129, 215031],
    [227241, 172962],
    [587440, 407981, 675541, 757056, 647655, 295372, 280509, 413211, 409895,
     301776, 310261, 328159, 449385, 449221, 587440, 269207, 274631],
)


def _find_rows_by_issue_ids(dataframe, issue_ids):
    """Return the rows of `dataframe` whose 'Issue_id' is in `issue_ids`."""
    return dataframe[dataframe['Issue_id'].isin(issue_ids)]


def _insert_randomly(main_df, insert_dfs, rng):
    """Insert the rows of `insert_dfs` into `main_df` at random positions.

    Rows are inserted one after another; each insertion position is drawn
    uniformly from ``[0, len(main_df))`` (the length *before* any insertion)
    and applied to the already-grown sequence.

    The frame is assembled with a single concat and a positional reorder
    instead of re-concatenating the whole frame once per inserted row.

    :param main_df: DataFrame that receives the rows.
    :param insert_dfs: List of DataFrames whose rows are inserted.
    :param rng: ``numpy.random.Generator`` used to draw the positions.
    :return: New DataFrame with a fresh 0..n-1 index.
    """
    # Skip empty frames so concat neither fails nor has to guess dtypes.
    frames = [frame for frame in insert_dfs if not frame.empty]
    if not frames:
        return main_df.reset_index(drop=True)

    to_insert = pd.concat(frames, ignore_index=True)
    if main_df.empty:
        # No valid insertion position exists; the result is just the new rows.
        return to_insert

    n_main = len(main_df)
    insertion_points = rng.integers(0, n_main, size=len(to_insert))

    # `order` holds positions into the concatenated frame below: base rows
    # first (0..n_main-1), rows to insert after them (n_main..).
    order = list(range(n_main))
    for offset, point in enumerate(insertion_points):
        order.insert(int(point), n_main + offset)

    combined = pd.concat([main_df, to_insert], ignore_index=True)
    return combined.iloc[order].reset_index(drop=True)


def _build_subset(df, rng):
    """Take the first `_SUBSET_SIZE` rows of `df` and mix in the id groups."""
    base = df.head(_SUBSET_SIZE)
    # NOTE(review): a listed id that already sits in the first `_SUBSET_SIZE`
    # rows ends up in the subset twice -- confirm this cannot happen for the
    # real dataset.
    extra = [_find_rows_by_issue_ids(df, ids) for ids in _DUPLICATE_ID_GROUPS]
    return _insert_randomly(base, extra, rng)


def _compose_text(title, content):
    """Return "<title> <content>", or only the title when content is missing."""
    return f"{title} {content}" if pd.notna(content) else title


def generate_embeddings(model_name, seed=42):
    """Embed a subset of the Mozilla Firefox issues and pickle the result.

    Steps:
      1. Read ``../data/mozilla_firefox.pkl``.
      2. Build the subset (first 10000 rows plus the rows of the configured
         issue-id groups at random positions) and write it to
         ``../data/mozilla_firefox_subset.pkl``.
      3. Encode "Title Content" (or just the title when the content is
         missing) of every subset row with the given sentence-transformers
         model.
      4. Write a DataFrame with the columns ``Embedding``, ``Issue_id``
         (nullable ``Int64``) and ``Duplicated_issues`` (lists of ``int``) to
         ``../data/mozilla_firefox_embeddings_<model_name>.pkl``.

    :param model_name: Model identifier passed to ``SentenceTransformer``.
        A "/" in the name becomes a directory level in the output path; the
        directory is created when needed.
    :param seed: Seed for the random insertion positions. With a fixed seed
        every call produces the same row order, so the embeddings of
        different models and the saved subset file stay aligned. Pass
        ``None`` for a fresh random order on each call.
    :return: None. Results are written to disk.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(model_name, device=device)

    df = pd.read_pickle(_DATA_DIR + _DATASET_NAME + '.pkl')

    df_subset = _build_subset(df, np.random.default_rng(seed))
    df_subset.to_pickle(_DATA_DIR + _DATASET_NAME + '_subset.pkl')

    # Encode all texts in one call so the model can batch them, instead of
    # one forward pass and one tensor -> numpy round trip per row.
    texts = [
        _compose_text(title, content)
        for title, content in zip(df_subset['Title'], df_subset['Content'])
    ]
    vectors = model.encode(texts, convert_to_numpy=True)

    embeddings_df = pd.DataFrame({
        # One 1-D vector per row, in the same order as `df_subset`.
        'Embedding': pd.Series(list(vectors), index=df_subset.index, dtype=object),
        'Issue_id': df_subset['Issue_id'].astype('Int64'),
        'Duplicated_issues': df_subset['Duplicated_issues'].apply(
            lambda ids: [int(i) for i in ids]
        ),
    })

    filename = _DATA_DIR + _DATASET_NAME + '_embeddings_' + model_name + '.pkl'
    # The model name may contain "/", so the target directory may not exist yet.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    embeddings_df.to_pickle(filename)

In [12]:
# Generate and store the embeddings produced by the multi-qa-mpnet-base-dot-v1 model.
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
generate_embeddings(model_name=model_name)

In [13]:
# Generate and store the embeddings produced by the all-distilroberta-v1 model.
model_name = "sentence-transformers/all-distilroberta-v1"
generate_embeddings(model_name=model_name)

In [14]:
# Generate and store the embeddings produced by the all-MiniLM-L12-v2 model.
model_name = "sentence-transformers/all-MiniLM-L12-v2"
generate_embeddings(model_name=model_name)

In [None]:
# Generate and store the embeddings produced by the mxbai-embed-large-v1 model.
model_name = "mixedbread-ai/mxbai-embed-large-v1"
generate_embeddings(model_name=model_name)

In [None]:
# Generate and store the embeddings produced by the multilingual-e5-large-instruct model.
model_name = "intfloat/multilingual-e5-large-instruct"
generate_embeddings(model_name=model_name)

In [None]:
# Generate and store the embeddings produced by the GIST-large-Embedding-v0 model.
model_name = "avsolatorio/GIST-large-Embedding-v0"
generate_embeddings(model_name=model_name)

In [None]:
# Generate and store the embeddings produced by the ember-v1 model.
model_name = "llmrails/ember-v1"
generate_embeddings(model_name=model_name)