# Create embeddings and perform similarity analysis

My first attempt at using BERT models and Hugging Face transformers.

In [None]:
# basics
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

# bert transformers, huggingface, etc
from sentence_transformers import SentenceTransformer, util

# we are using some less optimal code, suppress the warnings for now
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

## Similarity between two texts

We compute the cosine similarity between embeddings, for example, to measure the semantic similarity of two texts. For this, we pick a smaller subset of the original data, i.e. all GitHub issues that already carry the label `area/devmode`. Why? Issues with the same label were manually classified as belonging together. Let's see if our algorithm reflects this too.

### One way to look at the data

In [None]:

model = SentenceTransformer('all-MiniLM-L6-v2')

print("Loading issues from file...")
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project's own preparation step.
with open("../data/labels.pkl", 'rb') as labels_file:
    unique_labels = pickle.load(labels_file)
with open("../data/issues_prepared.pkl", 'rb') as issues_file:
    df = pickle.load(issues_file)
same_label_df = df[df['area/devmode'] == 1]

# Create two equally sized halves: when the row count is odd, leave out the
# last row. Positional slicing builds a new frame instead of mutating the
# filtered selection in place.
if len(same_label_df) % 2:
    same_label_df = same_label_df.iloc[:-1]
half = len(same_label_df) // 2
titles = same_label_df['title']
df1, df2 = titles.iloc[:half], titles.iloc[half:]

# Two lists of sentences
sentences1 = df1.to_numpy()
sentences2 = df2.to_numpy()

# Compute embeddings for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine similarities: entry [i][j] compares sentences1[i] with
# sentences2[j]
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Histogram of all scores. torch.histc ignores values outside
# [score_min, score_max], so negative scores and scores above 0.99 are not
# counted.
bins = 100
score_min, score_max = 0, 0.99
hist = torch.histc(cosine_scores, bins=bins, min=score_min, max=score_max)

# Visualize the histogram as a bar diagram. Bars are placed at the centre of
# each score bin so the x axis really shows scores rather than bin indices.
bin_width = (score_max - score_min) / bins
x = [score_min + (k + 0.5) * bin_width for k in range(bins)]
# .tolist() copies the counts to plain Python floats, which also works when
# the tensor lives on a GPU.
plt.bar(x, hist.tolist(), width=bin_width, align='center')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.show()


Our previous comparison shows very low similarity scores for the majority of issues.

## Another way to look at the data

Let's try to find the pairs with the highest degree of similarity, to see if we can learn something from them.

In [None]:
# Single list of sentences (first half followed by second half)
sentences = [*sentences1, *sentences2]

# Compute embeddings
embeddings = model.encode(sentences, convert_to_tensor=True)

# Compute cosine similarities for each sentence with each other sentence
cosine_scores = util.cos_sim(embeddings, embeddings)

# Copy the whole score matrix to nested Python lists once. Indexing the
# tensor element by element would create one tiny tensor per pair and make
# the sort below compare tensors instead of plain floats.
score_rows = cosine_scores.tolist()

# Collect every unordered pair exactly once (i < j), skipping the
# self-comparisons on the diagonal.
pairs = []
for i, row in enumerate(score_rows):
    for j in range(i + 1, len(row)):
        pairs.append({'index': [i, j], 'score': row[j]})

# Sort scores in decreasing order (stable, so ties keep their (i, j) order)
pairs.sort(key=lambda pair: pair['score'], reverse=True)

# Show the ten most similar pairs
for pair in pairs[:10]:
    i, j = pair['index']
    print(f"{sentences[i]} \t\t {sentences[j]} \t\t Score: {pair['score']:.4f}")
