In [2]:
from sentence_transformers import SentenceTransformer, util
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from scipy import spatial
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.datasets import fetch_20newsgroups


## Computing Matrix Multiplication of two embedding matricies

In [3]:
#Define two large embedding matricies

matrix_1 = torch.rand(50, 768)
matrix_2 = torch.rand(45, 768)

In [4]:
matrix_1 = matrix_1.cpu().detach().numpy()     
matrix_2 = matrix_2.cpu().detach().numpy()     
cosine_sim = cosine_similarity(matrix_1, matrix_2)
cosine_sim.shape

(50, 45)

## Running on Test Data

In [5]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

newsgroups_test = fetch_20newsgroups(subset='test',
                                 categories=cats)

In [7]:
len(newsgroups_train.data)

1073

In [8]:
#Compute embedding for both lists
train= model.encode(newsgroups_train.data, convert_to_tensor=True)
test = model.encode(newsgroups_test.data, convert_to_tensor=True)

In [9]:
matrix_1 = train.cpu().detach().numpy()     
matrix_2 = test.cpu().detach().numpy()     
cosine_sim = cosine_similarity(matrix_1, matrix_2)
cosine_sim.shape

(1073, 713)

## Finding the Average Cosine Similarity Across First Axis?

In [10]:
# we'd want to find the sum of the top-n for each row
cosine_sim[:50].shape

(50, 713)

In [11]:
# how do we find the sum of only the first "b" of each row of array
avg_matrix = np.sum(cosine_sim[:50], axis = 0)
# sort in place by descending - https://stackoverflow.com/questions/26984414/efficiently-sorting-a-numpy-array-in-descending-order 
avg_matrix[::-1].sort()

avg_matrix.shape

(713,)

In [12]:
avg_matrix[0]

11.210295

## Finding Wood Score across dummy inputs

In [13]:
"""
What we're worried about now is that we need to include p in the sample weights for accuracy... that either means we factor it
into the avg matrix which would be preferrable or to calculate accuracy by hand which I dont like as much...

"""

"\nWhat we're worried about now is that we need to include p in the sample weights for accuracy... that either means we factor it\ninto the avg matrix which would be preferrable or to calculate accuracy by hand which I dont like as much...\n\n"

In [14]:
newsgroups_test.target.shape

(713,)

In [15]:
pred = np.zeros((713,))

In [16]:
accuracy_score(newsgroups_test.target, pred)

0.4474053295932679

In [17]:
p_matrix = 15 / avg_matrix
accuracies = []
for count, value in enumerate(newsgroups_test.target):
    accuracy = accuracy_score([value], [int(pred[count])])
    p = p_matrix[count]
    accuracies.append(accuracy*p)
    
result = sum(accuracies) / len(accuracies)
print(f'Wood Score v1 = {result}!')

Wood Score v1 = 1.152342695842117!


## What is a good value for "A"?

In [18]:
"""
The main equation that we have to worry about here is - a / sum(max_b of similarity(train/test))


Cosine similarity takes on values between 0 and 1 meaning that the denomenator will take on a value somewhere between (0, b)

For stability we may want to add a max(sum(max_b), np.eps) to avoid division by 0?
"""

'\nThe main equation that we have to worry about here is - a / sum(max_b of similarity(train/test))\n\n\nCosine similarity takes on values between 0 and 1 meaning that the denomenator will take on a value somewhere between (0, b)\n\nFor stability we may want to add a max(sum(max_b), np.eps) to avoid division by 0?\n'

## How to use sentence transformers for STS???

In [19]:
from sentence_transformers import SentenceTransformer, util
sentences = ["I'm happy", "I'm full of happiness"]

model = SentenceTransformer('all-MiniLM-L6-v2')

#Compute embedding for both lists
embedding_1= model.encode(sentences[0], convert_to_tensor=True)
embedding_2 = model.encode(sentences[1], convert_to_tensor=True)

test = util.pytorch_cos_sim(embedding_1, embedding_2)

In [20]:
a_norm = embedding_1 / embedding_1.norm()
b_norm = embedding_2 / embedding_2.norm()
res = a_norm @ b_norm.T
res

  res = a_norm @ b_norm.T


tensor(0.6003, device='cuda:0')

In [21]:
embedding_1@embedding_2.T

tensor(0.6003, device='cuda:0')

In [22]:
test.item()

0.6002568602561951

In [23]:
embeddings = model.encode(sentences, convert_to_tensor = True)

In [24]:
embeddings.shape

torch.Size([2, 384])

In [25]:
#embeddings / embeddings.norm(dim=0)

In [26]:
a = torch.randn(4, 3)
a

tensor([[-0.6676, -2.6176,  0.5403],
        [-0.4915, -0.7438,  0.5187],
        [-1.9908,  1.2999, -0.4224],
        [ 1.1458, -0.5736,  1.1171]])

In [27]:
a = torch.randn(2, 2)
b = torch.randn(3, 2) # different row number, for the fun

# Given that cos_sim(u, v) = dot(u, v) / (norm(u) * norm(v))
#                          = dot(u / norm(u), v / norm(v))
# We fist normalize the rows, before computing their dot products via transposition:
a_norm = a / a.norm(dim=1)[:, None]
b_norm = b / b.norm(dim=1)[:, None]
res = torch.mm(a_norm, b_norm.transpose(0,1))
print(res)
#  0.9978 -0.9986 -0.9985
# -0.8629  0.9172  0.9172

# -------
# Let's verify with numpy/scipy if our computations are correct:
a_n = a.numpy()
b_n = b.numpy()
res_n = np.zeros((2, 3))
for i in range(2):
    for j in range(3):
        # cos_sim(u, v) = 1 - cos_dist(u, v)
        res_n[i, j] = 1 - spatial.distance.cosine(a_n[i], b_n[j])
print(res_n)
# [[ 0.9978022  -0.99855876 -0.99854881]
#  [-0.86285472  0.91716063  0.9172349 ]]

tensor([[-0.9398,  0.1911, -0.3412],
        [ 0.9986, -0.4693,  0.0519]])
[[-0.93977839  0.1910819  -0.34116521]
 [ 0.99861783 -0.46929857  0.05190084]]


In [28]:
torch.mean(a, 1)[0].item()

-0.6531582474708557

In [29]:
sentences1 = ["I'm happy", "I'm full of happiness"]

sentences2 = ["I'm sad", "I'm full of"]

embedding_1= model.encode(sentences1, convert_to_tensor=True)
embedding_2 = model.encode(sentences2, convert_to_tensor=True)

In [30]:
a_norm = embedding_1 / embedding_1.norm(dim = 0)
b_norm = embedding_2 / embedding_2.norm(dim = 0)
res = a_norm @ b_norm.T
res

tensor([[inf, inf],
        [inf, inf]], device='cuda:0')

In [31]:
output = F.cosine_similarity(embedding_1, embedding_2, dim = 1)
print(output)

tensor([0.4390, 0.5458], device='cuda:0')


In [32]:
a = torch.tensor([[1,2],[3,4]])
b = torch.tensor([[1,2], [4,5], [7,8]])

In [33]:
a@b.T

tensor([[ 5, 14, 23],
        [11, 32, 53]])