In [1]:
from sentence_transformers import SentenceTransformer, util
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from scipy import spatial
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.datasets import fetch_20newsgroups


  from .autonotebook import tqdm as notebook_tqdm


## Computing Matrix Multiplication of two embedding matricies

In [2]:
#Define two large embedding matricies

matrix_1 = torch.rand(50, 768)
matrix_2 = torch.rand(45, 768)

In [3]:
matrix_1 = matrix_1.cpu().detach().numpy()     
matrix_2 = matrix_2.cpu().detach().numpy()     
cosine_sim = cosine_similarity(matrix_1, matrix_2)
cosine_sim.shape

(50, 45)

## Running on Test Data

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [8]:
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

newsgroups_test = fetch_20newsgroups(subset='test',
                                 categories=cats)

In [9]:
len(newsgroups_train.data)

1073

In [10]:
#Compute embedding for both lists
train= model.encode(newsgroups_train.data, convert_to_tensor=True)
test = model.encode(newsgroups_test.data, convert_to_tensor=True)

In [11]:
matrix_1 = train.cpu().detach().numpy()     
matrix_2 = test.cpu().detach().numpy()     
cosine_sim = cosine_similarity(matrix_1, matrix_2)
cosine_sim.shape

(1073, 713)

## Finding the Average Cosine Similarity Across First Axis?

In [12]:
# we'd want to find the sum of the top-n for each row
cosine_sim[:50].shape

(50, 713)

In [13]:
# how do we find the sum of only the first "b" of each row of array
avg_matrix = np.sum(cosine_sim[:50], axis = 0)
# sort in place by descending - https://stackoverflow.com/questions/26984414/efficiently-sorting-a-numpy-array-in-descending-order 
avg_matrix[::-1].sort()

avg_matrix.shape

(713,)

In [14]:
avg_matrix[0]

11.210295

## Finding Wood Score across dummy inputs

In [15]:
"""
What we're worried about now is that we need to include p in the sample weights for accuracy... that either means we factor it
into the avg matrix which would be preferrable or to calculate accuracy by hand which I dont like as much...

"""

"\nWhat we're worried about now is that we need to include p in the sample weights for accuracy... that either means we factor it\ninto the avg matrix which would be preferrable or to calculate accuracy by hand which I dont like as much...\n\n"

In [16]:
newsgroups_test.target.shape

(713,)

In [17]:
pred = np.zeros((713,))

In [18]:
accuracy_score(newsgroups_test.target, pred)

0.4474053295932679

In [23]:
p_matrix = 15 / avg_matrix
accuracies = []
for count, value in enumerate(newsgroups_test.target):
    accuracy = accuracy_score([value], [int(pred[count])])
    p = p_matrix[count]
    accuracies.append(accuracy*p)
    
result = sum(accuracies) / len(accuracies)
print(f'Wood Score v1 = {result}!')

Wood Score v1 = 1.152342695842117!


## What is a good value for "A"?

In [None]:
"""
The main equation that we have to worry about here is - a / sum(max_b of similarity(train/test))


Cosine similarity takes on values between 0 and 1 meaning that the denomenator will take on a value somewhere between (0, b)

For stability we may want to add a max(sum(max_b), np.eps) to avoid division by 0?
"""

## How to use sentence transformers for STS???

In [12]:
from sentence_transformers import SentenceTransformer, util
sentences = ["I'm happy", "I'm full of happiness"]

model = SentenceTransformer('all-MiniLM-L6-v2')

#Compute embedding for both lists
embedding_1= model.encode(sentences[0], convert_to_tensor=True)
embedding_2 = model.encode(sentences[1], convert_to_tensor=True)

test = util.pytorch_cos_sim(embedding_1, embedding_2)

In [26]:
a_norm = embedding_1 / embedding_1.norm()
b_norm = embedding_2 / embedding_2.norm()
res = a_norm @ b_norm.T
res

tensor(0.6003, device='cuda:0')

In [18]:
embedding_1@embedding_2.T

tensor(0.6003, device='cuda:0')

In [7]:
test.item()

0.6002568602561951

In [28]:
embeddings = model.encode(sentences, convert_to_tensor = True)

In [29]:
embeddings.shape

torch.Size([2, 384])

In [35]:
embeddings / embeddings.norm(dim=0)

tensor([[-0.9789,  0.0256, -0.1723,  0.2196,  0.3638, -0.9793,  0.7141, -0.0123,
         -0.6612,  0.5094,  0.9746, -0.5441,  0.9950,  0.9288,  0.5303,  0.3601,
          0.8983, -0.8832,  0.0420,  0.7086, -0.7811,  0.2444,  0.0368, -0.0227,
          0.4534, -0.0326,  0.9171, -0.9680,  0.9405, -0.4979, -0.9880,  0.9960,
          0.7064, -0.9897, -0.6682, -0.7107, -0.7046, -0.6136,  0.9078, -0.8851,
         -0.0738, -0.9702,  0.1700,  0.2970, -0.5985,  0.9864,  0.6968,  0.3486,
          0.6120, -0.8676,  0.3502,  0.3012, -0.6626, -0.9267,  0.6914,  0.8699,
          0.5964,  0.9408, -0.9999, -0.8361,  0.9981,  0.5822, -0.9998,  0.9626,
          0.3409,  0.5239, -0.8711,  0.1554, -0.4778,  0.5798,  0.9586,  0.3363,
          0.5820, -0.9199,  0.9971,  0.9649,  0.8329,  0.7814,  0.5148,  0.6839,
          0.8625, -0.7379, -0.7334, -0.8007, -0.8483, -0.9305, -0.4614,  0.6405,
         -0.7090,  0.8925, -0.2260,  0.2287,  0.8580, -0.8629, -0.7607, -0.8898,
         -0.2928,  0.9413, -

In [2]:
a = torch.randn(4, 3)
a

tensor([[ 0.1758, -0.2684,  2.3224],
        [ 0.0442,  0.4949, -0.0749],
        [ 0.8558, -0.3579, -1.0935],
        [-1.0837, -0.0103,  1.3125]])

In [4]:
a = torch.randn(2, 2)
b = torch.randn(3, 2) # different row number, for the fun

# Given that cos_sim(u, v) = dot(u, v) / (norm(u) * norm(v))
#                          = dot(u / norm(u), v / norm(v))
# We fist normalize the rows, before computing their dot products via transposition:
a_norm = a / a.norm(dim=1)[:, None]
b_norm = b / b.norm(dim=1)[:, None]
res = torch.mm(a_norm, b_norm.transpose(0,1))
print(res)
#  0.9978 -0.9986 -0.9985
# -0.8629  0.9172  0.9172

# -------
# Let's verify with numpy/scipy if our computations are correct:
a_n = a.numpy()
b_n = b.numpy()
res_n = np.zeros((2, 3))
for i in range(2):
    for j in range(3):
        # cos_sim(u, v) = 1 - cos_dist(u, v)
        res_n[i, j] = 1 - spatial.distance.cosine(a_n[i], b_n[j])
print(res_n)
# [[ 0.9978022  -0.99855876 -0.99854881]
#  [-0.86285472  0.91716063  0.9172349 ]]

tensor([[-0.2340, -0.7446, -0.1310],
        [ 0.6339,  0.9609, -0.3136]])
[[-0.2340468  -0.74459177 -0.13096361]
 [ 0.63388145  0.96090335 -0.31360054]]


In [11]:
torch.mean(a, 1)[0].item()

-0.24105462431907654

In [62]:
sentences1 = ["I'm happy", "I'm full of happiness"]

sentences2 = ["I'm sad", "I'm full of"]

embedding_1= model.encode(sentences1, convert_to_tensor=True)
embedding_2 = model.encode(sentences2, convert_to_tensor=True)

In [64]:
a_norm = embedding_1 / embedding_1.norm(dim = 0)
b_norm = embedding_2 / embedding_2.norm(dim = 0)
res = a_norm @ b_norm.T
res

RuntimeError: The size of tensor a (384) must match the size of tensor b (2) at non-singleton dimension 1

In [20]:
output = F.cosine_similarity(embedding_1, embedding_2, dim = 1)
print(output)

tensor([1., 1.], device='cuda:0')


In [9]:
a = torch.tensor([[1,2],[3,4]])
b = torch.tensor([[1,2], [4,5], [7,8]])

In [11]:
a@b.T

tensor([[ 5, 14, 23],
        [11, 32, 53]])