In [110]:
from sentence_transformers import SentenceTransformer, util
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from scipy import spatial
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.datasets import fetch_20newsgroups

## Computing Matrix Multiplication of two embedding matricies

In [113]:
"""
Define two matricies that we will use to test the computation of the p-matrix which we will use to
weight the accuracy downstream

Matrix 1 will serve as a "train" set while Matrix 2 will serve as a "test" set...
"""
matrix_1 = torch.rand(5, 768)
matrix_2 = torch.rand(3, 768)

In [114]:
"""
Taking the cosine similarity of the two matricies in this order gives us a 3x5 array where we
have an array for each sample in "test" with the 5 floats in each array the cosine similarity with
test...
"""

cosine_sim = cosine_similarity(matrix_2, matrix_1)
cosine_sim.shape

(3, 5)

In [115]:
cosine_sim

array([[0.7520966 , 0.75049555, 0.72056574, 0.7420371 , 0.7596464 ],
       [0.76788914, 0.7493172 , 0.75799406, 0.75636005, 0.7686009 ],
       [0.77892154, 0.74041355, 0.74213445, 0.75219685, 0.7631999 ]],
      dtype=float32)

In [116]:
"""
This solution borrows from the second solution in: https://stackoverflow.com/questions/26984414/efficiently-sorting-a-numpy-array-in-descending-order
which gets the array into decending order by ordering the negative of the array and then 
transforming it back...

Dealing with axis: https://stackoverflow.com/questions/40200070/what-does-axis-0-do-in-numpys-sum-function
"""

-np.sort(-cosine_sim, axis = 1)

array([[0.7596464 , 0.7520966 , 0.75049555, 0.7420371 , 0.72056574],
       [0.7686009 , 0.76788914, 0.75799406, 0.75636005, 0.7493172 ],
       [0.77892154, 0.7631999 , 0.75219685, 0.74213445, 0.74041355]],
      dtype=float32)

In [117]:
"""
We only want the first n values of each array... here we use two as a test of indexing.

These are the top-n for each array regarding cosine similarity which we will next sum
"""

cosine_sim[:,:2]

array([[0.7520966 , 0.75049555],
       [0.76788914, 0.7493172 ],
       [0.77892154, 0.74041355]], dtype=float32)

In [118]:
summed = np.sum(cosine_sim[:, :2], axis = 1)
summed

array([1.5025921, 1.5172064, 1.519335 ], dtype=float32)

In [119]:
p = summed / 10
p

array([0.15025921, 0.15172064, 0.1519335 ], dtype=float32)

In [120]:
p[0]

0.15025921

## Running on Test Data

In [12]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [13]:
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

newsgroups_test = fetch_20newsgroups(subset='test',
                                 categories=cats)

In [14]:
len(newsgroups_train.data)

1073

In [22]:
#Compute embedding for both lists
train = model.encode(newsgroups_train.data, convert_to_tensor=True).cpu()
test = model.encode(newsgroups_test.data, convert_to_tensor=True).cpu()

In [31]:
cosine_sim = cosine_similarity(train, test)
cosine_sim.shape

(1073, 713)

## Finding the Average Cosine Similarity Across First Axis?

In [32]:
# we'd want to find the sum of the top-n for each row
cosine_sim[:50].shape

(50, 713)

In [33]:
# how do we find the sum of only the first "b" of each row of array
avg_matrix = np.sum(cosine_sim[:50], axis = 0)
# sort in place by descending - https://stackoverflow.com/questions/26984414/efficiently-sorting-a-numpy-array-in-descending-order 
avg_matrix[::-1].sort()

avg_matrix.shape

(713,)

In [30]:
avg_matrix[0]

12.471147

## Finding Wood Score across dummy inputs

In [13]:
"""
What we're worried about now is that we need to include p in the sample weights for accuracy... that either means we factor it
into the avg matrix which would be preferrable or to calculate accuracy by hand which I dont like as much...

"""

"\nWhat we're worried about now is that we need to include p in the sample weights for accuracy... that either means we factor it\ninto the avg matrix which would be preferrable or to calculate accuracy by hand which I dont like as much...\n\n"

In [14]:
newsgroups_test.target.shape

(713,)

In [15]:
pred = np.zeros((713,))

In [16]:
accuracy_score(newsgroups_test.target, pred)

0.4474053295932679

In [17]:
p_matrix = 15 / avg_matrix
accuracies = []
for count, value in enumerate(newsgroups_test.target):
    accuracy = accuracy_score([value], [int(pred[count])])
    p = p_matrix[count]
    accuracies.append(accuracy*p)
    
result = sum(accuracies) / len(accuracies)
print(f'Wood Score v1 = {result}!')

Wood Score v1 = 1.152342695842117!


## What is a good value for "A"?

In [18]:
"""
The main equation that we have to worry about here is - a / sum(max_b of similarity(train/test))


Cosine similarity takes on values between 0 and 1 meaning that the denomenator will take on a value somewhere between (0, b)

For stability we may want to add a max(sum(max_b), np.eps) to avoid division by 0?
"""

'\nThe main equation that we have to worry about here is - a / sum(max_b of similarity(train/test))\n\n\nCosine similarity takes on values between 0 and 1 meaning that the denomenator will take on a value somewhere between (0, b)\n\nFor stability we may want to add a max(sum(max_b), np.eps) to avoid division by 0?\n'