# Efficient cosine similarities

## With PyTorch

In [27]:
import torch
import numpy as np

In [28]:
# Ask PyTorch whether a CUDA device can be used in this session.
torch.cuda.is_available()

False

In [29]:
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)


cpu


In [102]:
# Seed PyTorch's RNG so that re-running the notebook from a fresh kernel
# regenerates the same random vectors (and hence the same similarity values).
torch.manual_seed(0)

# Two batches of random 10-dimensional row vectors, moved to the chosen device.
# The row counts differ on purpose so the similarity matrix is rectangular (2 x 3).
a = torch.randn(2, 10).to(device)
b = torch.randn(3, 10).to(device)  # different row count, just for fun

In [103]:
%%timeit
# Timed PyTorch variant: divide each row by its norm (the [:, None] turns the
# per-row norms into a column so the division broadcasts across the row) ...
a_norm = a / a.norm(dim=1)[:, None]
b_norm = b / b.norm(dim=1)[:, None]
# ... then one matrix product of the normalised rows gives every pairwise
# cosine similarity (rows of a against rows of b).
res = torch.mm(a_norm, b_norm.transpose(0,1))

41.1 µs ± 659 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [115]:
# Scale every row to unit length; keepdim=True keeps the norms as a column
# vector so the division broadcasts across each row.
a_norm = a / a.norm(dim=1, keepdim=True)
b_norm = b / b.norm(dim=1, keepdim=True)
# The product of the unit rows is the (rows of a) x (rows of b) cosine-similarity matrix.
res = a_norm @ b_norm.t()
print(res)

tensor([[ 0.1189,  0.0432, -0.4907],
        [-0.0593,  0.1020, -0.4319]])


## With NumPy

In [105]:
# NumPy copies of the same vectors: drop any autograd tracking, make sure the
# data lives in host memory, then convert to ndarrays.
A = a.detach().cpu().numpy()
B = b.detach().cpu().numpy()

In [117]:
%%timeit
# Timed NumPy variant.
# All pairwise dot products between rows of A and rows of B.
dots = np.dot(A,B.T)
# Squared row norms of A (as a column) times squared row norms of B, then the
# square root: entry (i, j) is |A_i| * |B_j|.
l2norms = np.sqrt(((A**2).sum(1)[:,None])*((B**2).sum(1)))
# Element-wise division turns each dot product into a cosine similarity.
similarity_scores = (dots/l2norms)

14.1 µs ± 607 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [116]:
# Pairwise dot products between the rows of A and the rows of B.
dots = A @ B.T
# sqrt(|A_i|^2 * |B_j|^2) for every (i, j): the product of the two row norms.
l2norms = np.sqrt(np.square(A).sum(axis=1)[:, None] * np.square(B).sum(axis=1))
# Dot product over norm product = cosine similarity.
similarity_scores = dots / l2norms
print(similarity_scores)

[[ 0.11893534  0.04317652 -0.49073324]
 [-0.05932184  0.10199047 -0.4318895 ]]


## Compare to sklearn and scipy

In [108]:
from sklearn.metrics.pairwise import cosine_similarity

In [118]:
%%timeit
# Timed reference: scikit-learn's pairwise cosine similarity on the same NumPy arrays.
cosine_similarity(A,B)

205 µs ± 6.59 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [110]:
# Show scikit-learn's result for comparison with the hand-written versions.
# Pass the NumPy arrays A and B (as the timed cell does) rather than the torch
# tensors: handing tensors to scikit-learn relies on an implicit tensor-to-array
# conversion, which raises when `device` is a CUDA device.
cosine_similarity(A, B)

array([[ 0.11893529,  0.04317654, -0.4907333 ],
       [-0.05932185,  0.10199045, -0.43188953]], dtype=float32)

In [111]:
from scipy.spatial.distance import cosine as cosine_distance

In [112]:
%%timeit
# Timed baseline: call SciPy's cosine *distance* once per (row of A, row of B)
# pair from a plain Python double loop.
distances=[]
for vec_a in A:
    for vec_b in B:
        distances.append(cosine_distance(vec_a,vec_b))

402 µs ± 124 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [113]:
# Convert SciPy's cosine distance back to a similarity (1 - distance) for every
# (row of A, row of B) pair, flattened with the rows of A as the outer loop.
# NOTE: despite its name, `distances` ends up holding similarities.
distances = [
    1 - cosine_distance(vec_a, vec_b)
    for vec_a in A
    for vec_b in B
]
distances

[0.11893530935049057,
 0.04317651689052582,
 -0.49073323607444763,
 -0.059321850538253784,
 0.1019904762506485,
 -0.43188953399658203]

In [120]:
import pandas as pd

In [122]:
# Load the CSV from the notebook's working directory with default parsing options.
# NOTE(review): the displayed frame has 'Unnamed: 0' and 'Unnamed: 0.1' columns,
# which looks like an index that was written to the file (twice) - confirm whether
# these should be dropped or read via index_col.
df = pd.read_csv('f250000.csv')

In [123]:
# Display the loaded frame; the notebook repr elides the middle rows of a long frame.
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5
0,0,BB52BEE967694542B309F1924878C1D8206,"[array([-0.22878039, -0.41120726, -0.5793576 ,...",0.000660,0.000660,5,5
1,1,169FC71888B64710B6CA2836A73D2BFA206,"[array([-0.22317335, -0.3341324 , -0.5560438 ,...",0.001082,0.000780,19,11
2,2,F92FF77DEEB540FFBEAA1A76A1574427206,"[array([-2.72363633e-01, -3.51571441e-01, -5.9...",0.001070,0.000800,15,8
3,3,5821C1E0DF3240118CA39013E1E8CFED206,"[array([-0.24757737, -0.3919592 , -0.5711255 ,...",0.000514,0.000466,18,8
4,4,AF3D6FCFBD054BCB86A03D57BAF073A6206,"[array([-1.84394434e-01, -3.83317232e-01, -5.8...",0.000466,0.000466,10,10
...,...,...,...,...,...,...,...
24995,24995,EE022B08D3E845B48C05765750658627206,"[array([-1.84042796e-01, -3.68921787e-01, -5.6...",0.000377,0.000377,22,22
24996,24996,52C77D7F170E43379EE4BE4B6F2EAAC3206,"[array([-0.2660032 , -0.3723642 , -0.5636557 ,...",0.000513,0.000513,8,8
24997,24997,D105E52E5BEC421A9997E73BA7972D7D206,"[array([-0.19452327, -0.3731445 , -0.5522237 ,...",0.000753,0.000639,23,4
24998,24998,C68E38838DE746C6871B763E42F90B1A206,"[array([-0.19274597, -0.35365516, -0.55021644,...",0.000554,0.000554,11,11
