# Cosine Similarity

In [72]:
import numpy as np
import matplotlib.pyplot as plt
import math as ma
import sklearn

# Two vectors, $a$ and $b$, represent documents $a$ and $b$
- Each vector holds the term frequencies over the entire vocabulary of the documents: 0 if the word does not appear in the document, and a positive integer counting how many times the word appears otherwise.

In [73]:
# Illustrative vocabulary only (not defined in this notebook):
# v = ['fairy', 'princess', 'king', ...]
# Entry i of each list is the count of vocabulary word i in that document.
a = [1,0,0,1,6,3,0,0,1,0]
b = [0,1,2,0,0,3,0,4,1,5]

# $\Sigma_{i=1}^{len(d)} a[i] * b[i]$

or

# $\langle a, b \rangle$

or

# $a \cdot b$

or

# $a^\top b$

In [74]:
def similarity(a,b):
    """
    Inner product of two vectors.

    Parameters
    ----------
    a, b : sequences of numbers with the same length
        For example, term-frequency vectors over a shared vocabulary.

    Returns
    -------
    The sum of a[i] * b[i] over every position (0 for two empty vectors).

    Raises
    ------
    ValueError
        If the two vectors have different lengths. Without this check a
        longer ``b`` would be silently truncated to ``len(a)``.
    """
    if len(a) != len(b):
        raise ValueError(
            "vectors must have the same length, got %d and %d" % (len(a), len(b))
        )

    # Element-wise products, accumulated into a single scalar
    return sum(x * y for x, y in zip(a, b))

# Unnormalized inner product of the two example document vectors
similarity(a,b)

10

# $\cos\theta = \frac{\Sigma_{i=1}^{len(d)} a[i] * b[i]}{\sqrt{\Sigma_{i=1}^{len(d)} a[i]^2} \sqrt{\Sigma_{i=1}^{len(d)} b[i]^2}}$

or 

# $\cos\theta = \frac{a^\top b}{\left\lVert a \right\rVert \left\lVert b \right\rVert}$

In [75]:
def cosine_similarity(a,b):
    """
    Normalized inner product of the two vectors, i.e. the cosine of the
    angle between them.

    Parameters
    ----------
    a, b : sequences of numbers with the same length
        For example, term-frequency vectors over a shared vocabulary.

    Returns
    -------
    float
        (a . b) / (||a|| * ||b||)

    Raises
    ------
    ValueError
        If the two vectors have different lengths.
    ZeroDivisionError
        If either vector has zero magnitude (including empty vectors);
        the cosine is undefined in that case.
    """
    if len(a) != len(b):
        raise ValueError(
            "vectors must have the same length, got %d and %d" % (len(a), len(b))
        )

    # Element-wise similarity (inner product)
    inner = sum(x * y for x, y in zip(a, b))

    # Magnitude of a
    norm_a = ma.sqrt(sum(x**2 for x in a))

    # Magnitude of b (iterates over b itself, not over the indices of a)
    norm_b = ma.sqrt(sum(y**2 for y in b))

    # Same exception type the plain division would raise, but with a message
    # that explains the cause.
    if norm_a == 0 or norm_b == 0:
        raise ZeroDivisionError(
            "cosine similarity is undefined for a zero-magnitude vector"
        )

    return inner / (norm_a * norm_b)

# Cosine similarity of the two example document vectors
cosine_similarity(a,b)
    

0.1928791874526149

# Similarity is simply the inner product of the vectors

In [76]:
# NumPy's inner product first, then the hand-written loop version;
# the two printed values should be identical.
for inner_product in (np.inner, similarity):
    print(inner_product(a, b))

10
10


# Cosine Similarity is the normalized inner product of the vectors

In [77]:
# Same quantity as cosine_similarity(a, b), built from NumPy primitives:
# inner product divided by the product of the Euclidean norms.
inner_ab = np.inner(a, b)
norm_product = np.linalg.norm(a) * np.linalg.norm(b)
print(inner_ab / norm_product)

0.1928791874526149


In [78]:
# Import the submodule explicitly: a bare `import sklearn` may not load
# `sklearn.metrics` on a fresh kernel, in which case the attribute access
# below would fail.
import sklearn.metrics.pairwise

# scikit-learn takes a collection of row vectors and prints the full pairwise
# matrix; the off-diagonal entries are the cosine similarity of a and b.
print(sklearn.metrics.pairwise.cosine_similarity([a,b]))

[[1.         0.19287919]
 [0.19287919 1.        ]]
