# Cosine Similarity

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math as ma
import sklearn

### Two vectors, $a$ and $b$ represent documents $a$ and $b$
- Each vector holds the term frequencies over the entire vocabulary of the documents: 0 if the word does not appear in the document, and a positive integer for how many times the word appears in it

In [2]:
# Each position corresponds to one vocabulary word, e.g. v = ['fairy', 'princess', 'king', ...]
a = [1, 0, 0, 1, 6, 3, 0, 0, 1, 0]
b = [0, 1, 2, 0, 0, 3, 0, 4, 1, 5]

### $\Sigma_{i=1}^{len(d)} a[i] * b[i]$

or

### $\langle a, b \rangle$

or

### $a \cdot b$

or

### $a^\top b$

In [3]:
def similarity(a,b):
    """
    Inner product of two vectors

    Parameters
    ----------
    a, b : sequences of numbers with the same length

    Returns
    -------
    The sum of the element-wise products a[i] * b[i] (0 for empty vectors).

    Raises
    ------
    ValueError
        If the two vectors do not have the same length.
    """
    # Without this check a longer `b` would be silently truncated and a
    # shorter `b` would fail with an unrelated IndexError.
    if len(a) != len(b):
        raise ValueError(
            "vectors must have the same length, got %d and %d" % (len(a), len(b))
        )

    # Element wise products, summed up
    return sum(x * y for x, y in zip(a, b))

# Un-normalized inner product of the two term-frequency vectors
similarity(a,b)

10

### $\cos\theta = \frac{\Sigma_{i=1}^{len(d)} a[i] * b[i]}{\sqrt{\Sigma_{i=1}^{len(d)} a[i]^2} \sqrt{\Sigma_{i=1}^{len(d)} b[i]^2}}$

or 

### $\cos\theta = \frac{a^\top b}{\left\lVert a \right\rVert \left\lVert b \right\rVert}$

In [8]:
def cos_similarity(a,b):
    """
    Normalized inner product of the two vectors

    Computes (a . b) / (||a|| * ||b||), i.e. the cosine of the angle
    between the two vectors.

    Parameters
    ----------
    a, b : sequences of numbers with the same length

    Returns
    -------
    The cosine similarity as a float. If either vector has zero magnitude
    the angle is undefined; 0.0 is returned in that case instead of raising
    ZeroDivisionError.

    Raises
    ------
    ValueError
        If the two vectors do not have the same length.
    """
    # Without this check a longer `b` would be silently truncated and a
    # shorter `b` would fail with an unrelated IndexError.
    if len(a) != len(b):
        raise ValueError(
            "vectors must have the same length, got %d and %d" % (len(a), len(b))
        )

    # Element wise similarity (inner product)
    dot_product = sum(x * y for x, y in zip(a, b))

    # Magnitude of a
    norm_a = ma.sqrt(sum(x ** 2 for x in a))

    # Magnitude of b
    norm_b = ma.sqrt(sum(y ** 2 for y in b))

    denominator = norm_a * norm_b
    if denominator == 0:
        # At least one vector is all zeros (or empty): no shared direction.
        return 0.0

    return dot_product / denominator

# Same two vectors, now with the inner product normalized by both magnitudes
cos_similarity(a,b)
    

0.1928791874526149

### Similarity is simply the inner product of the vectors

In [9]:
# NumPy's inner product first, then the hand-written loop; both lines should show the same number
for inner_product_fn in (np.inner, similarity):
    print(inner_product_fn(a, b))

10
10


### Cosine Similarity is the normalized inner product of the vectors

In [10]:
# Cosine similarity with NumPy only: inner product divided by the product of the L2 norms
dot_ab = np.inner(a, b)
norm_product = np.linalg.norm(a) * np.linalg.norm(b)
print(dot_ab / norm_product)

0.1928791874526149


In [14]:
# A bare `import sklearn` is not guaranteed to load the `metrics.pairwise`
# submodule (older releases raise AttributeError on a fresh kernel), so
# import the submodule explicitly.
import sklearn.metrics.pairwise

# Passing both vectors as rows yields the full pairwise matrix:
# entry [i][j] is the cosine similarity between row i and row j.
print(sklearn.metrics.pairwise.cosine_similarity([a, b]))

[[1.         0.19287919]
 [0.19287919 1.        ]]


# Example:


In [28]:
# Raw text of the two example documents (one about Euler, one about Leibniz).
# NOTE: this rebinds `a` and `b` from the term-frequency lists used earlier
# in the notebook to plain strings, so the earlier cells must not be re-run
# after this one without first re-running the cell that defines the lists.
a = 'Leonhard Euler was a Swiss mathematician, physicist, astronomer, logician and engineer who made important and influential discoveries in many branches of mathematics, such as infinitesimal calculus and graph theory, while also making pioneering contributions to several branches such as topology and analytic number theory. He also introduced much of the modern mathematical terminology and notation, particularly for mathematical analysis, such as the notion of a mathematical function.[3] He is also known for his work in mechanics, fluid dynamics, optics, astronomy and music theory'

b = "Wilhelm Leibniz was a prominent German polymath and philosopher in the history of mathematics and the history of philosophy. His most notable accomplishment was conceiving the ideas of differential and integral calculus, independently of Isaac Newton's contemporaneous developments.[16] Mathematical works have generally favored Leibniz's notation as the conventional expression of calculus. It was only in the 20th century that Leibniz's law of continuity and transcendental law of homogeneity found mathematical implementation (by means of non-standard analysis). He became one of the most prolific inventors in the field of mechanical calculators. While working on adding automatic multiplication and division to Pascal's calculator, he was the first to describe a pinwheel calculator in 1685[17] and invented the Leibniz wheel, used in the arithmometer, the first mass-produced mechanical calculator. He also refined the binary number system, which is the foundation of all digital computers."

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with scikit-learn's default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from both documents and count term occurrences;
# the result has one row per document and one column per vocabulary term.
documents_vectorized = vectorizer.fit_transform([a,b])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed output a plain Python list of strings.
print(vectorizer.get_feature_names_out().tolist())

['16', '1685', '17', '20th', 'accomplishment', 'adding', 'all', 'also', 'analysis', 'analytic', 'and', 'arithmometer', 'as', 'astronomer', 'astronomy', 'automatic', 'became', 'binary', 'branches', 'by', 'calculator', 'calculators', 'calculus', 'century', 'computers', 'conceiving', 'contemporaneous', 'continuity', 'contributions', 'conventional', 'describe', 'developments', 'differential', 'digital', 'discoveries', 'division', 'dynamics', 'engineer', 'euler', 'expression', 'favored', 'field', 'first', 'fluid', 'for', 'found', 'foundation', 'function', 'generally', 'german', 'graph', 'have', 'he', 'his', 'history', 'homogeneity', 'ideas', 'implementation', 'important', 'in', 'independently', 'infinitesimal', 'influential', 'integral', 'introduced', 'invented', 'inventors', 'is', 'isaac', 'it', 'known', 'law', 'leibniz', 'leonhard', 'logician', 'made', 'making', 'many', 'mass', 'mathematical', 'mathematician', 'mathematics', 'means', 'mechanical', 'mechanics', 'modern', 'most', 'much', 'm

In [56]:
# Densify the sparse count matrix once; row 0 is the first document, row 1 the second
term_counts = documents_vectorized.toarray()
euler_counts, leibniz_counts = term_counts[0], term_counts[1]
print('Euler:', euler_counts)
print('Leibniz:', leibniz_counts)


Euler: [0 0 0 0 0 0 0 3 1 1 6 0 3 1 1 0 0 0 2 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1
 1 1 0 0 0 0 1 2 0 0 1 0 0 1 0 2 1 0 0 0 0 1 2 0 1 1 0 1 0 0 1 0 0 1 0 0 1
 1 1 1 1 0 3 1 1 0 0 1 1 0 1 0 1 0 0 0 1 1 1 3 0 0 0 1 1 0 0 0 1 0 1 0 0 0
 0 0 1 0 3 1 0 1 0 2 3 1 1 0 0 1 0 0 1 1 0 1 0 0]
Leibniz: [ 1  1  1  1  1  1  1  1  1  0  6  1  1  0  0  1  1  1  0  1  3  1  2  1
  1  1  1  1  0  1  1  1  1  1  0  1  0  0  0  1  1  1  2  0  0  1  1  0
  1  1  0  1  3  1  2  1  1  1  0  5  1  0  0  1  0  1  1  1  1  1  0  2
  4  0  0  0  0  0  1  2  0  1  1  2  0  0  2  0  1  0  1  1  1  1  0  1
 11  1  1  1  0  0  1  1  1  0  1  0  1  1  1  1  1  0  1  0  0  1  0  1
 13  0  2  0  1  1  4  1  1  1  0  1  0  1  1]
270


# Cosine Similarity Between two documents:

In [57]:
# The helper defined earlier in this notebook is named `cos_similarity`;
# `cosine_similarity` is never bound at module level, so the original call
# raised NameError on a fresh kernel.
document_counts = documents_vectorized.toarray()
cos_similarity(document_counts[0], document_counts[1])

0.48721293531595355