# Tutorial: How to Calculate Cosine Similarity

## Method 1: Use numpy's dot and norm functions

In [7]:
from numpy import dot
from numpy.linalg import norm
X = [1,2]
Y = [2,2]
cos_sim = dot(X,Y) / (norm(X)*norm(Y))
print(cos_sim)

0.9486832980505138


## Method 2: Use scipy's built-in cosine function

In [8]:
# note that this function actually calculates cosine similarity 
# and then use "1-similarity" to convert similarity to distance
# to get the actual cosine similarity, you need to do 1-distance

from scipy import spatial
X = [1,2]
Y = [2,2]
cos_sim = 1 - spatial.distance.cosine(X, Y)
print(cos_sim)

0.9486832980505138


## Method 3: Use sklearn to calculate the cosine similarity matrix among vectors

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
X = np.array([1,2])
Y = np.array([2,2])
Z = np.array([2,4])

# calculate cosine similarity between [X] and [Y,Z]
# sending input as arrays would allow for calculating both cosine_sim(X,Y) and cosine_sim (X,Y)
cos_sim = cosine_similarity([X], [Y,Z])
print(cos_sim)

# calculate the entire cosie similarity matrix among X, Y, and Z
cos_sim = cosine_similarity([X, Y, Z])
print(cos_sim)
print()


[[0.9486833 1.       ]]
[[1.        0.9486833 1.       ]
 [0.9486833 1.        0.9486833]
 [1.        0.9486833 1.       ]]



## Use cosine similarity for plagiarism detection

In [10]:
# data from https://www.bowdoin.edu/studentaffairs/academic-honesty/examples/mosaic/index.shtml

txt_original = "Contrast the condition into which all these friendly Indians are suddenly plunged now, with their condition only two years previous: martial law now in force on all their reservations; themselves in danger of starvation, and constantly exposed to the influence of emissaries from their friends and relations, urging them to join in fighting this treacherous government that had kept faith with nobody--neither with friend nor with foe."
txt_plagiarized = "Only two years later, all these friendly Sioux were suddenly plunged into new conditions, including starvation, martial law on all their reservations, and constant urging by their friends and relations to join in warfare against the treacherous government that had kept faith with neither friend nor foe."
txt_control = "Only two years later, all the money he won from lottery was gone."

txts = [txt_original, txt_plagiarized, txt_control]
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

unigram_count = CountVectorizer(encoding='latin-1', binary=False)
unigram_count_stop_remove = CountVectorizer(encoding='latin-1', binary=True, stop_words='english')

vecs = unigram_count.fit_transform(txts)

print(vecs.shape)

from sklearn.metrics.pairwise import linear_kernel
cos_sim = cosine_similarity(vecs[0], vecs)
print(cos_sim)
sim_sorted_doc_idx = cos_sim.argsort()
print(sim_sorted_doc_idx.shape)

# print the most similar doc; it's actually the original doc itself
print(txts[sim_sorted_doc_idx[0][len(txts)-1]])
print()

# print the second most similar doc; it's the most likely plagiarized one
print(txts[sim_sorted_doc_idx[0][len(txts)-2]])


(3, 70)
[[1.         0.69376514 0.21550898]]
(1, 3)
Contrast the condition into which all these friendly Indians are suddenly plunged now, with their condition only two years previous: martial law now in force on all their reservations; themselves in danger of starvation, and constantly exposed to the influence of emissaries from their friends and relations, urging them to join in fighting this treacherous government that had kept faith with nobody--neither with friend nor with foe.

Only two years later, all these friendly Sioux were suddenly plunged into new conditions, including starvation, martial law on all their reservations, and constant urging by their friends and relations to join in warfare against the treacherous government that had kept faith with neither friend nor foe.


In [54]:
Doc1 = ("book, book, music, video, video")
Doc2= ("music, music, video")
Doc3= ("book, book, video")
txts = [Doc1,Doc2,Doc3]

boolean_unigram = CountVectorizer(encoding='latin-1', binary=True, stop_words='english', lowercase=False)
count_unigram = CountVectorizer(encoding='latin-1', binary=False, stop_words='english', lowercase=False)
vectorizers = [boolean_unigram, CountVectorizer]

vec = boolean_unigram.fit_transform(txts)
vec1 = count_unigram.fit_transform(txts)
 

print('Cos Similarity Scores with Boolean Vectorization:')
for i in range(0, 3):        
    print('Doc {}: {} -- Most similar, Ranked: {}'.format(i +1 , cosine_similarity(vec[i], vec), cosine_similarity(vec[i], vec).argsort()))
print('---------------------------------------------------')
print('---------------------------------------------------')
print('Cos Similarity Scores with Count Vectorization:')
for i in range(0, 3):        
    print('Doc {}: {} -- Most similar, Ranked: {}'.format(i +1 , cosine_similarity(vec1[i], vec1), cosine_similarity(vec1[i], vec1).argsort()))


Cos Similarity Scores with Boolean Vectorization:
Doc 1: [[1.         0.81649658 0.81649658]] -- Most similar, Ranked: [[1 2 0]]
Doc 2: [[0.81649658 1.         0.5       ]] -- Most similar, Ranked: [[2 0 1]]
Doc 3: [[0.81649658 0.5        1.        ]] -- Most similar, Ranked: [[1 0 2]]
---------------------------------------------------
---------------------------------------------------
Cos Similarity Scores with Count Vectorization:
Doc 1: [[1.         0.59628479 0.89442719]] -- Most similar, Ranked: [[1 2 0]]
Doc 2: [[0.59628479 1.         0.2       ]] -- Most similar, Ranked: [[2 0 1]]
Doc 3: [[0.89442719 0.2        1.        ]] -- Most similar, Ranked: [[1 0 2]]


In [64]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
import sklearn
from sklearn.model_selection import  cross_val_score

newsgroup = fetch_20newsgroups('all', categories= ['talk.politics.misc', 'talk.religion.misc'])
count_unigram = CountVectorizer(encoding='latin-1', binary=False, stop_words='english', lowercase=False)


X =newsgroup['data']
y = newsgroup['target']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)
X_test_vec = unigram_count_vectorizer.transform(X_test)

In [65]:
# import the LinearSVC module
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
svm_clf = LinearSVC(C=1)

# use the training data to train the model
svm_clf.fit(X_train_vec,y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [90]:
## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_count_vectorizer.get_feature_names()))

very_negative_10 = feature_ranks[:10]

print("Very negative words")
for i in range(0, len(very_negative_10)):
    print(very_negative_10[i])
print()

## get 10 features that are least relevant to "very negative" sentiment (they are at the top of the ranked list)
not_very_negative_10 = feature_ranks[-10:]
print("positive words")
for i in range(0, len(not_very_negative_10)):
    print(not_very_negative_10[i])
print()


Very negative words
(-0.22775602422547228, 'sexual')
(-0.16481286336424758, 'clinton')
(-0.14921268049946385, 'york')
(-0.14387612158057855, 'judge')
(-0.1260833516960672, 'kaldis')
(-0.12191641377190139, 'american')
(-0.12170341046292382, 'open')
(-0.1129558060598557, 'malcolm')
(-0.10786750331199461, 'lines')
(-0.10665460040984912, 'brandeis')

positive words
(0.1274565597259194, '15')
(0.12985612675857158, 'suggestion')
(0.13179645640026524, '666')
(0.13289886219082053, 'promise')
(0.134754092309615, 'buffalo')
(0.1377677974968512, 'happened')
(0.1386329374671371, 'info')
(0.14200448134572397, 'morality')
(0.18320065011216924, 'age')
(0.22244676841279454, 'christian')



In [86]:
import numpy as np
np.shape(svm_clf.coef_)

(1, 3223)