# Cosine Similarity

Cosine Similarity
 - Metric used to determine how similar the documents are irrespective of their size
 - The vectors we are talking about here are arrays containing the word counts of two or more documents

In [16]:
import os              # working directory 
import pandas as pd    # data preprocessing
import numpy as np     # data preprocessing
import math            # using mathematical functions
import seaborn as sns            # for visualization
import matplotlib.pyplot as plt  # for visualization
from plotly.offline import init_notebook_mode, iplot   # for visualization
from pandas.plotting import scatter_matrix             # for visualization
import warnings  
%matplolib inline

pd.options.display.max_columns = 999  
pd.options.display.max_rows = 999  
warnings.filterwarnings("ignore")  
init_notebook_mode(connected=True)  

UsageError: Line magic function `%matplolib` not found.


### Example

In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
 
# Two small example vectors.
a = np.array([1, 2, 3])
b = np.array([1, 1, 4])

# Cosine similarity by hand: dot product divided by the product of the L2 norms.
dot = a.dot(b)
norma, normb = (np.linalg.norm(vec) for vec in (a, b))
cos = dot / (norma * normb)

# The library routine works on 2-D inputs (one row per vector),
# so each vector is turned into a single-row matrix first.
aa = a.reshape(1, -1)
ba = b.reshape(1, -1)
cos_lib = cosine_similarity(aa, ba)

# The hand-computed and library values should agree (up to floating-point rounding).
print(dot, norma, normb, cos, cos_lib[0, 0])

15 3.7416573867739413 4.242640687119285 0.9449111825230682 0.9449111825230683


### Example 
Document(1) : he likes apple  
Document(2) : he likes banana  
Document(3) : he likes banana he likes banana    
  
  
Document-Term Matrix
 - (banana, apple, he, likes)
 - Document(1) : (0,1,1,1)
 - Document(2) : (1,0,1,1)
 - Document(3) : (2,0,2,2)

In [18]:
# package import
from numpy import dot
from numpy.linalg import norm
import numpy as np

# define function for cosine similarity
def cos_sim(A, B):
    """Return the cosine similarity between the vectors ``A`` and ``B``.

    Computed as ``dot(A, B) / (norm(A) * norm(B))``.

    Parameters
    ----------
    A, B : array-like
        Vectors of equal length (e.g. term-count vectors of two documents).

    Returns
    -------
    float
        The cosine of the angle between ``A`` and ``B``. If either vector has
        zero norm the angle is undefined; ``0.0`` is returned in that case
        instead of dividing by zero (which would yield ``nan`` and a
        RuntimeWarning).
    """
    denominator = norm(A) * norm(B)
    # An all-zero vector (e.g. an empty document) has no direction.
    if denominator == 0:
        return 0.0
    return dot(A, B) / denominator

# Term-count vectors of the three example documents, one row per document.
doc1, doc2, doc3 = np.array([
    [0, 1, 1, 1],
    [1, 0, 1, 1],
    [2, 0, 2, 2],
])

# Pairwise cosine similarities, computed first and reported afterwards.
sim_12 = cos_sim(doc1, doc2)
sim_13 = cos_sim(doc1, doc3)
sim_23 = cos_sim(doc2, doc3)

print('cos_sim between doc1 and doc2 :', sim_12)
print('cos_sim between doc1 and doc3 :', sim_13)
print('cos_sim between doc2 and doc3 :', sim_23)

cos_sim between doc1 and doc2 : 0.6666666666666667
cos_sim between doc1 and doc3 : 0.6666666666666667
cos_sim between doc2 and doc3 : 1.0000000000000002


-----

# Practice
 - Implementation of recommendation system using Cosine-Similarity
 - Data Source : https://www.kaggle.com/rounakbanik/the-movies-dataset
 - Subject : Movie Recommendation

### Reading Data

In [19]:
import pandas as pd
# Read the raw metadata once; `d_orig` stays untouched and `d` is the working copy.
d_orig = pd.read_csv(
    './data/kaggle_the_movies_dataset/movies_metadata.csv',
    low_memory=False,
)
d = d_orig.copy()
print('shape:', d.shape)
# Transposed preview of the first two movies: one row per field.
d.head(2).T

shape: (45466, 24)


Unnamed: 0,0,1
adult,False,False
belongs_to_collection,"{'id': 10194, 'name': 'Toy Story Collection', ...",
budget,30000000,65000000
genres,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
homepage,http://toystory.disney.com/toy-story,
id,862,8844
imdb_id,tt0114709,tt0113497
original_language,en,en
original_title,Toy Story,Jumanji
overview,"Led by Woody, Andy's toys live happily in his ...",When siblings Judy and Peter discover an encha...


Here, we will use the 'title' and 'overview' columns.  
We are going to calculate the cosine similarity from the 'overview' column values.  
Consequently, when we give the title of a movie we like as input, the model will return (recommend) the titles of other movies we are likely to enjoy as well. 

In [20]:
# use 40,000 rows for training
# `.head()` returns a slice of the frame; `.copy()` makes `d` an independent
# DataFrame so that assigning to its columns afterwards does not operate on a
# slice (which can trigger pandas' SettingWithCopyWarning).
d = d.head(40000).copy()
# Number of movies that have no overview text.
d['overview'].isnull().sum()

731

-> The 'overview' column has many null values (in this case, 731)   
-> Replace the null values with '' (an empty string)

In [21]:
# Swap missing overviews for empty strings so the text vectoriser gets only str values.
overview_filled = d['overview'].fillna('')
d['overview'] = overview_filled
# Sanity check: the count of missing overviews should now be zero.
d['overview'].isna().sum()

0

### Implementation of tf-idf

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorise the overview texts with TF-IDF, ignoring English stop words.
overviews = d['overview']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(overviews)
# One row per movie, one column per vocabulary term.
print('shape of tfidf_matrix :', tfidf_matrix.shape)

shape of tfidf_matrix : (40000, 70485)


### Calculate Cosine-Similarity

In [23]:
# Built-In module for Cosine-Similarity
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products between all TF-IDF rows. The diagonal of the result is 1.0,
# i.e. the rows have unit length, so these dot products are the cosine similarities.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)
cosine_sim

array([[1.        , 0.01511114, 0.        , ..., 0.        , 0.0408661 ,
        0.        ],
       [0.01511114, 1.        , 0.04738832, ..., 0.        , 0.02197618,
        0.        ],
       [0.        , 0.04738832, 1.        , ..., 0.        , 0.        ,
        0.0057983 ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.0336424 ],
       [0.0408661 , 0.02197618, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.0057983 , ..., 0.0336424 , 0.        ,
        1.        ]])

In [24]:
# Build a lookup Series from movie title to row number:
#  -> index of series : title
#  -> value of series : index number
indices = pd.Series(d.index, index=d['title'])

# `Series.drop_duplicates()` compares the *values* (row numbers, which are all
# distinct), so it never removed repeated titles. De-duplicate on the index instead,
# keeping the first movie for each title, so that `indices[title]` always yields a
# single row number rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]
indices[0:5]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

-> We build this Series because we want to get back the index number when we give the title of a movie as input.

In [25]:
# For example, fetch the row number stored under one title.
idx = indices.loc['Waiting to Exhale']
print(idx)

3


### Define function for Recommendation

In [26]:
def get_recommendation(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Relies on the notebook-level ``indices`` (title -> row number) and ``d``
    (the movie DataFrame).

    Parameters
    ----------
    title : str
        Title of the movie to base the recommendation on. A title missing from
        ``indices`` raises ``KeyError``.
    cosine_sim : 2-D array
        Pairwise similarity matrix; row ``i`` holds the similarity of movie ``i``
        to every movie. Defaults to the matrix computed earlier in the notebook.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first.
    """
    # Get the row number of the selected movie from its title.
    idx = indices[title]
    # A title shared by several movies makes the lookup return several row
    # numbers; use the first one so the rest of the function sees a scalar.
    if np.ndim(idx) > 0:
        idx = np.asarray(idx)[0]

    # Similarity of the selected movie to every movie.
    scores = np.asarray(cosine_sim[idx])

    # Order movies by decreasing similarity. The stable sort keeps ties in
    # ascending row order, as the original stable `sorted(..., reverse=True)` did.
    order = np.argsort(-scores, kind='stable')

    # Drop the selected movie explicitly instead of assuming it sorts first:
    # another movie with an equal score could otherwise take its place and
    # the query itself would be recommended.
    order = order[order != idx][:top_n]

    # Return the titles of the most similar movies.
    return d['title'].iloc[order]

In [27]:
# Recommend movies whose overviews are most similar to that of 'The Dark Knight Rises'.
get_recommendation('The Dark Knight Rises')

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

### Appendix

The list of Movies (=titles)

In [28]:
# Count the distinct titles available for lookup.
unique_title_count = len(indices.index.unique())
print('***** the number of unique movie title :', unique_title_count, '*****')
# Show only the first 50 titles rather than the whole list.
list(indices.index[:50])

***** the number of unique movie title : 37296 *****


['Toy Story',
 'Jumanji',
 'Grumpier Old Men',
 'Waiting to Exhale',
 'Father of the Bride Part II',
 'Heat',
 'Sabrina',
 'Tom and Huck',
 'Sudden Death',
 'GoldenEye',
 'The American President',
 'Dracula: Dead and Loving It',
 'Balto',
 'Nixon',
 'Cutthroat Island',
 'Casino',
 'Sense and Sensibility',
 'Four Rooms',
 'Ace Ventura: When Nature Calls',
 'Money Train',
 'Get Shorty',
 'Copycat',
 'Assassins',
 'Powder',
 'Leaving Las Vegas',
 'Othello',
 'Now and Then',
 'Persuasion',
 'The City of Lost Children',
 'Shanghai Triad',
 'Dangerous Minds',
 'Twelve Monkeys',
 'Wings of Courage',
 'Babe',
 'Carrington',
 'Dead Man Walking',
 'Across the Sea of Time',
 'It Takes Two',
 'Clueless',
 'Cry, the Beloved Country',
 'Richard III',
 'Dead Presidents',
 'Restoration',
 'Mortal Kombat',
 'To Die For',
 'How To Make An American Quilt',
 'Se7en',
 'Pocahontas',
 'When Night Is Falling',
 'The Usual Suspects']

# Soft Cosine-Similarity
 - Suppose we have another set of documents on a completely different topic.
 - We want a similarity metric that gives higher scores for documents belonging to the same topic   
   and lower scores when comparing docs from different topics.
 - In such a case we need to consider semantic meaning: words that are similar in meaning should be treated as similar.
 - For example, 'President' vs 'Prime Minister', 'Food' vs 'Dish', 'Hi' vs 'Hello' should be considered similar.
 - Converting the words into their respective word vectors and then computing the similarities between those vectors addresses this problem.

In [1]:
# Define the documents (1): three texts on the same topic (politics)
doc_trump = "Mr. Trump became president after winning the political election. Though he lost the support of some republican friends, Trump is friends with President Putin"
doc_election = "President Trump says Putin had no political interference is the election outcome. He says it was a witchhunt by political parties. He claimed President Putin is a friend who had nothing to do with the election"
doc_putin = "Post elections, Vladimir Putin became President of Russia. President Putin had served as the Prime Minister earlier in his political career"

# Define the documents (2): three texts on a different topic (food)
doc_soup = "Soup is a primarily liquid food, generally served warm or hot (but may be cool or cold), that is made by combining ingredients of meat or vegetables with stock, juice, water, or another liquid. "
doc_noodles = "Noodles are a staple food in many cultures. They are made from unleavened dough which is stretched, extruded, or rolled flat and cut into one of a variety of shapes."
doc_dosa = "Dosa is a type of pancake from the Indian subcontinent, made from a fermented batter. It is somewhat similar to a crepe in appearance. Its main ingredients are rice and black gram."

# All six documents in a fixed order: positions 0-2 are the politics texts, 3-5 the food texts.
documents = [doc_trump, doc_election, doc_putin, doc_soup, doc_noodles, doc_dosa]

In [2]:
import gensim
# NOTE: `softcossim` is deprecated; gensim's own warning (shown further down) says it
# is removed in 4.0.0, so this import needs a gensim 3.x release rather than the latest one.
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
# Print the installed gensim version (this run reported 3.8.1).
print(gensim.__version__)
# Expected-output note kept from an earlier version of this cell; it does not match this run:
#> '3.6.0'

3.8.1


In [33]:
# Fetch the catalogue exposed by gensim's downloader API and display it.
# The result is a large nested dict (its 'corpora' entry is visible in the output).
info = api.info()
info

{'corpora': {'semeval-2016-2017-task3-subtaskBC': {'num_records': -1,
   'record_format': 'dict',
   'file_size': 6344358,
   'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/semeval-2016-2017-task3-subtaskB-eng/__init__.py',
   'license': 'All files released for the task are free for general research use',
   'fields': {'2016-train': ['...'],
    '2016-dev': ['...'],
    '2017-test': ['...'],
    '2016-test': ['...']},
   'description': 'SemEval 2016 / 2017 Task 3 Subtask B and C datasets contain train+development (317 original questions, 3,169 related questions, and 31,690 comments), and test datasets in English. The description of the tasks and the collected data is given in sections 3 and 4.1 of the task paper http://alt.qcri.org/semeval2016/task3/data/uploads/semeval2016-task3-report.pdf linked in section “Papers” of https://github.com/RaRe-Technologies/gensim-data/issues/18.',
   'checksum': '701ea67acd82e75f95e1d8e62fb0ad29',
   'file_name': 'se

In [5]:
# take some time 
# Load the pre-trained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE: this is a word2vec model, not FastText; the FastText model named in the
# trailing comment is what this cell originally loaded.
word2vec_google_news300 = api.load('word2vec-google-news-300')   # original api : fasttext-wiki-news-subwords-300 



In [9]:
# Build the vocabulary (token <-> id mapping) from the tokenised documents.
tokenized_documents = [simple_preprocess(text) for text in documents]
dictionary = corpora.Dictionary(tokenized_documents)

# Term-to-term similarity matrix derived from the word vectors, restricted to the vocabulary.
similarity_matrix = word2vec_google_news300.similarity_matrix(
    dictionary,
    tfidf=None,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)

# Bag-of-words vector for every document, in a fixed order.
sentences = [
    dictionary.doc2bow(simple_preprocess(text))
    for text in (doc_trump, doc_election, doc_putin, doc_soup, doc_noodles, doc_dosa)
]
# Keep the individual names as well; later cells refer to them directly.
sent_1, sent_2, sent_3, sent_4, sent_5, sent_6 = sentences

  """


In [10]:
# Compute soft cosine similarity between the first two (politics) documents
print(softcossim(sent_1, sent_2, similarity_matrix))  
# Expected-output note below does not match this run, which printed 0.616090142642847;
# presumably it was recorded with the FastText vectors originally used — TODO confirm.
#> 0.567228632589  

0.616090142642847


  


In [14]:
import numpy as np  
import pandas as pd  
   
def create_soft_cossim_matrix(sentences):
    """Return a DataFrame of pairwise soft cosine similarities.

    Entry (r, c) is ``softcossim(sentences[r], sentences[c], similarity_matrix)``
    rounded to two decimals. The term-similarity matrix is read from the
    notebook-level ``similarity_matrix`` variable.

    Parameters
    ----------
    sentences : list
        Bag-of-words vectors, one per document.

    Returns
    -------
    pandas.DataFrame
        Square table with one row and one column per document.
    """
    n_docs = len(sentences)
    rows = []
    for row in range(n_docs):
        # Similarity of document `row` to every document, in order.
        rows.append([
            round(softcossim(sentences[row], sentences[col], similarity_matrix), 2)
            for col in range(n_docs)
        ])
    return pd.DataFrame(rows)

# Pairwise soft cosine similarity table for all six documents.
create_soft_cossim_matrix(sentences)


Call to deprecated `softcossim` (Function will be removed in 4.0.0, use gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead).


Call to deprecated `softcossim` (Function will be removed in 4.0.0, use gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead).


Call to deprecated `softcossim` (Function will be removed in 4.0.0, use gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead).


Call to deprecated `softcossim` (Function will be removed in 4.0.0, use gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead).


Call to deprecated `softcossim` (Function will be removed in 4.0.0, use gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead).


Call to deprecated `softcossim` (Function will be removed in 4.0.0, use gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead).


Call to deprecated `softcossim` (Function will be removed in 4.0.0, use gensim.si

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.62,0.51,0.18,0.22,0.27
1,0.62,1.0,0.52,0.22,0.16,0.28
2,0.51,0.52,1.0,0.11,0.17,0.19
3,0.18,0.22,0.11,1.0,0.34,0.26
4,0.22,0.16,0.17,0.34,1.0,0.44
5,0.27,0.28,0.19,0.26,0.44,1.0


-> The similarity scores amongst similar documents are higher. (0:2, 0:2) & (3:5, 3:5)