### 18. How to compute similarity metrics like cosine similarity and soft cosine similarity?
Soft cosine similarity is similar to cosine similarity but in addition considers the semantic relationship between the words through its vector representation.

To compute soft cosines, you will need a word embedding model like Word2Vec or FastText. First, compute the similarity_matrix. Then convert the input sentences to bag-of-words corpus and pass them to the softcossim() along with the similarity matrix.



In [5]:


from gensim.matutils import softcossim
from gensim import corpora
import gensim.downloader as api

fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

sent_1 = 'Sachin is a cricket player and a opening batsman'.split()
sent_2 = 'Dhoni is a cricket player too He is a batsman and keeper'.split()
sent_3 = 'Anand is a chess player'.split()

# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

# Prepare a dictionary and a corpus.
documents = [sent_1, sent_2, sent_3]
dictionary = corpora.Dictionary(documents)

# Convert the sentences into bag-of-words vectors.
sent_1 = dictionary.doc2bow(sent_1)
sent_2 = dictionary.doc2bow(sent_2)
sent_3 = dictionary.doc2bow(sent_3)

# Compute soft cosine similarity
print(softcossim(sent_1, sent_2, similarity_matrix))
#> 0.7868705819999783

print(softcossim(sent_1, sent_3, similarity_matrix))
#> 0.6036445529268666

print(softcossim(sent_2, sent_3, similarity_matrix))
#> 0.60965453519611

NameError: name 'dictionary' is not defined

In [None]:
# Which word from the given list doesn't go with the others?
print(fasttext_model300.doesnt_match(['india', 'australia', 'pakistan', 'china', 'beetroot']))  
#> beetroot

# Compute cosine distance between two words.
print(fasttext_model300.distance('king', 'queen'))
#> 0.22957539558410645


# Compute cosine distances from given word or vector to all words in `other_words`.
print(fasttext_model300.distances('king', ['queen', 'man', 'woman']))
#> [0.22957546 0.465837   0.547001  ]


# Compute cosine similarities
print(fasttext_model300.cosine_similarities(fasttext_model300['king'], 
                                            vectors_all=(fasttext_model300['queen'], 
                                                        fasttext_model300['man'], 
                                                        fasttext_model300['woman'],
                                                        fasttext_model300['queen'] + fasttext_model300['man'])))  
#> array([0.77042454, 0.534163  , 0.45299897, 0.76572555], dtype=float32)
# Note: Queen + Man is very similar to King.

# Get the words closer to w1 than w2
print(glove_model300.words_closer_than(w1='king', w2='kingdom'))
#> ['prince', 'queen', 'monarch']


# Find the top-N most similar words.
print(fasttext_model300.most_similar(positive='king', negative=None, topn=5, restrict_vocab=None, indexer=None))
#> [('queen', 0.63), ('prince', 0.62), ('monarch', 0.59), ('kingdom', 0.58), ('throne', 0.56)]


# Find the top-N most similar words, using the multiplicative combination objective,
print(glove_model300.most_similar_cosmul(positive='king', negative=None, topn=5))
#> [('queen', 0.82), ('prince', 0.81), ('monarch', 0.79), ('kingdom', 0.79), ('throne', 0.78)]