# spaCy

In [1]:
! pip install spacy==3.2.3
! python -m spacy download "en_core_web_lg"

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.4/777.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:04[0m


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
import spacy

# Load the large English pipeline used to parse every text below.
nlp = spacy.load("en_core_web_lg")

# Candidate sentences; each one is run through the pipeline to obtain a Doc.
sentences = (
    "I like cold soda",
    "hot chocolate is filling",
    "ice cream is cold",
    "burger tastes best when hot",
)
docs = [nlp(sentence) for sentence in sentences]

# Query phrase that every candidate is scored against.
sample = nlp("hot meal")

# Print one "<doc> <> <sample> -> <score>" line per candidate, where the score
# comes from Doc.similarity.
for doc in docs:
    print(doc, "<>", sample, "->", doc.similarity(sample))

I like cold soda <> hot meal -> 0.6526761073589249
hot chocolate is filling <> hot meal -> 0.7840665641430987
ice cream is cold <> hot meal -> 0.6564778194706912
burger tastes best when hot <> hot meal -> 0.8263906754007433


# Gensim

In [6]:
! pip install gensim



In [5]:
# Load the 'word2vec-google-news-300' model through gensim's downloader API.
# NOTE(review): despite the name `corpus`, the object is used below as a set of
# word vectors (it exposes `n_similarity`) -- presumably a KeyedVectors; confirm.
from gensim import downloader as api

corpus = api.load("word2vec-google-news-300")



In [7]:
# Score the tokenised query phrase against each tokenised sentence with
# `n_similarity`, printing one score per sentence in the order listed.
query_tokens = ['hot', 'meal']
for sentence_tokens in (
    ['burger', 'tastes', 'best', 'when', 'hot'],
    ['I', 'like', 'cold', 'soda'],
):
    print(corpus.n_similarity(query_tokens, sentence_tokens))


0.674938
0.46843547


# Transformers

In [13]:
! pip install transformers
! pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.0/86.0 kB 1.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting torchvision
  Downloading torchvision-0.14.0-cp39-cp39-macosx_10_9_x86_64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... done
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125925 sha256=de3e7084c61926c188002d96279baf10d912a01f66c611d60976a2bfecfa7a2c
  Stored in directory: /Users/jyotikasingh/Library/Caches/pip/wheels/71/67/06/162a3760c40d74dd40bc855d527008d26341c2b0ecf3e8e11f
Successfully built sentence-transformers
Installing collected packages: sentencepiece, pillow, torchvision, sentence-transformers
Successfully installed pillow-9.3.0 sentence-transformers-2.2.2 sentencepiece-0.1.97 torchvision-0.14.0


In [14]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Sentence-embedding model, followed by the texts under comparison: two
# candidate sentences and the short query phrase they are scored against.
model = SentenceTransformer('stsb-roberta-base')
doc1, doc2, samp = "hot chocolate is filling", "ice cream is cold", "hot meal"

# Embed each text on its own, in the order doc1, doc2, samp.
embedding1, embedding2, samp_embedding = (
    model.encode(text, convert_to_tensor=True) for text in (doc1, doc2, samp)
)

# Cosine similarity of each candidate against the query phrase. The recorded
# run printed 0.3480038046836853 for doc1 and 0.11001470685005188 for doc2.
for candidate_embedding in (embedding1, embedding2):
    cosine_scores = util.cos_sim(candidate_embedding, samp_embedding)
    print("Similarity score:", cosine_scores.item())

Similarity score: 0.3480038046836853
Similarity score: 0.11001470685005188


In [15]:
# Repeat the comparison with a different sentence-embedding model. Relies on
# `SentenceTransformer`, `util`, `doc1`, `doc2` and `samp` already being defined
# by an earlier cell.
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Embed each text on its own, in the order doc1, doc2, samp.
embedding1, embedding2, samp_embedding = (
    bert_model.encode(text, convert_to_tensor=True) for text in (doc1, doc2, samp)
)

# Cosine similarity of each candidate against the query phrase. The recorded
# run printed 0.7925456762313843 for doc1 and 0.30324894189834595 for doc2.
for candidate_embedding in (embedding1, embedding2):
    cosine_scores = util.cos_sim(candidate_embedding, samp_embedding)
    print("Similarity score:", cosine_scores.item())

Similarity score: 0.7925456762313843
Similarity score: 0.30324894189834595
