# Doc Embedding

In [1]:
# obtain the data
!wget http://alt.qcri.org/semeval2017/task1/data/uploads/sts2017.eval.v1.1.zip
!wget http://alt.qcri.org/semeval2017/task1/data/uploads/sts2017.gs.zip

!unzip sts2017.eval.v1.1.zip 
!unzip sts2017.gs.zip



--2023-03-30 07:27:13--  http://alt.qcri.org/semeval2017/task1/data/uploads/sts2017.eval.v1.1.zip
Resolving alt.qcri.org (alt.qcri.org)... 80.76.166.234
Connecting to alt.qcri.org (alt.qcri.org)|80.76.166.234|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://alt.qcri.org/semeval2017/task1/data/uploads/sts2017.eval.v1.1.zip [following]
--2023-03-30 07:27:13--  https://alt.qcri.org/semeval2017/task1/data/uploads/sts2017.eval.v1.1.zip
Connecting to alt.qcri.org (alt.qcri.org)|80.76.166.234|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 87902 (86K) [application/zip]
Saving to: ‘sts2017.eval.v1.1.zip’


2023-03-30 07:27:14 (426 KB/s) - ‘sts2017.eval.v1.1.zip’ saved [87902/87902]

URL transformed to HTTPS due to an HSTS policy
--2023-03-30 07:27:15--  https://alt.qcri.org/semeval2017/task1/data/uploads/sts2017.gs.zip
Resolving alt.qcri.org (alt.qcri.org)... 80.76.166.234
Connecting to alt.qcri.org (alt.qcri.org)|80.76

In [2]:
# load the data

def load_STS_data():
    with open("STS2017.gs/STS.gs.track5.en-en.txt") as f:
        labels = [float(line.strip()) for line in f]
    
    text_a, text_b = [], []
    with open("STS2017.eval.v1.1/STS.input.track5.en-en.txt") as f:
        for line in f:
            line = line.strip().split("\t")
            text_a.append(line[0])
            text_b.append(line[1])
    return text_a, text_b, labels

text_a, text_b, labels = load_STS_data()
text_a[0], text_b[0], labels[0]

('A person is on a baseball team.',
 'A person is playing basketball on a team.',
 2.4)

In [3]:
# some utils
from scipy.stats import spearmanr
def evaluate(predictions, labels):
    print ("spearman's rank correlation", spearmanr(predictions, labels)[0])

import numpy as np
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a,b):
    return dot(a, b)/(norm(a)*norm(b))



In [None]:
# Wordcounts baseline
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec.fit(text_a + text_b)

# encode documents
text_a_encoded = np.array(vec.transform(text_a).todense())
text_b_encoded = np.array(vec.transform(text_b).todense())

# predict cosine similarities
predictions = [cosine_similarity(a,b) for a,b in zip(text_a_encoded, text_b_encoded)]

# evaluate
evaluate(predictions, labels)

spearman's rank correlation 0.6998056665685976


In [None]:
# Wordcounts baseline
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec.fit(text_a + text_b)

# encode documents
text_a_encoded = np.array(vec.transform(text_a).todense())
text_b_encoded = np.array(vec.transform(text_b).todense())

# predict cosine similarities
predictions = [cosine_similarity(a,b) for a,b in zip(text_a_encoded, text_b_encoded)]

# evaluate
evaluate(predictions, labels)

spearman's rank correlation 0.6998056665685976


In [9]:
##TODO train Doc2Vec on the texts in the dataset
##TODO derive the word vectors for each text in the dataset
##TODO compute cosine similarity between the text pairs and evaluate spearman's rank correlation

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk import word_tokenize
nltk.download('punkt')
docs = []

for i in text_a + text_b:
    docs.append(word_tokenize(i))

# we have to encode the unique id for each document
doc_iterator = [TaggedDocument(doc,[i]) for i, doc in enumerate(docs)]
d2v = Doc2Vec(doc_iterator,
              min_count=2,
              window=5,
              vector_size=25,
              sample=1e-4,
              negative=5,
              workers=4,
              max_vocab_size=1000)


text_a_encoded = d2v.dv.vectors[:len(text_a)]
text_b_encoded = d2v.dv.vectors[len(text_a):]

pred = [cosine_similarity(a, b) for a, b in zip(text_a_encoded, text_b_encoded)]

evaluate(pred, labels)

spearman's rank correlation 0.014730844630254609


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
##TODO do the same with embeddings provided by spaCy

import spacy

nlp = spacy.load('en_core_web_sm')

text_a_encoded = [nlp(text).vector for text in text_a]
text_b_encoded = [nlp(text).vector for text in text_b]



In [11]:
predictions = [cosine_similarity(a, b) for a, b in zip(text_a_encoded, text_b_encoded)]
evaluate(predictions, labels)

spearman's rank correlation 0.48796451382976114


In [15]:
! pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8

In [16]:
##TODO do the same with SBERT embeddings

from sentence_transformers import SentenceTransformer
model = "bert-base-nli-mean-tokens"
embedder = SentenceTransformer(model)
text_a_encoded = embedder.encode(text_a)
text_b_encoded = embedder.encode(text_b)

predictions = [cosine_similarity(a, b) for a,b in zip(text_a_encoded, text_b_encoded)]

evaluate(predictions, labels)

Downloading (…)821d1/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading (…)d1/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)01e821d1/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)821d1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1e821d1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

spearman's rank correlation 0.8008164100246977
