<a href="https://colab.research.google.com/github/hubertwel/paragraph-similarity/blob/main/paragraph-similarity/paragraph_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import numpy as np

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

sentences = ["I ate dinner.", 
       "We had a three-course meal.", 
       "Brad came to dinner with us.",
       "He loves fish tacos.",
       "In the end, we all felt like we ate too much.",
       "We all agreed; it was a magnificent evening."]

test_sentence = "I had pizza and pasta."

# Tokenization of each document
sentence_tk = []
sentences_tk = []
tokenizer = RegexpTokenizer(r'\w+')
for s in sentences:
    sentence_tk = tokenizer.tokenize(s.lower())
    sentences_tk.append(sentence_tk)
print('tokenized sentences: ', sentences_tk)
print()
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(sentences_tk)]
print('tagged data: ', tagged_data)
print()

# Train doc2vec model
# vector_size: dimensionality of the feature vectors.
# window: the maximum distance between the current and predicted word within a sentence.
# min_count: ignores all words with total frequency lower than this.
# epochs: preferred number of passes
model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)

## Print model vocabulary
print('model vocabulary: ', model.wv.vocab)

test_sentence_tk = tokenizer.tokenize(test_sentence.lower())
print('test_sentence_tk: ', test_sentence_tk)

# Find the top 5 most similar sentences from training data
# positive: list of sentences that contribute positively
test_sent_vector = model.infer_vector(test_sentence_tk)
documents_vectors = model.docvecs.most_similar(positive = [test_sent_vector])
print('documents_vectors: ', documents_vectors)
print()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
tokenized sentences:  [['i', 'ate', 'dinner'], ['we', 'had', 'a', 'three', 'course', 'meal'], ['brad', 'came', 'to', 'dinner', 'with', 'us'], ['he', 'loves', 'fish', 'tacos'], ['in', 'the', 'end', 'we', 'all', 'felt', 'like', 'we', 'ate', 'too', 'much'], ['we', 'all', 'agreed', 'it', 'was', 'a', 'magnificent', 'evening']]

tagged data:  [TaggedDocument(words=['i', 'ate', 'dinner'], tags=[0]), TaggedDocument(words=['we', 'had', 'a', 'three', 'course', 'meal'], tags=[1]), TaggedDocument(words=['brad', 'came', 'to', 'dinner', 'with', 'us'], tags=[2]), TaggedDocument(words=['he', 'loves', 'fish', 'tacos'], tags=[3]), TaggedDocument(words=['in', 'the', 'end', 'we', 'all', 'felt', 'like', 'we', 'ate', 'too', 'much'], tags=[4]), TaggedDocument(words=['we', 'all', 'agreed', 'it', 'was', 'a', 'magnificent', 'evening'], tags=[5])]

model vocabulary:  {'i': <gensim.models.keyedvectors.Vo

In [6]:
pip install --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 163kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3
