<a href="https://colab.research.google.com/github/hubertwel/paragraph-similarity/blob/main/paragraph-similarity/paragraph_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
import gensim.downloader as api
import json
import inspect
import os
import smart_open
import collections
import random

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

corpus = api.load('text8')
info = api.info()
print(json.dumps(info, indent=4))
print()

# Set file names for train and test data
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')
print(test_data_dir)

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))
print('train_corpus the beginning: ', train_corpus[:2])
print()
print('test_corpus the beginning: ', test_corpus[:2])
print()

# Build a vocabulary
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_corpus)

# Train the model on the corpus
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
print()

# Assessing the model
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

counter = collections.Counter(ranks)
print(counter)
print()

# Testing the model
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

"""
# something else
sentences = ["I ate dinner.", 
       "I like this restaurant", 
       "Brad came to see us.",
       "He loves dancing.",
       "In the end, we all felt like we ate too much.",
       "We all agreed; it was a magnificent evening."]

test_sentence = "I had a tomato soup, chicken and dessert."

# Tokenization of each document
sentence_tk = []
sentences_tk = []
tokenizer = RegexpTokenizer(r'\w+')
for s in sentences:
    sentence_tk = tokenizer.tokenize(s.lower())
    sentences_tk.append(sentence_tk)
print('tokenized sentences: ', sentences_tk)
print()
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(sentences_tk)]
print('tagged data: ', tagged_data)
print()

# Train doc2vec model
# vector_size: dimensionality of the feature vectors.
# window: the maximum distance between the current and predicted word within a sentence.
# min_count: ignores all words with total frequency lower than this.
# epochs: preferred number of passes
model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)

## Print model vocabulary
print('model vocabulary: ', model.wv.vocab)

test_sentence_tk = tokenizer.tokenize(test_sentence.lower())
print('test_sentence_tk: ', test_sentence_tk)

# Find the top 5 most similar sentences from training data
# positive: list of sentences that contribute positively
test_sent_vector = model.infer_vector(test_sentence_tk)
documents_vectors = model.docvecs.most_similar(positive = [test_sent_vector])
print('documents_vectors: ', documents_vectors)
print()
"""

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
{
    "corpora": {
        "semeval-2016-2017-task3-subtaskBC": {
            "num_records": -1,
            "record_format": "dict",
            "file_size": 6344358,
            "reader_code": "https://github.com/RaRe-Technologies/gensim-data/releases/download/semeval-2016-2017-task3-subtaskB-eng/__init__.py",
            "license": "All files released for the task are free for general research use",
            "fields": {
                "2016-train": [
                    "..."
                ],
                "2016-dev": [
                    "..."
                ],
                "2017-test": [
                    "..."
                ],
                "2016-test": [
                    "..."
                ]
            },
            "description": "SemEval 2016 / 2017 Task 3 Subtask B and C datasets contain train+development (317 original questions, 3,169 rela

'\n# something else\nsentences = ["I ate dinner.", \n       "I like this restaurant", \n       "Brad came to see us.",\n       "He loves dancing.",\n       "In the end, we all felt like we ate too much.",\n       "We all agreed; it was a magnificent evening."]\n\ntest_sentence = "I had a tomato soup, chicken and dessert."\n\n# Tokenization of each document\nsentence_tk = []\nsentences_tk = []\ntokenizer = RegexpTokenizer(r\'\\w+\')\nfor s in sentences:\n    sentence_tk = tokenizer.tokenize(s.lower())\n    sentences_tk.append(sentence_tk)\nprint(\'tokenized sentences: \', sentences_tk)\nprint()\ntagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(sentences_tk)]\nprint(\'tagged data: \', tagged_data)\nprint()\n\n# Train doc2vec model\n# vector_size: dimensionality of the feature vectors.\n# window: the maximum distance between the current and predicted word within a sentence.\n# min_count: ignores all words with total frequency lower than this.\n# epochs: preferred number of pass

In [24]:
pip install --pre --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/16/4a/c529159de5a417eb2f574941ccd9f937a47cafffaf1a3e485c6e2a8a4153/gensim-4.0.0b0-cp36-cp36m-manylinux1_x86_64.whl (24.0MB)
[K     |████████████████████████████████| 24.0MB 1.3MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0b0


In [None]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 163kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3
