<a href="https://colab.research.google.com/github/hubertwel/paragraph-similarity/blob/main/paragraph-similarity/paragraph_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_multiple_whitespaces
#from gensim.parsing.preprocessing import remove_stopwords, strip_short
from gensim.utils import simple_preprocess
from django.contrib.admin.utils import flatten
#from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
#from sklearn.metrics import roc_auc_score
from sklearn import datasets, svm
import numpy as np
import os
import smart_open
import csv
import collections
import math
import random
import re

def read_corpus(fname, tokens_only=False):
    """Yield one tokenised document per row of the CSV file *fname*.

    The file is opened with ISO-8859-1 encoding and parsed as a CSV with a
    header row; the text of each document is taken from the column named
    ``x``. Each text has URLs removed, HTML-style tags and repeated
    whitespace stripped, and is then tokenised with gensim's
    ``simple_preprocess``.

    Args:
        fname: Path (or any location ``smart_open`` accepts) of the CSV file.
        tokens_only: When true, yield plain token lists; otherwise yield
            ``TaggedDocument`` objects tagged with the zero-based row index.

    Yields:
        A list of tokens, or a ``TaggedDocument`` wrapping that list.
    """
    cleanup_filters = [strip_tags, strip_multiple_whitespaces]
    with smart_open.open(fname, encoding="iso-8859-1") as handle:
        rows = csv.DictReader(handle, quoting=csv.QUOTE_ALL)
        for row_index, record in enumerate(rows):
            text = remove_urls(record['x'])
            text = " ".join(preprocess_string(text, cleanup_filters))
            words = simple_preprocess(text)
            # Training data carries the row index as its document tag.
            yield words if tokens_only else TaggedDocument(words, [row_index])

def remove_urls(text):
    """Return *text* with every http/https URL replaced by a single space.

    Only the URL token itself (the run of non-whitespace characters that
    starts with ``http://`` or ``https://``) is removed, so words that
    follow a link on the same line are preserved.

    Args:
        text: The input string; may contain zero or more URLs.

    Returns:
        The string with each URL replaced by ``' '``.
    """
    # \S+ stops at the first whitespace character; the previous pattern used
    # `.*`, which swallowed the rest of the line after the link.
    return re.sub(r'https?://\S+', ' ', text)

# Set file names for train and test data (stored on the mounted Google Drive)
test_data_dir = os.path.join('/content/drive/MyDrive/Colab Notebooks/data/', 'gouvfr', 'CorpusRandomTwitter')
print("test_data_dir: %s" % test_data_dir)
train_file = os.path.join(test_data_dir, 'randomtweets3.txt')
test_file = os.path.join(test_data_dir, 'randomtweets4.txt')

# The training file is read twice: once as plain token lists (used by the
# hand-rolled TF-IDF code) and once as TaggedDocument objects (used by Doc2Vec).
train_corpus = list(read_corpus(train_file, tokens_only=True))
train_corpus_tagged = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))
print('train_corpus the beginning: ', train_corpus[:2])
print(len(train_corpus))
print('train_corpus_tagged the beginning: ', train_corpus_tagged[:2])
# Report the size of the tagged corpus itself, not of train_corpus again.
print(len(train_corpus_tagged))
print('test_corpus the beginning: ', test_corpus[:2])
print(len(test_corpus))
print()

# Hand-rolled TF-IDF over the training corpus.
N = len(train_corpus)
print("N %d" % N)
# dict.fromkeys keeps first-seen order while dropping duplicates, so one pass
# is enough to turn the flattened token stream into the vocabulary.
vocabulary = list(dict.fromkeys(flatten(train_corpus)))
print('vocabulary: ', vocabulary)

# Count tokens once per document instead of rescanning every document for
# every vocabulary word.
doc_counts = [collections.Counter(doc) for doc in train_corpus]

df = {}
tf = {}
for word in vocabulary:
    # tf: number of occurrences of word in document (paragraph) divided by
    # document length. A document can be empty after preprocessing (e.g. a
    # tweet that was only a URL); its term frequency is defined as 0 rather
    # than dividing by zero.
    tf[word] = [
        counts[word] / len(doc) if doc else 0.0
        for counts, doc in zip(doc_counts, train_corpus)
    ]

    # df: number of documents containing word divided by the number of
    # documents (paragraphs)
    df[word] = sum(word in counts for counts in doc_counts) / N

# After the term frequencies and document frequencies are known, build the
# TF-IDF representation of every document: one vector with a score per
# vocabulary word, using a base-10 logarithm for the inverse document frequency.
idf = {word: math.log(1 / df[word], 10) for word in vocabulary}
tfidf_docs = [
    [tf[word][doc_index] * idf[word] for word in vocabulary]
    for doc_index in range(N)
]

#print('tfidf_docs: ', tfidf_docs)

# Create the Doc2Vec model and let it collect its vocabulary from the tagged
# training documents
model = Doc2Vec(dm=0, vector_size=80, min_count=3, epochs=50, hs=1, dbow_words=1)
model.build_vocab(train_corpus_tagged)

# Fit the model on the tagged training documents
model.train(train_corpus_tagged, total_examples=model.corpus_count, epochs=model.epochs)
print()

# Sanity check: re-infer a vector for every training document and record at
# which position the document itself appears among its nearest neighbours
# (0 means it is its own best match), plus the runner-up match.
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_corpus_tagged):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    ordered_ids = [docid for docid, sim in sims]
    rank = ordered_ids.index(doc_id)
    ranks.append(rank)
    second_ranks.append(sims[1])

print('ranks: ', ranks)
print('second ranks: ', second_ranks)
# Histogram of self-similarity ranks
counter = collections.Counter(ranks)
print(counter)
print()

# Testing the model
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
# doc_id indexes test_corpus (plain token lists), so the query text shown here
# must come from test_corpus; indexing the training documents with it would
# print an unrelated tweet (or fail if the test corpus is the longer one).
print('RANDOM DOCUMENT ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))

# Compare and print the most/second-most/third-most/median/least similar documents from the train corpus
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 2), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    # sims entries are (training document tag, similarity); the tag is the
    # row index assigned when the training corpus was read.
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus_tagged[sims[index][0]].words)))

# Cross-validation of a linear SVM on scikit-learn's digits dataset
X_digits, y_digits = datasets.load_digits(return_X_y=True)
svc = svm.SVC(C=1, kernel='linear')
k_fold = KFold(n_splits=5)
# The folds must be generated from the same data they index. Splitting the
# tweet corpus here produced indices for a different, differently sized
# dataset, so part of the digits data was never used.
# (cross_val_score(svc, X_digits, y_digits, cv=k_fold) yields the same
# per-fold scores in a single call.)
for train_indices, test_indices in k_fold.split(X_digits):
    print('Train: %s | test: %s' % (train_indices, test_indices))
    score = svc.fit(X_digits[train_indices], y_digits[train_indices]).score(X_digits[test_indices], y_digits[test_indices])
    print('score: ', score)


test_data_dir: /content/drive/MyDrive/Colab Notebooks/data/gouvfr/CorpusRandomTwitter
train_corpus the beginning:  [['rt', 'americanidol', 'watch', 'as', 'ddlovato', 'gives', 'an', 'amazing', 'performance', 'of', 'her', 'single', 'stonecold', 'demionidol'], ['rt', 'dipti_varun', 'varun', 'dhawan', 'junaid', 'is', 'rookie', 'cop', 'and', 'heâ', 'kind', 'of', 'discovering', 'himself', 'as', 'the', 'movie', 'goes', 'on', 'varun_dvn', 'dishoom']]
1000
train_corpus_tagged the beginning:  [TaggedDocument(words=['rt', 'americanidol', 'watch', 'as', 'ddlovato', 'gives', 'an', 'amazing', 'performance', 'of', 'her', 'single', 'stonecold', 'demionidol'], tags=[0]), TaggedDocument(words=['rt', 'dipti_varun', 'varun', 'dhawan', 'junaid', 'is', 'rookie', 'cop', 'and', 'heâ', 'kind', 'of', 'discovering', 'himself', 'as', 'the', 'movie', 'goes', 'on', 'varun_dvn', 'dishoom'], tags=[1])]
1000
test_corpus the beginning:  [['live', 'morning', 'news', 'weather', 'and', 'traffic', 'updates', 'fromâ', 'wdsu

In [3]:
# Upgrade gensim to the latest pre-release. The captured output of this cell
# shows gensim 3.6.0 being replaced by 4.0.0b0; the main cell accesses
# document vectors through `model.dv`.
# NOTE(review): presumably this cell has to run before the main cell — confirm.
pip install --pre --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/16/4a/c529159de5a417eb2f574941ccd9f937a47cafffaf1a3e485c6e2a8a4153/gensim-4.0.0b0-cp36-cp36m-manylinux1_x86_64.whl (24.0MB)
[K     |████████████████████████████████| 24.0MB 1.7MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0b0


In [None]:

# Mount Google Drive at /content/drive; the corpus paths used by the main cell
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
