<a href="https://colab.research.google.com/github/hubertwel/paragraph-similarity/blob/main/paragraph-similarity/paragraph_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
# UNDER CONSTRUCTION
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_multiple_whitespaces
#from gensim.parsing.preprocessing import remove_stopwords, strip_short
from gensim.utils import simple_preprocess
from gensim.test.utils import get_tmpfile
from django.contrib.admin.utils import flatten
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn import linear_model, model_selection
from sklearn.metrics import accuracy_score
from statistics import mean
import numpy as np
import os
import smart_open
import csv
import collections
import math
import random
import re
import optuna

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        csv_reader = csv.DictReader(f, quoting=csv.QUOTE_ALL)
        CUSTOM_FILTERS = [strip_tags, strip_multiple_whitespaces]
        for i, row in enumerate(csv_reader):
          line = row['x']
          line = remove_urls(line)
          line_list = preprocess_string(line, CUSTOM_FILTERS)
          line = " ".join(line_list)
          #print('line: ', line)
          tokens = simple_preprocess(line)
          if tokens_only:
            yield tokens
          else:
            # For training data, add tags
            yield TaggedDocument(tokens, [i])

def remove_urls(text):
  text = re.sub(r'https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
  return text

# Define an objective function to be maximized
def objective(trial):
  classifier_name = trial.suggest_categorical("classifier", ["LogReg"])
  # Setup values for the hyperparameters:
  if classifier_name == 'LogReg':
    logreg_c = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    clf = linear_model.LogisticRegression(C=logreg_c)
  # Scoring method:
    k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
    score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring='accuracy')
    accuracy = score.mean()
    return accuracy
    
# Set file names for train and test data
# test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
test_data_dir = os.path.join('/content/drive/MyDrive/Colab Notebooks/data/', 'gouvfr', 'CorpusRandomTwitter')
print("test_data_dir: %s" % test_data_dir)
train_file = os.path.join(test_data_dir, 'randomtweets3.txt')
test_file = os.path.join(test_data_dir, 'randomtweets4.txt')

train_corpus = list(read_corpus(train_file, tokens_only=True))
train_corpus_tagged = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))
test_corpus_tagged = list(read_corpus(test_file))
'''
print('train_corpus the beginning: ', train_corpus[:2])
print(len(train_corpus))
print('train_corpus_tagged the beginning: ', train_corpus_tagged[:2])
print(len(train_corpus))
print('test_corpus the beginning: ', test_corpus[:2])
print(len(test_corpus))
print('test_corpus_tagged the beginning: ', test_corpus_tagged[:2])
print(len(train_corpus))
print()
'''
N = len(train_corpus)
#print("N %d" % N)
vocabulary = flatten(train_corpus)
# Remove duplicates from a list
tmp_list = list(dict.fromkeys(vocabulary))
vocabulary = list(dict.fromkeys(tmp_list))

df = {}
tf = {}
for word in vocabulary:
  # tf: number of occurrences of word in document (paragraph) divided by document length
  tf[word] = [doc.count(word)/len(doc) for doc in train_corpus]
       
  # df: number of documents containing word divided by the number of documents (paragraphs)
  df[word] = sum([word in doc for doc in train_corpus])/N

# 3. after you have your term frequencies and document frequencies, go over each line in the text and 
# calculate its TF-IDF representation, which will be a vector
tfidf_docs = []
for doc_index, doc in enumerate(train_corpus):
  tfidf_doc = []
  for word in vocabulary:
    # a tfidf value of word in this doc (paragraph)
    tfidf_score = tf[word][doc_index] * math.log(1/df[word], 10)
    # a list of tfdif values for this doc (paragraph)
    word_vec_pair = [word, tfidf_score]
    tfidf_doc.append(list(word_vec_pair)) 
  tfidf_docs.append(tfidf_doc)

#print('tfidf_docs len: ', len(tfidf_docs))
#print('tfidf_docs[0] len: ', len(tfidf_docs[0]))

# Build a vocabulary
model = Doc2Vec(dm=0, vector_size=80, min_count=3, epochs=50, hs=1, dbow_words=1)
model.build_vocab(train_corpus_tagged)

# Train the model on the corpus
model.train(train_corpus_tagged, total_examples=model.corpus_count, epochs=model.epochs)
print()
#print('model.wv.vocab: ', model.wv['makes'])
# Assessing the model
ranks = []
first_ranks = []
second_ranks = []
inferred_vectors = []
errors = 0
for doc_id in range(len(train_corpus_tagged)):
    # Supercharge paragraph vectors with tfidf vectors
    for word in vocabulary:
      # obtain the tf_idfidf of a word in a document (paragraph)
      ind = [x[0] for x in tfidf_docs[doc_id]].index(word)
      tfidf_vec = tfidf_docs[doc_id][ind][1]
      try:
        model.wv[word]
      except KeyError as err:
        #print(err.args)
        errors += 1
      else:
        max_vec = max(mean(model.wv[word]), tfidf_vec)
        model.wv[word] = np.array([max_vec]*80)
    # end of supercharge
    
    inferred_vector = model.infer_vector(train_corpus_tagged[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    # sanity check (self-similarity)
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    first_ranks.append(sims[0][0])
    inferred_vectors.append(inferred_vector)
        
print('errors noted: '+str(errors))

print('sims: ', sims[:3])
print('ranks: ', ranks)
counter = collections.Counter(ranks)
print(counter)
print()

# Testing the model with one random document
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
print('RANDOM DOCUMENT ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus_tagged[doc_id].words)))

# Compare and print the most/second-most/third-most/median/least similar documents from the train corpus
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 2), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus_tagged[sims[index][0]].words)))

filename = get_tmpfile("parsim_doc2vec_model")
model.save(filename)

# Assessing the model
ranks_test = []
first_ranks_test = []
inferred_vectors_test = []
for doc_id in range(len(test_corpus)):
    # Supercharge paragraph vectors with tfidf vectors
    for word in vocabulary:
      # obtain the tf_idfidf of a word in a document (paragraph)
      ind = [x[0] for x in tfidf_docs[doc_id]].index(word)
      tfidf_vec = tfidf_docs[doc_id][ind][1]
      try:
        model.wv[word]
      except KeyError as err:
        #print(err.args)
        errors += 1
      else:
        max_vec = max(mean(model.wv[word]), tfidf_vec)
        model.wv[word] = np.array([max_vec]*80)
    # end of supercharge
    
    inferred_vector_test = model.infer_vector(test_corpus[doc_id])
    sims_test = model.dv.most_similar([inferred_vector_test], topn=len(model.dv))
    # sanity check (self-similarity)
    rank_test = [docid for docid, sim in sims_test].index(doc_id)
    ranks_test.append(rank_test)
    first_ranks_test.append(sims_test[0][0])
    inferred_vectors_test.append(inferred_vector_test)

print('first_ranks_test: ', first_ranks_test)

# Cross validatiom
tags_array_train = np.array(first_ranks)
vectors_2Darray_train = np.array(inferred_vectors)
tags_array_test = np.array(first_ranks_test)
vectors_2Darray_test = np.array(inferred_vectors_test)
y_train, X_train = tags_array_train, vectors_2Darray_train
y_test, X_test = tags_array_test, vectors_2Darray_test

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)
clf = LogisticRegression(solver='liblinear', max_iter=300, class_weight='balanced', multi_class='auto')
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring='accuracy')
print('score: ', score)
print('Valid acc: {}'.format(round(np.mean(score)*100, 4)))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Test acc: {}'.format(accuracy_score(y_test, y_pred)))
#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))

test_data_dir: /content/drive/MyDrive/Colab Notebooks/data/gouvfr/CorpusRandomTwitter

errors noted: 4149000
sims:  [(807, 0.9593755602836609), (999, 0.9565048217773438), (677, 0.9542163014411926)]
ranks:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 6, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 19, 0, 0, 0, 0, 0, 0, 

[32m[I 2021-02-13 00:54:10,014][0m A new study created in memory with name: no-name-ebde6c28-440e-4b91-97e5-fe696bcdc580[0m


first_ranks_test:  [391, 857, 530, 636, 994, 288, 349, 714, 238, 131, 676, 37, 814, 656, 662, 693, 597, 168, 87, 625, 142, 893, 208, 412, 340, 382, 312, 580, 594, 235, 786, 36, 647, 42, 494, 7, 145, 164, 262, 733, 545, 893, 536, 914, 485, 355, 671, 252, 76, 82, 206, 127, 535, 665, 194, 68, 954, 119, 389, 789, 621, 567, 174, 939, 64, 545, 738, 240, 738, 251, 781, 764, 96, 270, 516, 345, 285, 76, 131, 546, 155, 404, 131, 731, 503, 82, 294, 918, 552, 709, 578, 733, 221, 258, 978, 924, 131, 481, 147, 778, 634, 194, 131, 17, 391, 103, 141, 906, 657, 514, 791, 206, 95, 163, 797, 247, 117, 381, 78, 769, 647, 886, 700, 562, 314, 340, 131, 131, 463, 764, 974, 788, 165, 9, 576, 352, 557, 683, 438, 660, 457, 584, 775, 683, 715, 925, 177, 334, 201, 86, 369, 131, 454, 142, 659, 974, 614, 453, 87, 270, 425, 682, 131, 182, 352, 993, 259, 989, 414, 397, 621, 261, 964, 364, 720, 278, 992, 360, 775, 967, 628, 507, 545, 591, 206, 854, 909, 127, 426, 571, 452, 20, 633, 208, 318, 170, 234, 604, 430, 369, 7

[32m[I 2021-02-13 00:55:10,515][0m Trial 0 finished with value: 0.11600000000000002 and parameters: {'classifier': 'LogReg', 'logreg_c': 8484.25091686996}. Best is trial 0 with value: 0.11600000000000002.[0m


score:  [0.13 0.14 0.13 0.1  0.11 0.08 0.12 0.2  0.08 0.09]
Valid acc: 11.8
Test acc: 0.534


In [None]:
pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/91/88/9c53460b97c61bce926dfe9dce51e4887c283416ff89ed30af0b73f44efa/optuna-2.5.0-py3-none-any.whl (287kB)
[K     |████████████████████████████████| 296kB 4.9MB/s 
[?25hCollecting alembic
[?25l  Downloading https://files.pythonhosted.org/packages/a9/53/daab5c96e22e9ed1c9f8ca4e3256e72213ade42d519b6254c32e59610967/alembic-1.5.4.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 6.9MB/s 
Collecting cliff
[?25l  Downloading https://files.pythonhosted.org/packages/0f/8f/3c74fa4b6c3db1051b495385f5302fc5d5aa0f180d40ce3e9a13c82f8c82/cliff-3.6.0-py3-none-any.whl (79kB)
[K     |████████████████████████████████| 81kB 6.3MB/s 
[?25hCollecting colorlog
  Downloading https://files.pythonhosted.org/packages/5e/39/0230290df0519d528d8d0ffdfd900150ed24e0076d13b1f19e279444aab1/colorlog-4.7.2-py2.py3-none-any.whl
Collecting cmaes>=0.6.0
  Downloading https://files.pythonhosted.org/packages/21/a2/21775c7343e7dd34

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
print('X: ', X)
print('y: ', y)
y = diabetes.target[:150]
lasso = linear_model.Lasso()
print(cross_val_score(lasso, X, y, cv=3))


X:  [[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [-0.05637009 -0.04464164  0.09295276 ...  0.02545259  0.02605609
   0.04034337]
 [-0.06000263  0.05068012  0.01535029 ... -0.00259226 -0.03075121
  -0.0010777 ]
 [-0.04910502  0.05068012 -0.00512814 ...  0.07120998  0.06123791
  -0.03835666]]
y:  [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  9

In [None]:
pip install --pre --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/16/4a/c529159de5a417eb2f574941ccd9f937a47cafffaf1a3e485c6e2a8a4153/gensim-4.0.0b0-cp36-cp36m-manylinux1_x86_64.whl (24.0MB)
[K     |████████████████████████████████| 24.0MB 8.2MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0b0


In [None]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
