<a href="https://colab.research.google.com/github/hubertwel/paragraph-similarity/blob/main/paragraph-similarity/paragraph_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# UNDER CONSTRUCTION
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_multiple_whitespaces
#from gensim.parsing.preprocessing import remove_stopwords, strip_short
from gensim.utils import simple_preprocess
from gensim.test.utils import get_tmpfile
from django.contrib.admin.utils import flatten
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn import linear_model, model_selection
from sklearn.metrics import accuracy_score
from statistics import mean
import numpy as np
import os
import smart_open
import csv
import collections
import math
import random
import re
import optuna

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        csv_reader = csv.DictReader(f, quoting=csv.QUOTE_ALL)
        CUSTOM_FILTERS = [strip_tags, strip_multiple_whitespaces]
        for i, row in enumerate(csv_reader):
          line = row['x']
          line = remove_urls(line)
          line_list = preprocess_string(line, CUSTOM_FILTERS)
          line = " ".join(line_list)
          #print('line: ', line)
          tokens = simple_preprocess(line)
          if tokens_only:
            yield tokens
          else:
            # For training data, add tags
            yield TaggedDocument(tokens, [i])

def remove_urls(text):
  text = re.sub(r'https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
  return text

# Define an objective function to be maximized
def objective(trial):
  classifier_name = trial.suggest_categorical("classifier", ["LogReg"])
  # Setup values for the hyperparameters:
  if classifier_name == 'LogReg':
    logreg_c = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    clf = linear_model.LogisticRegression(C=logreg_c)
  # Scoring method:
    k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
    score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring='accuracy')
    accuracy = score.mean()
    return accuracy
    
# Set file names for train and test data
# test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
test_data_dir = os.path.join('/content/drive/MyDrive/Colab Notebooks/data/', 'gouvfr', 'CorpusRandomTwitter')
print("test_data_dir: %s" % test_data_dir)
train_file = os.path.join(test_data_dir, 'randomtweets3.txt')
test_file = os.path.join(test_data_dir, 'randomtweets4.txt')

train_corpus = list(read_corpus(train_file, tokens_only=True))
train_corpus_tagged = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))
test_corpus_tagged = list(read_corpus(test_file))
'''
print('train_corpus the beginning: ', train_corpus[:2])
print(len(train_corpus))
print('train_corpus_tagged the beginning: ', train_corpus_tagged[:2])
print(len(train_corpus))
print('test_corpus the beginning: ', test_corpus[:2])
print(len(test_corpus))
print('test_corpus_tagged the beginning: ', test_corpus_tagged[:2])
print(len(train_corpus))
print()
'''
N = len(train_corpus)
#print("N %d" % N)
vocabulary = flatten(train_corpus)
# Remove duplicates from a list
tmp_list = list(dict.fromkeys(vocabulary))
vocabulary = list(dict.fromkeys(tmp_list))

df = {}
tf = {}
for word in vocabulary:
  # tf: number of occurrences of word in document (paragraph) divided by document length
  tf[word] = [doc.count(word)/len(doc) for doc in train_corpus]
       
  # df: number of documents containing word divided by the number of documents (paragraphs)
  df[word] = sum([word in doc for doc in train_corpus])/N

# 3. after you have your term frequencies and document frequencies, go over each line in the text and 
# calculate its TF-IDF representation, which will be a vector
tfidf_docs = []
for doc_index, doc in enumerate(train_corpus):
  tfidf_doc = []
  for word in vocabulary:
    # a tfidf value of word in this doc (paragraph)
    tfidf_score = tf[word][doc_index] * math.log(1/df[word], 10)
    # a list of tfdif values for this doc (paragraph)
    word_vec_pair = [word, tfidf_score]
    tfidf_doc.append(list(word_vec_pair)) 
  tfidf_docs.append(tfidf_doc)

#print('tfidf_docs len: ', len(tfidf_docs))
#print('tfidf_docs[0] len: ', len(tfidf_docs[0]))

# Build a vocabulary
model = Doc2Vec(dm=0, vector_size=80, min_count=3, epochs=50, hs=1, dbow_words=1)
model.build_vocab(train_corpus_tagged)

# Train the model on the corpus
model.train(train_corpus_tagged, total_examples=model.corpus_count, epochs=model.epochs)
print()
#print('model.wv.vocab: ', model.wv['makes'])
# Assessing the model
ranks = []
first_ranks = []
second_ranks = []
inferred_vectors = []
errors = 0
for doc_id in range(len(train_corpus_tagged)):
    # Supercharge paragraph vectors with tfidf vectors
    paragraph = train_corpus_tagged[doc_id]
    paragraph_vocab = flatten(paragraph)
    for word in vocabulary:
      # obtain the tf_idfidf of a word in a document (paragraph)
      if word in paragraph_vocab:
        ind = [x[0] for x in tfidf_docs[doc_id]].index(word)
        tfidf_vec = tfidf_docs[doc_id][ind][1]
        try:
          model.wv[word]
        except KeyError as err:
          #print(err.args)
          errors += 1
        else:
          max_vec = max(mean(model.wv[word]), tfidf_vec)
          model.wv[word] = np.array([max_vec]*80)
    # end of supercharge
    
    inferred_vector = model.infer_vector(train_corpus_tagged[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    # sanity check (self-similarity)
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    first_ranks.append(sims[0][0])
    inferred_vectors.append(inferred_vector)
        
print('errors noted: '+str(errors))

print('sims: ', sims[:3])
print('ranks: ', ranks)
counter = collections.Counter(ranks)
print(counter)
print()

# Testing the model with one random document
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
print('RANDOM DOCUMENT ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus_tagged[doc_id].words)))

# Compare and print the most/second-most/third-most/median/least similar documents from the train corpus
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 2), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus_tagged[sims[index][0]].words)))

filename = get_tmpfile("parsim_doc2vec_model")
model.save(filename)

# Assessing the model
ranks_test = []
first_ranks_test = []
inferred_vectors_test = []
for doc_id in range(len(test_corpus)):
    # Supercharge paragraph vectors with tfidf vectors
    paragraph = train_corpus_tagged[doc_id]
    paragraph_vocab = flatten(paragraph)
    for word in vocabulary:
      # obtain the tf_idfidf of a word in a document (paragraph)
      if word in paragraph_vocab:
        ind = [x[0] for x in tfidf_docs[doc_id]].index(word)
        tfidf_vec = tfidf_docs[doc_id][ind][1]
        try:
          model.wv[word]
        except KeyError as err:
          #print(err.args)
          errors += 1
        else:
          max_vec = max(mean(model.wv[word]), tfidf_vec)
          model.wv[word] = np.array([max_vec]*80)
    # end of supercharge

    inferred_vector_test = model.infer_vector(test_corpus[doc_id])
    sims_test = model.dv.most_similar([inferred_vector_test], topn=len(model.dv))
    # sanity check (self-similarity)
    rank_test = [docid for docid, sim in sims_test].index(doc_id)
    ranks_test.append(rank_test)
    first_ranks_test.append(sims_test[0][0])
    inferred_vectors_test.append(inferred_vector_test)

print('errors noted: '+str(errors))
print('first_ranks_test: ', first_ranks_test)

# Cross validatiom
tags_array_train = np.array(first_ranks)
vectors_2Darray_train = np.array(inferred_vectors)
tags_array_test = np.array(first_ranks_test)
vectors_2Darray_test = np.array(inferred_vectors_test)
y_train, X_train = tags_array_train, vectors_2Darray_train
y_test, X_test = tags_array_test, vectors_2Darray_test

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)
clf = LogisticRegression(solver='liblinear', max_iter=300, class_weight='balanced', multi_class='auto')
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring='accuracy')
print('score: ', score)
print('Valid acc: {}'.format(round(np.mean(score)*100, 4)))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Test acc: {}'.format(accuracy_score(y_test, y_pred)))
#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))

test_data_dir: /content/drive/MyDrive/Colab Notebooks/data/gouvfr/CorpusRandomTwitter

errors noted: 4774
sims:  [(999, 0.9667202830314636), (677, 0.9605291485786438), (807, 0.9588572978973389)]
ranks:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 6, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 22, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0,

[32m[I 2021-02-13 17:59:44,187][0m A new study created in memory with name: no-name-d4204f26-e6e3-4506-88f2-c9cb7cd4e4f7[0m


errors noted: 9548
first_ranks_test:  [391, 509, 943, 492, 303, 955, 349, 714, 238, 131, 676, 697, 814, 656, 662, 693, 582, 325, 87, 69, 142, 893, 208, 412, 351, 382, 312, 580, 507, 235, 806, 36, 647, 636, 494, 7, 145, 164, 262, 177, 375, 893, 474, 914, 485, 355, 671, 367, 76, 82, 435, 250, 535, 665, 194, 68, 954, 298, 389, 789, 621, 567, 174, 939, 978, 545, 766, 240, 738, 251, 781, 764, 45, 611, 516, 918, 285, 76, 131, 775, 159, 404, 131, 731, 503, 82, 564, 955, 642, 709, 87, 733, 221, 258, 768, 463, 131, 797, 147, 778, 634, 500, 131, 17, 660, 429, 141, 119, 657, 514, 791, 435, 95, 163, 311, 127, 117, 381, 593, 769, 884, 886, 700, 197, 314, 429, 131, 131, 463, 764, 974, 788, 39, 9, 576, 352, 283, 683, 438, 660, 292, 521, 775, 104, 715, 633, 121, 962, 201, 859, 607, 131, 633, 262, 390, 776, 277, 79, 87, 934, 872, 682, 399, 895, 352, 595, 259, 989, 383, 397, 621, 261, 964, 364, 720, 278, 992, 360, 775, 967, 874, 42, 82, 591, 139, 36, 78, 127, 426, 571, 452, 20, 633, 976, 163, 170, 234, 

[32m[I 2021-02-13 18:00:33,412][0m Trial 0 finished with value: 0.10200000000000001 and parameters: {'classifier': 'LogReg', 'logreg_c': 1047.3271530866102}. Best is trial 0 with value: 0.10200000000000001.[0m


score:  [0.09 0.13 0.1  0.06 0.1  0.06 0.07 0.13 0.1  0.07]
Valid acc: 9.1
Test acc: 0.543


In [2]:
pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/91/88/9c53460b97c61bce926dfe9dce51e4887c283416ff89ed30af0b73f44efa/optuna-2.5.0-py3-none-any.whl (287kB)
[K     |█▏                              | 10kB 18.3MB/s eta 0:00:01[K     |██▎                             | 20kB 18.7MB/s eta 0:00:01[K     |███▍                            | 30kB 10.6MB/s eta 0:00:01[K     |████▋                           | 40kB 7.6MB/s eta 0:00:01[K     |█████▊                          | 51kB 4.2MB/s eta 0:00:01[K     |██████▉                         | 61kB 4.7MB/s eta 0:00:01[K     |████████                        | 71kB 4.9MB/s eta 0:00:01[K     |█████████▏                      | 81kB 5.2MB/s eta 0:00:01[K     |██████████▎                     | 92kB 5.0MB/s eta 0:00:01[K     |███████████▍                    | 102kB 4.1MB/s eta 0:00:01[K     |████████████▌                   | 112kB 4.1MB/s eta 0:00:01[K     |█████████████▊                  | 122kB 4.1MB/s eta 0:

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
print('X: ', X)
print('y: ', y)
y = diabetes.target[:150]
lasso = linear_model.Lasso()
print(cross_val_score(lasso, X, y, cv=3))


X:  [[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [-0.05637009 -0.04464164  0.09295276 ...  0.02545259  0.02605609
   0.04034337]
 [-0.06000263  0.05068012  0.01535029 ... -0.00259226 -0.03075121
  -0.0010777 ]
 [-0.04910502  0.05068012 -0.00512814 ...  0.07120998  0.06123791
  -0.03835666]]
y:  [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  9

In [1]:
pip install --pre --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/16/4a/c529159de5a417eb2f574941ccd9f937a47cafffaf1a3e485c6e2a8a4153/gensim-4.0.0b0-cp36-cp36m-manylinux1_x86_64.whl (24.0MB)
[K     |████████████████████████████████| 24.0MB 163kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0b0


In [None]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
