<a href="https://colab.research.google.com/github/hubertwel/paragraph-similarity/blob/main/paragraph-similarity/paragraph_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
# UNDER CONSTRUCTION
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_multiple_whitespaces
from gensim.parsing.preprocessing import remove_stopwords, strip_short
from gensim.parsing.preprocessing import strip_non_alphanum, split_alphanum
from gensim.parsing.preprocessing import strip_punctuation
from gensim.utils import simple_preprocess
from gensim.utils import RULE_KEEP, RULE_DISCARD, RULE_DEFAULT
from gensim.test.utils import get_tmpfile
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize
import numpy as np
import os
import smart_open
import csv
import collections
import random
import re
import optuna

def read_corpus(fname, tokens_only=False):
    """Lazily yield one tokenised document per CSV row of ``fname``.

    The file is opened with ISO-8859-1 decoding and parsed as a CSV with a
    header row; the text of each record is read from the column named ``x``.
    Each text has URLs removed, is lower-cased, stripped of markup tags and
    repeated whitespace, and is finally tokenised with ``simple_preprocess``.

    Args:
        fname: Path (or anything ``smart_open.open`` accepts) of the CSV file.
        tokens_only: When true, yield plain token lists; otherwise yield
            ``TaggedDocument`` objects tagged with the zero-based row index.

    Yields:
        ``list[str]`` or ``TaggedDocument``, one per CSV record.
    """
    # A heavier filter chain was tried earlier and is kept here for reference:
    # [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short, strip_non_alphanum, split_alphanum]
    text_filters = [lambda x: x.lower(), strip_tags, strip_multiple_whitespaces]

    with smart_open.open(fname, encoding="iso-8859-1") as handle:
        records = csv.DictReader(handle, quoting=csv.QUOTE_ALL)
        for doc_index, record in enumerate(records):
            without_urls = remove_urls(record['x'])
            cleaned = " ".join(preprocess_string(without_urls, text_filters))
            tokens = simple_preprocess(cleaned)
            # Training documents carry their row index as the (only) tag.
            yield tokens if tokens_only else TaggedDocument(tokens, [doc_index])

def remove_urls(text):
    """Replace every http/https URL in ``text`` with a single space.

    Only the URL token itself (everything up to the next whitespace) is
    replaced. The previous pattern used a greedy ``.*`` and therefore also
    discarded all text that followed a URL on the same line.

    Args:
        text: Input string, possibly spanning several lines.

    Returns:
        The string with each URL replaced by one space; all other characters
        (including line breaks) are left as they were.
    """
    return re.sub(r'https?://\S+', ' ', text)

def trim_rule(word, count, min_count):
    """Vocabulary trim rule: drop English stop words and words shorter than 3 characters.

    The rule only decides which words enter the vocabulary; the documents
    themselves keep their stop words and short words, which is nicer when
    printing them.

    Args:
        word: Candidate vocabulary word.
        count: Frequency of ``word`` in the corpus (unused here).
        min_count: The model's ``min_count`` threshold (unused here).

    Returns:
        ``RULE_DISCARD`` for stop words and for words with fewer than three
        characters, otherwise ``RULE_DEFAULT`` so that the default handling
        (i.e. ``min_count``) applies.
    """
    # The rule is evaluated once per word, so build the stop-word set a single
    # time and cache it on the function object instead of reloading it per call.
    stop_words = getattr(trim_rule, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        trim_rule._stop_words = stop_words

    # RULE_DISCARD / RULE_DEFAULT are imported by name from gensim.utils at the
    # top of the file; there is no `utils` module object in scope.
    if word in stop_words or len(word) < 3:
        return RULE_DISCARD  # throw out
    return RULE_DEFAULT  # apply default rule, i.e. min_count

# Define an objective function to be maximized
def objective(trial):
    """Optuna objective to be maximised: mean cross-validated accuracy.

    Samples the inverse regularisation strength ``C`` of a logistic regression
    log-uniformly from [1e-10, 1e10] and scores the classifier with shuffled
    10-fold cross-validation on the module-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        Mean accuracy over the folds, or ``None`` if the sampled classifier
        name is not ``'LogReg'`` (currently the only choice offered).
    """
    classifier_name = trial.suggest_categorical("classifier", ["LogReg"])
    if classifier_name != 'LogReg':
        # No other classifier is configured yet.
        return None

    # Hyperparameters for the logistic regression.
    logreg_c = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    clf = linear_model.LogisticRegression(C=logreg_c)

    # Scoring: accuracy averaged over a fixed, shuffled 10-fold split.
    folds = KFold(n_splits=10, shuffle=True, random_state=0)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=folds, n_jobs=-1, scoring='accuracy')
    return fold_scores.mean()
    
# Set file names for train and test data
# test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
# The corpora are read from Google Drive, so the drive has to be mounted at
# /content/drive before this cell is run.
test_data_dir = os.path.join('/content/drive/MyDrive/Colab Notebooks/data/', 'gouvfr', 'CorpusRandomTwitter')
print("test_data_dir: %s" % test_data_dir)
train_file = os.path.join(test_data_dir, 'randomtweets3.txt')
test_file = os.path.join(test_data_dir, 'randomtweets4.txt')

# The training file is read twice: once as plain token lists and once as
# TaggedDocument objects (tag = row index). The test file is tokens only.
train_corpus = list(read_corpus(train_file, tokens_only=True))
train_corpus_tagged = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))

print('train_corpus the beginning: ', train_corpus[:2])
print('train_corpus length %d' % len(train_corpus))
print('train_corpus_tagged the beginning: ', train_corpus_tagged[:2])
print('test_corpus the beginning: ', test_corpus[:2])
print('test_corpus length %d' %len(test_corpus))
print()

# Build a vocabulary
# NOTE(review): per the gensim documentation a trim_rule is only applied during
# build_vocab() and is not stored on the model; passing it to the constructor
# without a corpus looks like it is ignored. Confirm, and pass it to
# build_vocab() if pruning is actually wanted.
model = Doc2Vec(dm=0, vector_size=80, min_count=3, epochs=50, hs=1, dbow_words=1, trim_rule=trim_rule)
model.build_vocab(train_corpus_tagged)

# Train the model on the corpus
model.train(train_corpus_tagged, total_examples=model.corpus_count, epochs=model.epochs)
print()

# Assessing the model: re-infer a vector for every training document and look
# up at which rank the document itself appears among the most similar ones.
ranks = []
first_ranks = []
second_ranks = []
inferred_vectors = []
errors = 0
for doc_id in range(len(train_corpus_tagged)):
    inferred_vector = model.infer_vector(train_corpus_tagged[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    # sanity check (self-similarity)
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    # Tag of the single most similar training document; used as label below.
    first_ranks.append(sims[0][0])
    inferred_vectors.append(inferred_vector)

print('sims: ', sims[:3])
print('ranks: ', ranks)
counter = collections.Counter(ranks)
print(counter)
print()

# Testing the model with one random document
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
# test_corpus holds plain token lists (tokens_only=True); there is no tagged
# variant of the test corpus, so join the tokens directly.
print('RANDOM TEST DOCUMENT ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))

# Compare and print the most/second-most/third-most/median/least similar documents from the train corpus
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 2), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus_tagged[sims[index][0]].words)))

filename = get_tmpfile("parsim_doc2vec_model")
model.save(filename)

# Assessing the model on the test corpus
ranks_test = []
first_ranks_test = []
inferred_vectors_test = []
for doc_id in range(len(test_corpus)):
    inferred_vector_test = model.infer_vector(test_corpus[doc_id])
    sims_test = model.dv.most_similar([inferred_vector_test], topn=len(model.dv))
    # NOTE(review): model.dv only contains training tags, so this looks up the
    # rank of the *training* document that happens to share the index; it is
    # not a self-similarity check and raises ValueError if the test corpus is
    # longer than the training corpus. ranks_test is not used further below.
    rank_test = [docid for docid, sim in sims_test].index(doc_id)
    ranks_test.append(rank_test)
    first_ranks_test.append(sims_test[0][0])
    inferred_vectors_test.append(inferred_vector_test)

print('first_ranks_test: ', first_ranks_test)

# Cross validation
# Features are the inferred document vectors; the label of each document is
# the tag of its most similar training document.
tags_array_train = np.array(first_ranks)
vectors_2Darray_train = np.array(inferred_vectors)
tags_array_test = np.array(first_ranks_test)
vectors_2Darray_test = np.array(inferred_vectors_test)
y_train, X_train = tags_array_train, vectors_2Darray_train
y_test, X_test = tags_array_test, vectors_2Darray_test

# Hyperparameter search; objective() reads X_train / y_train from module scope.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
# NOTE(review): the study result is not used; the classifier below is built
# with fixed settings rather than the best parameters found by the search.
clf = LogisticRegression(solver='liblinear', max_iter=300, class_weight='balanced', multi_class='auto')
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring='accuracy')
print('score: ', score)
print('Validation accuracy: {}'.format(round(np.mean(score)*100, 3)))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test accuracy: {:.3f}".format(accuracy_score(y_test, y_pred)))

test_data_dir: /content/drive/MyDrive/Colab Notebooks/data/gouvfr/CorpusRandomTwitter
train_corpus the beginning:  [['rt', 'americanidol', 'watch', 'as', 'ddlovato', 'gives', 'an', 'amazing', 'performance', 'of', 'her', 'single', 'stonecold', 'demionidol'], ['rt', 'dipti_varun', 'varun', 'dhawan', 'junaid', 'is', 'rookie', 'cop', 'and', 'heâ', 'kind', 'of', 'discovering', 'himself', 'as', 'the', 'movie', 'goes', 'on', 'varun_dvn', 'dishoom']]
train_corpus length 1000
train_corpus_tagged the beginning:  [TaggedDocument(words=['rt', 'americanidol', 'watch', 'as', 'ddlovato', 'gives', 'an', 'amazing', 'performance', 'of', 'her', 'single', 'stonecold', 'demionidol'], tags=[0]), TaggedDocument(words=['rt', 'dipti_varun', 'varun', 'dhawan', 'junaid', 'is', 'rookie', 'cop', 'and', 'heâ', 'kind', 'of', 'discovering', 'himself', 'as', 'the', 'movie', 'goes', 'on', 'varun_dvn', 'dishoom'], tags=[1])]
test_corpus the beginning:  [['live', 'morning', 'news', 'weather', 'and', 'traffic', 'updates',

[32m[I 2021-02-14 00:54:35,851][0m A new study created in memory with name: no-name-8ea7681e-e74a-4dcf-bfd0-f42654a3d763[0m


first_ranks_test:  [139, 763, 530, 492, 994, 450, 349, 529, 238, 131, 676, 697, 814, 656, 662, 693, 582, 168, 87, 77, 142, 893, 208, 412, 340, 82, 312, 50, 507, 235, 786, 36, 647, 42, 788, 7, 145, 496, 124, 177, 545, 893, 498, 914, 485, 355, 671, 367, 76, 82, 435, 127, 535, 281, 194, 68, 413, 298, 389, 789, 621, 567, 82, 939, 978, 545, 845, 240, 738, 251, 781, 764, 96, 270, 22, 692, 285, 76, 131, 546, 994, 738, 399, 336, 503, 82, 553, 844, 647, 709, 87, 733, 221, 483, 768, 463, 131, 481, 147, 778, 634, 242, 131, 17, 149, 103, 141, 612, 657, 514, 791, 435, 95, 163, 229, 921, 117, 994, 78, 769, 647, 886, 700, 562, 314, 340, 131, 131, 742, 764, 974, 788, 165, 9, 576, 352, 557, 683, 438, 660, 292, 572, 775, 722, 715, 764, 121, 962, 201, 859, 607, 131, 845, 117, 659, 974, 277, 79, 87, 934, 872, 682, 399, 811, 352, 993, 259, 76, 414, 397, 621, 261, 964, 364, 720, 278, 992, 360, 775, 967, 628, 507, 812, 591, 199, 854, 909, 127, 426, 571, 452, 32, 764, 208, 163, 170, 770, 604, 430, 369, 795, 3

[32m[I 2021-02-14 00:54:38,244][0m Trial 0 finished with value: 0.03400000000000001 and parameters: {'classifier': 'LogReg', 'logreg_c': 1.2166976553876506e-10}. Best is trial 0 with value: 0.03400000000000001.[0m
[32m[I 2021-02-14 00:55:10,393][0m Trial 1 finished with value: 0.03400000000000001 and parameters: {'classifier': 'LogReg', 'logreg_c': 2.2032212239411023e-06}. Best is trial 0 with value: 0.03400000000000001.[0m
[32m[I 2021-02-14 00:55:56,711][0m Trial 2 finished with value: 0.10700000000000001 and parameters: {'classifier': 'LogReg', 'logreg_c': 30.49364042784589}. Best is trial 2 with value: 0.10700000000000001.[0m
[32m[I 2021-02-14 00:56:35,231][0m Trial 3 finished with value: 0.03400000000000001 and parameters: {'classifier': 'LogReg', 'logreg_c': 7.519448417128716e-06}. Best is trial 2 with value: 0.10700000000000001.[0m
[32m[I 2021-02-14 00:57:20,269][0m Trial 4 finished with value: 0.10600000000000001 and parameters: {'classifier': 'LogReg', 'logreg_c':

score:  [0.12 0.11 0.11 0.13 0.09 0.09 0.03 0.15 0.12 0.1 ]
Validation accuracy: 10.5
Test accuracy: 0.513


In [None]:
# Show the installed scikit-learn version (explicit magic instead of relying on automagic).
%pip show scikit-learn

Name: scikit-learn
Version: 0.22.2.post1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /usr/local/lib/python3.6/dist-packages
Requires: numpy, scipy, joblib
Required-by: yellowbrick, umap-learn, textgenrnn, sklearn, sklearn-pandas, pynndescent, mlxtend, lucid, lightgbm, librosa, imbalanced-learn, fancyimpute


In [None]:
# Install Optuna into the kernel's environment; must run before the cell that imports optuna.
%pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/91/88/9c53460b97c61bce926dfe9dce51e4887c283416ff89ed30af0b73f44efa/optuna-2.5.0-py3-none-any.whl (287kB)
[K     |█▏                              | 10kB 18.3MB/s eta 0:00:01[K     |██▎                             | 20kB 18.7MB/s eta 0:00:01[K     |███▍                            | 30kB 10.6MB/s eta 0:00:01[K     |████▋                           | 40kB 7.6MB/s eta 0:00:01[K     |█████▊                          | 51kB 4.2MB/s eta 0:00:01[K     |██████▉                         | 61kB 4.7MB/s eta 0:00:01[K     |████████                        | 71kB 4.9MB/s eta 0:00:01[K     |█████████▏                      | 81kB 5.2MB/s eta 0:00:01[K     |██████████▎                     | 92kB 5.0MB/s eta 0:00:01[K     |███████████▍                    | 102kB 4.1MB/s eta 0:00:01[K     |████████████▌                   | 112kB 4.1MB/s eta 0:00:01[K     |█████████████▊                  | 122kB 4.1MB/s eta 0:

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
# Small scratch example: 3-fold cross-validation of a Lasso model on the
# first 150 samples of scikit-learn's diabetes dataset.
diabetes = datasets.load_diabetes()
X, y = diabetes.data[:150], diabetes.target[:150]
lasso = linear_model.Lasso()
print('X: ', X)
print('y: ', y)
print(cross_val_score(lasso, X, y, cv=3))


X:  [[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [-0.05637009 -0.04464164  0.09295276 ...  0.02545259  0.02605609
   0.04034337]
 [-0.06000263  0.05068012  0.01535029 ... -0.00259226 -0.03075121
  -0.0010777 ]
 [-0.04910502  0.05068012 -0.00512814 ...  0.07120998  0.06123791
  -0.03835666]]
y:  [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  9

In [None]:
# Upgrade gensim, allowing pre-releases; the Doc2Vec code in this notebook uses the `model.dv` attribute.
%pip install --pre --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/16/4a/c529159de5a417eb2f574941ccd9f937a47cafffaf1a3e485c6e2a8a4153/gensim-4.0.0b0-cp36-cp36m-manylinux1_x86_64.whl (24.0MB)
[K     |████████████████████████████████| 24.0MB 163kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0b0


In [None]:

# Mount Google Drive at /content/drive (Google Colab only). The corpus files
# used by this notebook are read from below this mount point, so this has to
# run before the cell that loads the data.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
