<a href="https://colab.research.google.com/github/hubertwel/paragraph-similarity/blob/main/paragraph-similarity/paragraph_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [57]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_multiple_whitespaces
from gensim.parsing.preprocessing import remove_stopwords, strip_short
from gensim.parsing.preprocessing import strip_non_alphanum, split_alphanum
from gensim.parsing.preprocessing import strip_punctuation
from gensim.utils import simple_preprocess
from gensim.utils import RULE_KEEP, RULE_DISCARD, RULE_DEFAULT
from gensim.test.utils import get_tmpfile
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize
import numpy as np
import os
import smart_open
import csv
import collections
import random
import re
import optuna

def read_corpus(fname, tokens_only=False):
    """Lazily yield one preprocessed document per row of a quoted CSV file.

    The text of each row is taken from its ``x`` column, stripped of URLs via
    ``remove_urls``, lower-cased, cleaned of markup tags and repeated
    whitespace, and finally tokenized with ``simple_preprocess``.

    Args:
        fname: Path (or anything ``smart_open.open`` accepts) of the CSV
            file; it is decoded as ISO-8859-1.
        tokens_only: When true, yield plain token lists. Otherwise yield
            ``TaggedDocument`` objects tagged with the zero-based row index.

    Yields:
        A list of tokens, or a ``TaggedDocument`` wrapping that list.
    """
    # Applied in this order by preprocess_string before tokenization.
    filters = (str.lower, strip_tags, strip_multiple_whitespaces)
    with smart_open.open(fname, encoding="iso-8859-1") as handle:
        rows = csv.DictReader(handle, quoting=csv.QUOTE_ALL)
        for doc_index, record in enumerate(rows):
            without_urls = remove_urls(record['x'])
            normalised = " ".join(preprocess_string(without_urls, filters))
            words = simple_preprocess(normalised)
            # Training data carries a tag (the row index); test data does not.
            yield words if tokens_only else TaggedDocument(words, [doc_index])

def remove_urls(text):
    """Replace every http/https URL in *text* with a single space.

    Only the URL itself (the run of non-whitespace characters starting at
    ``http://`` or ``https://``) is replaced; words that follow a URL on the
    same line are preserved.

    Args:
        text: The raw string to clean.

    Returns:
        The string with each URL substituted by one space character.
    """
    # \S+ stops at the first whitespace, so text after the URL is kept;
    # a greedy ".*" would swallow the remainder of the line.
    return re.sub(r'https?://\S+', ' ', text)

_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def trim_rule(word, count, min_count):
    """Vocabulary trimming rule: drop English stop words and very short words.

    This rule is only used to prune the vocabulary during the current method
    call, so that documents can still be printed with stop words and with
    words of any length.

    Args:
        word: The candidate vocabulary word.
        count: Frequency of the word in the corpus (unused by this rule).
        min_count: The model's minimum-count threshold (unused by this rule).

    Returns:
        ``RULE_DISCARD`` for stop words and words shorter than three
        characters, otherwise ``RULE_DEFAULT`` so the default ``min_count``
        rule applies.
    """
    # The rule is invoked once per vocabulary word, so the stop-word set is
    # cached instead of being rebuilt on every call.
    if word in _english_stop_words() or len(word) < 3:
        return RULE_DISCARD  # throw out
    return RULE_DEFAULT  # apply default rule, i.e. min_count

# Objective function for the Optuna study (higher is better)
def objective(trial):
    """Score one sampled logistic-regression configuration.

    Reads the module-level ``X_train`` and ``y_train`` and evaluates the
    candidate classifier with shuffled 10-fold cross validation.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean cross-validated accuracy of the sampled configuration.
    """
    # Hyperparameters are sampled in a fixed order: penalty, C,
    # fit_intercept, intercept_scaling.
    sampled = {
        'penalty': trial.suggest_categorical("penalty", ['l1', 'l2']),
        'C': trial.suggest_float("C", 5e-1, 15e-1, log=True),
        'fit_intercept': trial.suggest_categorical("fit_intercept", [True, False]),
        'intercept_scaling': trial.suggest_float("intercept_scaling", 1e-1, 2e0, log=True),
    }
    classifier = LogisticRegression(
        solver='liblinear',
        max_iter=300,
        class_weight='balanced',
        multi_class='auto',
        **sampled
    )
    # Scoring: accuracy averaged over the folds.
    folds = KFold(n_splits=10, shuffle=True, random_state=0)
    fold_scores = cross_val_score(
        classifier, X_train, y_train, cv=folds, n_jobs=-1, scoring='accuracy'
    )
    return np.mean(fold_scores)

def main():
  """Train a Doc2Vec model on the tweet corpus, inspect it, and fit a classifier.

  Steps:
    1. Read the train and test corpora from Google Drive.
    2. Build and train a DBOW Doc2Vec model on the tagged train corpus.
    3. Sanity-check the model by ranking each train document against itself.
    4. Print the most/median/least similar train documents for a random
       test document, then save the model to a temporary file.
    5. Label every train and test document with the id of its most similar
       train document and use the inferred vectors as features.
    6. Tune a logistic regression with Optuna, then report cross-validation
       and test accuracy.

  Side effects:
    Sets the module-level ``X_train`` and ``y_train`` (read by ``objective``),
    prints progress to stdout and writes the model to a temporary file.

  Raises:
    ValueError: If either corpus file yields no documents.
  """
  global X_train, y_train

  # Set file names for train and test data
  test_data_dir = os.path.join('/content/drive/MyDrive/Colab Notebooks/data/', 'gouvfr', 'CorpusRandomTwitter')
  print("test_data_dir: %s" % test_data_dir)
  train_file = os.path.join(test_data_dir, 'randomtweets3.txt')
  test_file = os.path.join(test_data_dir, 'randomtweets4.txt')

  # Parse the training file once; the untagged view is derived from the
  # tagged documents instead of reading and preprocessing the file twice.
  train_corpus_tagged = list(read_corpus(train_file))
  train_corpus = [doc.words for doc in train_corpus_tagged]
  test_corpus = list(read_corpus(test_file, tokens_only=True))
  if not train_corpus_tagged or not test_corpus:
    raise ValueError('train and test corpora must each contain at least one document')

  print('train_corpus (the beginning): ', train_corpus[:2])
  print('train_corpus length %d' % len(train_corpus))
  print('train_corpus_tagged (the beginning): ', train_corpus_tagged[:2])
  print('test_corpus (the beginning): ', test_corpus[:2])
  print('test_corpus length %d' %len(test_corpus))
  print()

  # Build the model and its vocabulary
  model = Doc2Vec(dm=0, vector_size=80, min_count=3, epochs=50, hs=1, dbow_words=1, trim_rule=trim_rule)
  model.build_vocab(train_corpus_tagged)

  # Train the model on the train corpus
  model.train(train_corpus_tagged, total_examples=model.corpus_count, epochs=model.epochs)
  print()

  # Assess the model on the train corpus
  ranks = []
  first_ranks = []
  inferred_vectors = []
  for doc_id, doc in enumerate(train_corpus_tagged):
    inferred_vector = model.infer_vector(doc.words)
    # The full ranking is needed here to locate the document's own position.
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    # sanity check (self-similarity)
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    first_ranks.append(sims[0][0])
    inferred_vectors.append(inferred_vector)

  # Shows the ranking of the last train document only.
  print('sims: ', sims[:3])
  print('ranks: ', ranks)
  counter = collections.Counter(ranks)
  print(counter)
  print()

  # Test the model with one random document
  # Pick a random document from the test corpus and infer a vector from the model
  doc_id = random.randrange(len(test_corpus))
  inferred_vector = model.infer_vector(test_corpus[doc_id])
  sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
  print('RANDOM TEST DOCUMENT ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))

  # Compare and print the most/second-most/third-most/median/least similar documents from the train corpus
  print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
  for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 2), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus_tagged[sims[index][0]].words)))

  # Save the model
  filename = get_tmpfile("parsim_doc2vec_model")
  model.save(filename)

  # Assess the model on the independent data set (test corpus)
  first_ranks_test = []
  inferred_vectors_test = []
  for tokens in test_corpus:
    inferred_vector_test = model.infer_vector(tokens)
    # Only the single best match is used, so there is no need to rank
    # the whole train corpus for every test document.
    sims_test = model.dv.most_similar([inferred_vector_test], topn=1)
    first_ranks_test.append(sims_test[0][0])
    inferred_vectors_test.append(inferred_vector_test)

  print('first_ranks_test: ', first_ranks_test)

  # Prepare vectors for cross validation: the label of each document is the
  # id of its most similar train document, the features are its inferred vector.
  tags_array_train = np.array(first_ranks)
  vectors_2Darray_train = np.array(inferred_vectors)
  tags_array_test = np.array(first_ranks_test)
  vectors_2Darray_test = np.array(inferred_vectors_test)
  y_train, X_train = tags_array_train, vectors_2Darray_train
  y_test, X_test = tags_array_test, vectors_2Darray_test

  # Create Optuna study
  study = optuna.create_study(direction="maximize")
  study.optimize(objective, n_trials=30)

  # Cross validation with the best hyperparameters found by the study
  clf = LogisticRegression(penalty=study.best_params["penalty"], C=study.best_params["C"], fit_intercept=study.best_params["fit_intercept"], intercept_scaling=study.best_params["intercept_scaling"], solver='liblinear', max_iter=300, class_weight='balanced', multi_class='auto')
  k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
  score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring='accuracy')
  print('score: ', score)
  print('Validation accuracy: {}'.format(round(np.mean(score)*100, 3)))
  # Refit on the whole train set before scoring the held-out test set
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print("Test accuracy: {:.3f}".format(accuracy_score(y_test, y_pred)))

main()

test_data_dir: /content/drive/MyDrive/Colab Notebooks/data/gouvfr/CorpusRandomTwitter
train_corpus (the beginning):  [['rt', 'americanidol', 'watch', 'as', 'ddlovato', 'gives', 'an', 'amazing', 'performance', 'of', 'her', 'single', 'stonecold', 'demionidol'], ['rt', 'dipti_varun', 'varun', 'dhawan', 'junaid', 'is', 'rookie', 'cop', 'and', 'heâ', 'kind', 'of', 'discovering', 'himself', 'as', 'the', 'movie', 'goes', 'on', 'varun_dvn', 'dishoom']]
train_corpus length 1000
train_corpus_tagged (the beginning):  [TaggedDocument(words=['rt', 'americanidol', 'watch', 'as', 'ddlovato', 'gives', 'an', 'amazing', 'performance', 'of', 'her', 'single', 'stonecold', 'demionidol'], tags=[0]), TaggedDocument(words=['rt', 'dipti_varun', 'varun', 'dhawan', 'junaid', 'is', 'rookie', 'cop', 'and', 'heâ', 'kind', 'of', 'discovering', 'himself', 'as', 'the', 'movie', 'goes', 'on', 'varun_dvn', 'dishoom'], tags=[1])]
test_corpus (the beginning):  [['live', 'morning', 'news', 'weather', 'and', 'traffic', 'upd

[32m[I 2021-02-19 01:19:55,174][0m A new study created in memory with name: no-name-f0a9fffc-0b33-45cb-abaa-486f9497e94b[0m


first_ranks_test:  [391, 763, 542, 636, 436, 450, 349, 714, 238, 131, 676, 697, 814, 656, 275, 46, 582, 325, 87, 77, 142, 983, 208, 412, 340, 382, 964, 169, 594, 235, 529, 800, 647, 42, 494, 802, 145, 496, 262, 223, 545, 893, 686, 914, 50, 355, 671, 367, 76, 82, 106, 250, 535, 665, 347, 68, 413, 298, 389, 789, 621, 567, 82, 939, 64, 545, 766, 67, 682, 251, 781, 764, 96, 270, 516, 918, 285, 76, 131, 546, 994, 502, 399, 336, 503, 82, 294, 844, 642, 709, 87, 733, 221, 483, 768, 463, 131, 481, 147, 778, 634, 242, 131, 17, 317, 429, 141, 906, 657, 514, 131, 106, 95, 163, 797, 247, 117, 994, 593, 769, 647, 886, 636, 197, 544, 901, 131, 131, 463, 764, 225, 788, 939, 9, 576, 352, 557, 683, 438, 660, 457, 264, 775, 500, 715, 633, 177, 962, 201, 225, 748, 131, 507, 117, 390, 776, 277, 79, 87, 270, 707, 682, 399, 895, 352, 595, 259, 285, 383, 397, 621, 261, 330, 364, 720, 278, 223, 360, 775, 967, 874, 507, 82, 591, 139, 36, 909, 127, 426, 83, 452, 20, 925, 87, 285, 170, 234, 604, 430, 369, 795, 9

[32m[I 2021-02-19 01:20:37,671][0m Trial 0 finished with value: 0.10700000000000001 and parameters: {'penalty': 'l2', 'C': 0.6063930030273453, 'fit_intercept': True, 'intercept_scaling': 0.31635679552112306}. Best is trial 0 with value: 0.10700000000000001.[0m
[32m[I 2021-02-19 01:21:21,001][0m Trial 1 finished with value: 0.10700000000000001 and parameters: {'penalty': 'l2', 'C': 0.978983813654303, 'fit_intercept': True, 'intercept_scaling': 0.15724816968031408}. Best is trial 0 with value: 0.10700000000000001.[0m
[32m[I 2021-02-19 01:21:57,071][0m Trial 2 finished with value: 0.002 and parameters: {'penalty': 'l1', 'C': 0.6299645198821744, 'fit_intercept': True, 'intercept_scaling': 0.7136818201828395}. Best is trial 0 with value: 0.10700000000000001.[0m
[32m[I 2021-02-19 01:22:37,648][0m Trial 3 finished with value: 0.10700000000000001 and parameters: {'penalty': 'l2', 'C': 0.5866806026384628, 'fit_intercept': False, 'intercept_scaling': 0.4342934399565916}. Best is trial

score:  [0.14 0.15 0.1  0.06 0.13 0.06 0.09 0.16 0.1  0.11]
Validation accuracy: 11.0
Test accuracy: 0.549


In [None]:
pip install -U scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/e2/4c/6111b9a325f29527d7f262e2ee8c730d354b47a728d955e186dacad57a0d/scikit_learn-0.24.1-cp36-cp36m-manylinux2010_x86_64.whl (22.2MB)
[K     |████████████████████████████████| 22.2MB 66.3MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.1 threadpoolctl-2.1.0


In [2]:
pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/91/88/9c53460b97c61bce926dfe9dce51e4887c283416ff89ed30af0b73f44efa/optuna-2.5.0-py3-none-any.whl (287kB)
[K     |████████████████████████████████| 296kB 9.5MB/s 
Collecting alembic
[?25l  Downloading https://files.pythonhosted.org/packages/a9/53/daab5c96e22e9ed1c9f8ca4e3256e72213ade42d519b6254c32e59610967/alembic-1.5.4.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 13.9MB/s 
[?25hCollecting cliff
[?25l  Downloading https://files.pythonhosted.org/packages/a2/d6/7d9acb68a77acd140be7fececb7f2701b2a29d2da9c54184cb8f93509590/cliff-3.7.0-py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 7.4MB/s 
Collecting colorlog
  Downloading https://files.pythonhosted.org/packages/5e/39/0230290df0519d528d8d0ffdfd900150ed24e0076d13b1f19e279444aab1/colorlog-4.7.2-py2.py3-none-any.whl
Collecting cmaes>=0.6.0
  Downloading https://files.pythonhosted.org/packages/21/a2/21775c7343e7dd345e5e1

In [3]:
pip install --pre --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/16/4a/c529159de5a417eb2f574941ccd9f937a47cafffaf1a3e485c6e2a8a4153/gensim-4.0.0b0-cp36-cp36m-manylinux1_x86_64.whl (24.0MB)
[K     |████████████████████████████████| 24.0MB 1.5MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0b0


In [None]:

# Mount Google Drive at /content/drive; main() reads its corpus files from
# a directory below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
