<a href="https://colab.research.google.com/github/hubertwel/paragraph-similarity/blob/main/paragraph-similarity/paragraph_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
# UNDER CONSTRUCTION
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_multiple_whitespaces
#from gensim.parsing.preprocessing import remove_stopwords, strip_short
from gensim.utils import simple_preprocess
from gensim.test.utils import get_tmpfile
from django.contrib.admin.utils import flatten
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import os
import smart_open
import csv
import collections
import math
import random
import re
import optuna

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        csv_reader = csv.DictReader(f, quoting=csv.QUOTE_ALL)
        CUSTOM_FILTERS = [strip_tags, strip_multiple_whitespaces]
        for i, row in enumerate(csv_reader):
          line = row['x']
          line = remove_urls(line)
          line_list = preprocess_string(line, CUSTOM_FILTERS)
          line = " ".join(line_list)
          #print('line: ', line)
          tokens = simple_preprocess(line)
          if tokens_only:
            yield tokens
          else:
            # For training data, add tags
            yield TaggedDocument(tokens, [i])

def remove_urls(text):
  text = re.sub(r'https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
  return text

def obj(trial):
    # C
    svc_c = trial.suggest_loguniform('C', 1e0, 1e2)
    # kernel
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])
    # SVC
    clf = SVC(C=svc_c, kernel=kernel)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # 5-fold cross validation
    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=5)
    accuracy = score.mean()
    return 1.0 - accuracy

# Set file names for train and test data
# test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
test_data_dir = os.path.join('/content/drive/MyDrive/Colab Notebooks/data/', 'gouvfr', 'CorpusRandomTwitter')
print("test_data_dir: %s" % test_data_dir)
train_file = os.path.join(test_data_dir, 'randomtweets3.txt')
test_file = os.path.join(test_data_dir, 'randomtweets4.txt')

train_corpus = list(read_corpus(train_file, tokens_only=True))
train_corpus_tagged = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))
test_corpus_tagged = list(read_corpus(test_file))
'''
print('train_corpus the beginning: ', train_corpus[:2])
print(len(train_corpus))
print('train_corpus_tagged the beginning: ', train_corpus_tagged[:2])
print(len(train_corpus))
print('test_corpus the beginning: ', test_corpus[:2])
print(len(test_corpus))
print('test_corpus_tagged the beginning: ', test_corpus_tagged[:2])
print(len(train_corpus))
print()
'''
N = len(train_corpus)
#print("N %d" % N)
vocabulary = flatten(train_corpus)
# Remove duplicates from a list
tmp_list = list(dict.fromkeys(vocabulary))
vocabulary = list(dict.fromkeys(tmp_list))

df = {}
tf = {}
for word in vocabulary:
  # tf: number of occurrences of word in document (paragraph) divided by document length
  tf[word] = [doc.count(word)/len(doc) for doc in train_corpus]
       
  # df: number of documents containing word divided by the number of documents (paragraphs)
  df[word] = sum([word in doc for doc in train_corpus])/N

# 3. after you have your term frequencies and document frequencies, go over each line in the text and 
# calculate its TF-IDF representation, which will be a vector
tfidf_docs = []
for doc_index, doc in enumerate(train_corpus):
  tfidf_doc = []
  for word in vocabulary:
    # a tfidf value of word in this doc (sentence)
    score = tf[word][doc_index] * math.log(1/df[word], 10)
    # a list of tfdif values for this doc (sentence)
    tfidf_doc.append(score) 
  tfidf_docs.append(tfidf_doc)

#print('tfidf_docs: ', tfidf_docs)

# Build a vocabulary
model = Doc2Vec(dm=0, vector_size=80, min_count=3, epochs=50, hs=1, dbow_words=1)
model.build_vocab(train_corpus_tagged)

# Train the model on the corpus
model.train(train_corpus_tagged, total_examples=model.corpus_count, epochs=model.epochs)
print()

# Assessing the model
ranks = []
first_ranks = []
second_ranks = []
inferred_vectors = []
for doc_id in range(len(train_corpus_tagged)):
    inferred_vector = model.infer_vector(train_corpus_tagged[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    # sanity check (self-similarity)
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    first_ranks.append(sims[0][0])
    #second_ranks.append(sims[1])
    inferred_vectors.append(inferred_vector)
print('sims: ', sims[:3])
print('ranks: ', ranks)
counter = collections.Counter(ranks)
print(counter)
print()

# Testing the model
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
print('RANDOM DOCUMENT ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus_tagged[doc_id].words)))

# Compare and print the most/second-most/third-most/median/least similar documents from the train corpus
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 2), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus_tagged[sims[index][0]].words)))

filename = get_tmpfile("parsim_doc2vec_model")
model.save(filename)

# Assessing the model
ranks2 = []
first_ranks2 = []
inferred_vectors2 = []
for doc_id in range(len(test_corpus)):
    inferred_vector2 = model.infer_vector(test_corpus[doc_id])
    sims2 = model.dv.most_similar([inferred_vector2], topn=len(model.dv))
    # sanity check (self-similarity)
    rank2 = [docid for docid, sim in sims2].index(doc_id)
    ranks2.append(rank2)
    first_ranks2.append(sims2[0][0])
    inferred_vectors2.append(inferred_vector2)

print('first ranks2: ', first_ranks2)

# Cross validatiom
tags_array_train = np.array(first_ranks)
vectors_2Darray_train = np.array(inferred_vectors)
tags_array_test = np.array(first_ranks2)
vectors_2Darray_test = np.array(inferred_vectors2)
y_train, X_train = tags_array_train, vectors_2Darray_train
y_test, X_test = tags_array_test, vectors_2Darray_test

#clf = SVC()
#study = optuna.create_study()
#study.optimize(obj, n_trials=20)
#clf = SVC(C=study.best_params["C"], kernel=study.best_params["kernel"])
#clf = LogisticRegression()
#clf = LogisticRegression(solver='lbfgs', max_iter=450, n_jobs=-1, verbose=1)
clf = LogisticRegression(solver='lbfgs', max_iter=100, class_weight='balanced', multi_class='auto')
#clf = KNeighborsClassifier(n_neighbors=3)
#k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
#sk_fold = StratifiedKFold(n_splits=5) #
k_fold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=-1, scoring='accuracy')
print('score: ', score)
print('Valid acc: {}'.format(round(np.mean(score)*100, 4)))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Test acc: {}'.format(accuracy_score(y_test, y_pred)))
#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('proba: ', clf.predict_proba(X_test)[:,1])

test_data_dir: /content/drive/MyDrive/Colab Notebooks/data/gouvfr/CorpusRandomTwitter

sims:  [(999, 0.9571948051452637), (807, 0.954908013343811), (677, 0.9538499712944031)]
ranks:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 22, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 23, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0,


The least populated class in y has only 1 members, which is less than n_splits=10.


The least populated class in y has only 1 members, which is less than n_splits=10.


The least populated class in y has only 1 members, which is less than n_splits=10.



score:  [0.08 0.14 0.14 0.12 0.13 0.12 0.15 0.16 0.1  0.06 0.11 0.14 0.14 0.1
 0.12 0.11 0.14 0.13 0.09 0.07 0.1  0.15 0.14 0.12 0.12 0.12 0.14 0.13
 0.11 0.06]
Valid acc: 11.8
Test acc: 0.482


In [22]:
pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/91/88/9c53460b97c61bce926dfe9dce51e4887c283416ff89ed30af0b73f44efa/optuna-2.5.0-py3-none-any.whl (287kB)
[K     |█▏                              | 10kB 14.0MB/s eta 0:00:01[K     |██▎                             | 20kB 6.7MB/s eta 0:00:01[K     |███▍                            | 30kB 7.5MB/s eta 0:00:01[K     |████▋                           | 40kB 5.8MB/s eta 0:00:01[K     |█████▊                          | 51kB 4.3MB/s eta 0:00:01[K     |██████▉                         | 61kB 4.8MB/s eta 0:00:01[K     |████████                        | 71kB 5.1MB/s eta 0:00:01[K     |█████████▏                      | 81kB 4.8MB/s eta 0:00:01[K     |██████████▎                     | 92kB 5.1MB/s eta 0:00:01[K     |███████████▍                    | 102kB 4.1MB/s eta 0:00:01[K     |████████████▌                   | 112kB 4.1MB/s eta 0:00:01[K     |█████████████▊                  | 122kB 4.1MB/s eta 0:00

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
print('X: ', X)
print('y: ', y)
y = diabetes.target[:150]
lasso = linear_model.Lasso()
print(cross_val_score(lasso, X, y, cv=3))


X:  [[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [-0.05637009 -0.04464164  0.09295276 ...  0.02545259  0.02605609
   0.04034337]
 [-0.06000263  0.05068012  0.01535029 ... -0.00259226 -0.03075121
  -0.0010777 ]
 [-0.04910502  0.05068012 -0.00512814 ...  0.07120998  0.06123791
  -0.03835666]]
y:  [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  9

In [1]:
pip install --pre --upgrade gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/16/4a/c529159de5a417eb2f574941ccd9f937a47cafffaf1a3e485c6e2a8a4153/gensim-4.0.0b0-cp36-cp36m-manylinux1_x86_64.whl (24.0MB)
[K     |████████████████████████████████| 24.0MB 40.2MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.0b0


In [None]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
