## Doc2Vec semantic search

A small semantic search engine built with Doc2Vec, using gensim.

In [1]:
import gensim
from gensim import utils as gu
from gensim import models as gm
import logging
import smart_open
import json
import os
import collections

# ---- Doc2Vec tuning knobs -------------------------------------------------
# NOTE(review): in the cells visible in this notebook the Doc2Vec constructor
# and the train() call pass literal values instead of reading these names, so
# changing them here presumably has no effect on the trained model — confirm.
vector_size = 300
window_size = 15
min_count = 1
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 0            # 0 = dbow; 1 = dmpv
worker_count = 1  # number of parallel processes

# ---- File locations -------------------------------------------------------
PRETRAINED_EMB = ""
DOCS_PATH = "../data/News_Category_Dataset_v2.json"  # full dataset (JSON lines)
TRAIN_PATH = "../data/train_news.json"               # written by the split cell
TEST_PATH = "../data/test_news.json"                 # written by the split cell
OUTPUT_PATH = ""

# ---- Logging --------------------------------------------------------------
# INFO level makes gensim report vocabulary-building and training progress.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [2]:
# Build the train/test split and persist both parts as JSON-lines files.
import pandas as pd
import numpy as np

# Fixed seed so the random split (and therefore every downstream result) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Read from the configured dataset path instead of repeating the literal.
data = pd.read_json(DOCS_PATH, lines = True)
# Only the two text columns are used to build documents later on.
data = data[['headline', 'short_description']]
# One uniform draw per row from a dedicated, seeded generator (leaves numpy's
# global random state untouched).
# NOTE(review): `> 0.8` selects roughly 20% of the rows for training and leaves
# roughly 80% for test — confirm this split direction is intended.
msk = np.random.RandomState(SPLIT_SEED).rand(data.shape[0]) > 0.8
train = data[msk]
test = data[~msk]
# Drop the reference to the full frame; only the two splits are needed from here.
data = None
train.to_json(TRAIN_PATH, orient = 'records', lines = True)
test.to_json(TEST_PATH, orient = 'records', lines = True)

In [3]:
# Paths of the Lee corpus files inside the `test/test_data` folder of the
# installed gensim package.
test_data_dir = os.sep.join([gensim.__path__[0], 'test', 'test_data'])
lee_train_file = os.sep.join([test_data_dir, 'lee_background.cor'])
lee_test_file = os.sep.join([test_data_dir, 'lee.cor'])

In [4]:
def read_corpus(fname, tokens_only=False):
    """Stream a JSON-lines news file as tokenised documents.

    Every line is parsed as a JSON object; its 'headline' and
    'short_description' values are joined with a single space and tokenised
    with gensim's ``simple_preprocess``.

    Args:
        fname: Location passed to ``smart_open.smart_open``.
        tokens_only: If true, yield plain token lists. Otherwise yield
            ``TaggedDocument`` objects whose single tag is the zero-based
            line index.

    Yields:
        One token list or one ``TaggedDocument`` per input line.
    """
    with smart_open.smart_open(fname) as handle:
        for line_no, raw_line in enumerate(handle):
            record = json.loads(raw_line)
            text = record['headline'] +" "+ record['short_description']
            tokens = gu.simple_preprocess(text)
            if not tokens_only:
                # Training documents carry their line index as the tag.
                yield gm.doc2vec.TaggedDocument(tokens, [line_no])
            else:
                yield tokens
            

def read_corpus_lee(fname, tokens_only=False):
    """Stream a plain-text corpus file, one document per line.

    The file is opened through ``smart_open.smart_open`` with the
    iso-8859-1 encoding and every line is tokenised with gensim's
    ``simple_preprocess``.

    Args:
        fname: Location passed to ``smart_open.smart_open``.
        tokens_only: If true, yield plain token lists. Otherwise yield
            ``TaggedDocument`` objects whose single tag is the zero-based
            line index.

    Yields:
        One token list or one ``TaggedDocument`` per input line.
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as source:
        for position, text in enumerate(source):
            words = gu.simple_preprocess(text)
            if tokens_only:
                yield words
                continue
            # Training documents carry their line index as the tag.
            yield gm.doc2vec.TaggedDocument(words, [position])

In [5]:
# Materialise both generators in memory: the training corpus is iterated many
# times below (vocabulary scan, every epoch, lookups by index).
train_corpus = [tagged_doc for tagged_doc in read_corpus(TRAIN_PATH)]
# Test documents are kept as bare token lists (no tags).
test_corpus = [tokens for tokens in read_corpus(TEST_PATH, tokens_only=True)]

Instantiate a Doc2Vec model with 300-dimensional vectors and a context window of 12 words. We set the minimum word count to 5 in order to discard words with very few occurrences. The training step further below iterates over the training corpus 100 times.

In [6]:
# Model hyper-parameters are given literally here (they are not read from the
# constants declared in the configuration cell).
model = gm.doc2vec.Doc2Vec(
    dm=1,
    vector_size=300,
    window=12,
    hs=0,
    min_count=5,
    dbow_words=1,
    sample=1e-5,
)
# Alternative tried previously: the same settings with window=10.

In [7]:
# Scan the training corpus to build the model vocabulary, then report how
# many words were kept.
model.build_vocab(train_corpus)
vocab_size = len(model.wv.vocab)
print("Vocabulary final size: {}".format(vocab_size))

2019-02-06 20:22:29,500 : INFO : collecting all words and their counts
2019-02-06 20:22:29,503 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-02-06 20:22:29,571 : INFO : PROGRESS: at example #10000, processed 234551 words (3461205/s), 20176 word types, 10000 tags
2019-02-06 20:22:29,645 : INFO : PROGRESS: at example #20000, processed 485992 words (3413118/s), 28790 word types, 20000 tags
2019-02-06 20:22:29,726 : INFO : PROGRESS: at example #30000, processed 795496 words (3881623/s), 36577 word types, 30000 tags
2019-02-06 20:22:29,812 : INFO : PROGRESS: at example #40000, processed 1130329 words (3903683/s), 43243 word types, 40000 tags
2019-02-06 20:22:29,814 : INFO : collected 43327 word types and 40124 unique tags from a corpus of 40124 examples and 1134346 words
2019-02-06 20:22:29,815 : INFO : Loading a fresh vocabulary
2019-02-06 20:22:29,855 : INFO : effective_min_count=5 retains 14662 unique words (33% of original 43327, drops 28665)
2019-

Vocabulary final size: 14662


In [8]:
# Train on the tagged training corpus for 100 epochs; the %time line magic
# reports how long the call took.
# NOTE(review): start_alpha and end_alpha are both 0.01, so presumably the
# learning rate is held constant rather than decayed — confirm this is intended.
# NOTE(review): epochs=100 is a literal here; it is not read from `train_epoch`.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=100, start_alpha=0.01, end_alpha=0.01)

2019-02-06 20:22:30,873 : INFO : training model with 3 workers on 14662 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=5 window=12
2019-02-06 20:22:31,897 : INFO : EPOCH 1 - PROGRESS: at 48.48% examples, 155878 words/s, in_qsize 6, out_qsize 0
2019-02-06 20:22:32,912 : INFO : EPOCH 1 - PROGRESS: at 95.78% examples, 168068 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:22:32,967 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:22:32,975 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:22:32,987 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:22:32,987 : INFO : EPOCH - 1 : training on 1134346 raw words (357896 effective words) took 2.1s, 170226 effective words/s
2019-02-06 20:22:33,998 : INFO : EPOCH 2 - PROGRESS: at 46.85% examples, 151046 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:22:35,015 : INFO : EPOCH 2 - PROGRESS: at 88.43% examples, 153401 words/s, in_qsize 6, 

2019-02-06 20:22:58,964 : INFO : EPOCH - 13 : training on 1134346 raw words (358345 effective words) took 2.1s, 170380 effective words/s
2019-02-06 20:22:59,973 : INFO : EPOCH 14 - PROGRESS: at 45.22% examples, 145664 words/s, in_qsize 6, out_qsize 0
2019-02-06 20:23:01,004 : INFO : EPOCH 14 - PROGRESS: at 92.80% examples, 161807 words/s, in_qsize 6, out_qsize 0
2019-02-06 20:23:01,128 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:23:01,146 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:23:01,150 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:23:01,151 : INFO : EPOCH - 14 : training on 1134346 raw words (358784 effective words) took 2.2s, 164457 effective words/s
2019-02-06 20:23:02,158 : INFO : EPOCH 15 - PROGRESS: at 48.48% examples, 157696 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:23:03,168 : INFO : EPOCH 15 - PROGRESS: at 93.53% examples, 164903 words/s, in_qsize 5, out_qsize

2019-02-06 20:23:26,689 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:23:26,690 : INFO : EPOCH - 26 : training on 1134346 raw words (358350 effective words) took 2.1s, 171390 effective words/s
2019-02-06 20:23:27,732 : INFO : EPOCH 27 - PROGRESS: at 50.15% examples, 158471 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:23:28,754 : INFO : EPOCH 27 - PROGRESS: at 98.01% examples, 170096 words/s, in_qsize 3, out_qsize 0
2019-02-06 20:23:28,758 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:23:28,765 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:23:28,773 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:23:28,774 : INFO : EPOCH - 27 : training on 1134346 raw words (358224 effective words) took 2.1s, 172388 effective words/s
2019-02-06 20:23:29,815 : INFO : EPOCH 28 - PROGRESS: at 50.15% examples, 158867 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:23:30,

2019-02-06 20:23:53,726 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:23:53,734 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:23:53,735 : INFO : EPOCH - 39 : training on 1134346 raw words (358251 effective words) took 2.1s, 174689 effective words/s
2019-02-06 20:23:54,742 : INFO : EPOCH 40 - PROGRESS: at 48.50% examples, 157436 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:23:55,771 : INFO : EPOCH 40 - PROGRESS: at 95.78% examples, 167886 words/s, in_qsize 6, out_qsize 0
2019-02-06 20:23:55,833 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:23:55,835 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:23:55,840 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:23:55,840 : INFO : EPOCH - 40 : training on 1134346 raw words (358231 effective words) took 2.1s, 170514 effective words/s
2019-02-06 20:23:56,879 : INFO : EPOCH 41 -

2019-02-06 20:24:20,997 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:24:21,010 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:24:21,019 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:24:21,019 : INFO : EPOCH - 52 : training on 1134346 raw words (357356 effective words) took 2.1s, 170575 effective words/s
2019-02-06 20:24:22,026 : INFO : EPOCH 53 - PROGRESS: at 49.33% examples, 161191 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:24:23,030 : INFO : EPOCH 53 - PROGRESS: at 96.54% examples, 171628 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:24:23,079 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:24:23,098 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:24:23,105 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:24:23,106 : INFO : EPOCH - 53 : training on 1134346 raw words (358500 effec

2019-02-06 20:24:48,180 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:24:48,194 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:24:48,202 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:24:48,202 : INFO : EPOCH - 65 : training on 1134346 raw words (358597 effective words) took 2.0s, 175168 effective words/s
2019-02-06 20:24:49,232 : INFO : EPOCH 66 - PROGRESS: at 49.33% examples, 157646 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:24:50,238 : INFO : EPOCH 66 - PROGRESS: at 96.54% examples, 169552 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:24:50,287 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:24:50,298 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:24:50,304 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:24:50,305 : INFO : EPOCH - 66 : training on 1134346 raw words (358293 effec

2019-02-06 20:25:15,435 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:25:15,450 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:25:15,456 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:25:15,457 : INFO : EPOCH - 78 : training on 1134346 raw words (358607 effective words) took 2.1s, 169490 effective words/s
2019-02-06 20:25:16,474 : INFO : EPOCH 79 - PROGRESS: at 47.65% examples, 153409 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:25:17,482 : INFO : EPOCH 79 - PROGRESS: at 95.04% examples, 167580 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:25:17,563 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:25:17,566 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:25:17,571 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:25:17,572 : INFO : EPOCH - 79 : training on 1134346 raw words (358398 effec

2019-02-06 20:25:42,708 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:25:42,717 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:25:42,724 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:25:42,725 : INFO : EPOCH - 91 : training on 1134346 raw words (358841 effective words) took 2.1s, 172588 effective words/s
2019-02-06 20:25:43,738 : INFO : EPOCH 92 - PROGRESS: at 49.33% examples, 159970 words/s, in_qsize 5, out_qsize 0
2019-02-06 20:25:44,742 : INFO : EPOCH 92 - PROGRESS: at 95.03% examples, 167906 words/s, in_qsize 6, out_qsize 0
2019-02-06 20:25:44,812 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-06 20:25:44,824 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-06 20:25:44,830 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-06 20:25:44,831 : INFO : EPOCH - 92 : training on 1134346 raw words (358017 effec

Wall time: 3min 30s


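Sanity check: for each of the first training documents, infer a vector from its words and look up where the document itself ranks among the most similar training documents. Rank 0 means the document is its own best match.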

In [9]:
ranks = []           # position of each document's own tag in its similarity list
second_ranks = []    # the (tag, similarity) pair at index 1 of each list
size_of_test = 1000  # evaluate only the first `size_of_test` training documents

# `doc_id` and `sims` intentionally keep their names: a later cell prints
# details for the last document processed by this loop.
for doc_id, tagged_doc in enumerate(train_corpus[:size_of_test]):
    inferred_vector = model.infer_vector(tagged_doc.words)
    # Ask for every document vector so the document's own tag is guaranteed
    # to appear somewhere in the ranking.
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [tag for tag, _ in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

2019-02-06 20:26:01,621 : INFO : precomputing L2-norms of doc weight vectors


In [10]:
import collections
# Tally how many documents ended up at each self-similarity rank.
counter = collections.Counter()
counter.update(ranks)
counter

Counter({0: 986,
         317: 1,
         29: 1,
         1: 5,
         267: 1,
         2: 1,
         45: 1,
         8: 1,
         3: 1,
         14: 1,
         5167: 1})

In [11]:
# Share of evaluated documents whose own tag came back as the best match
# (rank 0). Divide by the number of documents actually ranked rather than by
# the `size_of_test` cap: when the corpus holds fewer documents than the cap,
# dividing by the cap would under-report the accuracy.
evaluated = len(ranks)
print("Accuracy of {}".format(counter[0] / evaluated if evaluated else float("nan")))

Accuracy of 0.986


This basically means we got something like a good BoW model that also has good semantic search capabilities.

In [12]:
# Show the last document processed by the ranking loop (`doc_id`, `sims`)
# next to its most similar, median and least similar training documents.
query_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): «{}»\n'.format(doc_id, query_text))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

checkpoints = (('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1))
for label, index in checkpoints:
    match = sims[index]  # (tag, similarity) pair
    match_text = ' '.join(train_corpus[match[0]].words)
    print(u'%s %s: «%s»\n' % (label, match, match_text))

Document (999): «don young suggests the holocaust happened because jewish people weren armed it offensive for anyone to manipulate the history of the holocaust to score political points the anti defamation league said»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d300,n5,w12,mc5,s1e-05,t3):

MOST (999, 0.8221635818481445): «don young suggests the holocaust happened because jewish people weren armed it offensive for anyone to manipulate the history of the holocaust to score political points the anti defamation league said»

MEDIAN (27988, -0.020854011178016663): «myths about raising interfaith kids this time of year many interfaith families are preparing to feast on latkes light hanukkah candles at the thanksgiving table and then move on to making christmas cookies but beyond holiday celebrations is it good idea to raise kids in two religions»

LEAST (31500, -0.4947623610496521): «kate moss party look brings back memories photos want more be sure to check out huffpost style on twitte

In [13]:
# Seed the model's random generator so that inference is not initialised
# differently on every run.
model.random.seed(1234)
doc = "migrants found in sinking boat"
# Tokenise the query with the same preprocessing used to build the training
# corpus (gensim's simple_preprocess) instead of a bare split on spaces, so
# query tokens are normalised the same way as the vocabulary.
inf_vec = model.infer_vector(gu.simple_preprocess(doc))

# Ten training documents closest to the inferred query vector.
most = model.docvecs.most_similar([inf_vec], topn=10)

print(most)
for doc_tag, _similarity in most:
    print('Doc({}): «{}»\n'.format(doc_tag, ' '.join(train_corpus[doc_tag].words)))

[(11137, 0.8991857767105103), (18108, 0.881403923034668), (11419, 0.8743504285812378), (10616, 0.865625262260437), (22774, 0.8648278117179871), (36288, 0.8639869093894958), (23645, 0.8635109663009644), (36743, 0.8605598211288452), (25113, 0.8595790863037109), (8796, 0.8590948581695557)]
Doc(11137): «italy coast guard says it rescued over migrants in past days refugees and migrants continue to attempt perilous journey to europe»

Doc(18108): «myanmar says it found more than bangladeshi migrants in single boat»

Doc(11419): «german cardinal makes impassioned plea for migrants from refugee boat someone who lets people drown in the mediterranean also drowns god»

Doc(10616): «four migrants found dead hundreds saved from boat in mediterranean some people were saved in three rescue missions on tuesday»

Doc(22774): «migrant boat to europe sinks some feared dead»

Doc(36288): «malta rhodes and the placid mediterranean the island of rhodes perches on bluff overlooking white washed homes narrow

In [None]:
# TODO: this notebook still lacks an evaluation on never-before-seen documents (e.g. those in test_corpus).