In [1]:
from time import time
from os.path import join

# Directory that holds every serialized artifact read or written by this notebook.
DIR = './objects'

# File names of the individual artifacts; each is joined with DIR at the point of use.
DICTIONARY = 'wiki_gensim_70.dict'  # token dictionary, loaded with corpora.Dictionary.load
CORPUS = 'wiki_corpus_gensim_70.mm'  # corpus file, opened with corpora.MmCorpus
TFIDF_MODEL = 'gensim_70_tfidf.model'  # where the fitted TfidfModel is saved
#TFIDF_VECS = 'gensim_70_tfidf.vecs'
SIMILARITIES = 'wiki_gensim_70_tfidf.similarities'  # where the Similarity index is saved
TITLES = 'titles_gensim_70.pkl'  # pickled titles, looked up by document index

In [2]:
from gensim.models import TfidfModel
from gensim import corpora

# Load the previously saved token dictionary and corpus from DIR.
word_dict = corpora.Dictionary.load(join(DIR, DICTIONARY))
corpus = corpora.MmCorpus(join(DIR, CORPUS))

# Sanity check: report how many documents and dictionary tokens were loaded.
print(len(corpus), "docs and", len(word_dict), "tokens in dictionary.")



5396106 docs and 558522 tokens in dictionary.


# Apply Latent Semantic Indexing (LSI/LSA) or LDA?
Ref: https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python    
Good resource: https://medium.com/nanonets/topic-modeling-with-lsa-psla-lda-and-lda2vec-555ff65b0b05  
(Distributed: https://radimrehurek.com/gensim/distributed.html)

In [5]:
from gensim.models import TfidfModel

start = time()

# Fit a TF-IDF model on the loaded corpus and save it under DIR so it can be reused.
model = TfidfModel(corpus)
model.save(join(DIR, TFIDF_MODEL))

# Elapsed wall-clock time for this cell, converted from seconds to minutes.
print("this cell cost about:", (time() - start) / 60, 'mins')

this cell cost about: 3.726845208803813 mins


In [10]:
from gensim.similarities import Similarity

start = time()

# Indexing the fitted model with the corpus yields the TF-IDF-weighted documents.
# NOTE(review): presumably a lazy wrapper that transforms documents on iteration — confirm against gensim docs.
tfidf_vecs = model[corpus]
#tfidf_vecs.save(join(DIR, TFIDF_VECS))
# The first positional argument is the prefix under which gensim stores the index shard files;
# num_features is set to the dictionary size.
# NOTE(review): the './similarity_corpus/' directory presumably has to exist beforehand — confirm.
similarities = Similarity('./similarity_corpus/', tfidf_vecs, num_features=len(word_dict))

similarities.save(join(DIR, SIMILARITIES))
# To reload later: similarities = Similarity.load(join(DIR, SIMILARITIES))
print("this cell cost about:", (time() - start) / 60, 'mins')

this cell cost about: 51.84820192257563 mins


# Find the most relevant documents

In [11]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import stem_text

def preprocess_sent(text):
    """Normalize a sentence for dictionary lookup.

    Applies three gensim preprocessing steps in order: punctuation
    stripping, stopword removal, then stemming.

    Args:
        text: The raw sentence as a string.

    Returns:
        The processed sentence as a single string.
    """
    without_punctuation = strip_punctuation(text)
    without_stopwords = remove_stopwords(without_punctuation)
    return stem_text(without_stopwords)

# Example claim used as the retrieval query.
claim1 = "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."
# Preprocess the claim, split it on whitespace, and convert the tokens to a
# bag-of-words using the loaded dictionary.
query_bow = word_dict.doc2bow(preprocess_sent(claim1).split())

In [13]:
import numpy as np

# Number of best-matching documents to keep for a claim.
# Must be >= 1: a value of 0 would make the [-TOP_K:] slice return every document.
TOP_K = 50

start = time()

# Weight the query's bag-of-words with the fitted TF-IDF model, then query the
# similarity index to get the scores that are ranked below.
query_doc_tf_idf = model[query_bow]
query_doc_similarities = similarities[query_doc_tf_idf]

# argsort orders ascending, so take the last TOP_K indices and reverse them to
# put the highest-scoring document first.
doc_ids = np.argsort(query_doc_similarities)[-TOP_K:][::-1]

print("It cost around", time() - start, "seconds for one claim.")

It cost around 1.4145798683166504 seconds for one claim.


In [14]:
import pickle

# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a titles file that comes from a trusted source.
# The context manager closes the file handle as soon as the titles are read
# (the bare open() call previously left it open).
with open(join(DIR, TITLES), "rb") as titles_file:
    titles = pickle.load(titles_file)

# Print the title of every retrieved document, in the order given by doc_ids.
for doc_id in doc_ids:
    print("title: ", titles[doc_id])

title:  Coster
title:  Ved_verdens_ende
title:  Waldau
title:  Waldau_-LRB-surname-RRB-
title:  New_Amsterdam_-LRB-TV_series-RRB-
title:  Nikolaj_Coster-Waldau
title:  A_Second_Chance_-LRB-2014_film-RRB-
title:  Nikolaj
title:  Coster_-LRB-disambiguation-RRB-
title:  A_Thousand_Times_Good_Night
title:  The_Baker_-LRB-film-RRB-
title:  Nukaaka_Coster-Waldau
title:  Sven_Coster
title:  Kalle_Coster
title:  De_Coster
title:  Jean-Baptiste_De_Coster_-LRB-Jesuit-RRB-
title:  Saskia_De_Coster
title:  Tracy_Coster
title:  The_Other_Woman_-LRB-2014_film-RRB-
title:  Project_Laurens_Janszoon_Coster
title:  Dick_Coster
title:  Nahetal-Waldau
title:  Samuel_Coster
title:  Harry_Waldau
title:  Small_Crimes
title:  Shot_Caller_-LRB-film-RRB-
title:  Salomon_Coster
title:  Stan_Coster
title:  Arnold_Coster
title:  Gustav_Waldau
title:  Mama_-LRB-2013_film-RRB-
title:  Kunsthallen_Nikolaj
title:  Nikolaj_Plads
title:  Lionel_Lockridge
title:  Nikolaj_Hess
title:  45_-LRB-film-RRB-
title:  Willem_Jaco