## Gensim text retrieval semantic engine

* Gensim text retrieval semantic engine with Latent Semantic Indexing (LSA in TR).
* Dataset is https://www.kaggle.com/rmisra/news-category-dataset with 202372 entries. 

In [1]:
import json
import gensim

In [2]:
DATA_PATH = "../data/News_Category_Dataset_v2.json"
DATA_LEN = 202372

### Corpus preprocessing

In [3]:
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    return [token for token in gensim.utils.simple_preprocess(text) if token not in STOPWORDS]

def iter_news(file):
    for line in open(file):
        line = json.loads(line)['headline'] + json.loads(line)['short_description']
        tokens = tokenize(line)
        yield line, tokens

In [4]:
# stream just tokens
doc_stream = (tokens for _, tokens in iter_news(DATA_PATH))

# build dict
%time id2word_news = gensim.corpora.Dictionary(doc_stream)
print(id2word_news)

Wall time: 17.7 s
Dictionary(168877 unique tokens: ['america', 'children', 'day', 'husband', 'killed']...)


In [5]:
# ignore words that appear in less than 20 documents or more than 10% documents
id2word_news.filter_extremes(no_below=20, no_above=0.1)
print(id2word_news)

Dictionary(14366 unique tokens: ['america', 'children', 'day', 'husband', 'killed']...)


In [6]:
class NewsCorpus():
    
    def __init__(self, file, dictionary):
        self.file = file
        self.dict = dictionary
        
    def __iter__(self):
        self.titles = []
        for title, tokens in iter_news(self.file):
            self.titles.append(title)
            yield self.dict.doc2bow(tokens)
        
            
# create a stream of bag-of-words vectors
news_corpus = NewsCorpus(DATA_PATH, id2word_news)
vector = next(iter(news_corpus))

In [7]:
# store corpus
%time gensim.corpora.MmCorpus.serialize('../data/news_bow.mm', news_corpus)

# store dictionary
id2word_news.save('../data/news.dict')

Wall time: 19.3 s


In [8]:
# load dictionary
id2word_news = gensim.corpora.Dictionary.load('../data/news.dict')

# load corpus
mm_corpus = gensim.corpora.MmCorpus('../data/news_bow.mm')
print(mm_corpus)

MmCorpus(200853 documents, 14366 features, 2548042 non-zero entries)


In [9]:
%time lsi = gensim.models.lsimodel.LsiModel(corpus=mm_corpus, id2word=id2word_news)

Wall time: 46.4 s


In [10]:
lsi.save('../data/lsi_news.model')

In [11]:
lsi = lsi.load('../data/lsi_news.model')

In [12]:
# build the index
from gensim import similarities
%time index = similarities.MatrixSimilarity(lsi[mm_corpus])
index.save('../data/lsi_news.index')

Wall time: 48.3 s


## Search example

In [13]:
import gensim

lsi = gensim.models.lsimodel.LsiModel.load('../data/lsi_news.model')
index = gensim.similarities.MatrixSimilarity.load('../data/lsi_news.index')
dictionary = gensim.corpora.Dictionary.load('../data/news.dict')

In [34]:
# transform doc into lsi vector space (we need the model for this)
doc = "boat"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

# query doc (we need the index for this)
sims = index[vec_lsi]

# sort by similarity
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [35]:
# print most similar docs
print(sims[:10])

[(40531, 0.72241807), (81065, 0.6100451), (40433, 0.60876673), (93370, 0.6077832), (75909, 0.60237277), (78568, 0.59833413), (47225, 0.58883095), (76253, 0.5884743), (25472, 0.5865745), (98799, 0.5863053)]


In [42]:
import json
sim_rank = sims[:10]
sim_ids = [i[0] for i in sim_rank]

fp = open(DATA_PATH)
result = []
for line_idx, line in enumerate(fp):
    if line_idx in sim_ids:
        sim_idx = sim_ids.index(line_idx)
        sim_val = sim_rank[sim_idx]
        line = json.loads(line)
        result.append(
            (sim_val, line['headline'], line['short_description'])
        )

print(result)


[((25472, 0.5865745), 'Blue Whale Found Dead On Northern California Beach Likely Struck By Ship', '79-foot female suffered blunt force trauma and several broken bones.'), ((40433, 0.60876673), 'Thousands Of Snow Geese Thought Dead After Landing On Toxic Mining Pit', 'Berkeley Pit is nearly 700 acres of acidic, deadly water.'), ((40531, 0.72241807), 'Four Migrants Drown Off Coast Of Morocco', 'They were in an inflatable boat.'), ((47225, 0.58883095), 'More Than 130 Bodies Recovered From Migrant Boat Capsized Off Egypt', 'The ship was carrying Africans headed for Italy.'), ((75909, 0.60237277), '6 Infants Drown When Migrant Boat Capsizes Off Greek Island', 'ATHENS, Nov 1 (Reuters) - Eleven migrants including six infants drowned when their boat capsized off the Greek island of'), ((76253, 0.5884743), 'Coast Guard Crew Travels Thousands Of Miles To Rescue 36 Stranded Fishermen', 'The fishermen spent more than 10 hours in skiffs after abandoning ship.'), ((78568, 0.59833413), 'Dozens Of End

In [45]:
result.sort(key=lambda tup:tup[0][1], order= -1)

TypeError: 'order' is an invalid keyword argument for sort()

In [46]:
result

[((98799, 0.5863053), 'About 700 Migrants Rescued Off Coast Of Libya', ''),
 ((25472, 0.5865745),
  'Blue Whale Found Dead On Northern California Beach Likely Struck By Ship',
  '79-foot female suffered blunt force trauma and several broken bones.'),
 ((76253, 0.5884743),
  'Coast Guard Crew Travels Thousands Of Miles To Rescue 36 Stranded Fishermen',
  'The fishermen spent more than 10 hours in skiffs after abandoning ship.'),
 ((47225, 0.58883095),
  'More Than 130 Bodies Recovered From Migrant Boat Capsized Off Egypt',
  'The ship was carrying Africans headed for Italy.'),
 ((78568, 0.59833413),
  'Dozens Of Endangered Seals Wash Up Dead, Starving On California Beaches',
  'The threatened Guadalupe fur seal could be the latest victim of the unusually warm waters in the eastern Pacific Ocean.'),
 ((75909, 0.60237277),
  '6 Infants Drown When Migrant Boat Capsizes Off Greek Island',
  'ATHENS, Nov 1 (Reuters) - Eleven migrants including six infants drowned when their boat capsized off