## Gensim text retrieval semantic engine

* Gensim text retrieval semantic engine with Latent Semantic Indexing (LSA in TR).
* Dataset is https://www.kaggle.com/rmisra/news-category-dataset with 202372 entries. 

In [1]:
import json
import gensim

In [2]:
DATA_PATH = "../data/News_Category_Dataset_v2.json"
DATA_LEN = 202372

### Corpus preprocessing

In [3]:
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    return [token for token in gensim.utils.simple_preprocess(text) if token not in STOPWORDS]

def iter_news(file):
    for line in open(file):
        line = json.loads(line)['headline'] + json.loads(line)['short_description']
        tokens = tokenize(line)
        yield line, tokens

In [4]:
# stream just tokens
doc_stream = (tokens for _, tokens in iter_news(DATA_PATH))

# build dict
%time id2word_news = gensim.corpora.Dictionary(doc_stream)
print(id2word_news)

Wall time: 15.7 s
Dictionary(168877 unique tokens: ['america', 'children', 'day', 'husband', 'killed']...)


In [5]:
# ignore words that appear in less than 20 documents or more than 10% documents
id2word_news.filter_extremes(no_below=20, no_above=0.1)
print(id2word_news)

Dictionary(14366 unique tokens: ['america', 'children', 'day', 'husband', 'killed']...)


In [6]:
class NewsCorpus():
    
    def __init__(self, file, dictionary):
        self.file = file
        self.dict = dictionary
        
    def __iter__(self):
        self.titles = []
        for title, tokens in iter_news(self.file):
            self.titles.append(title)
            yield self.dict.doc2bow(tokens)
        
            
# create a stream of bag-of-words vectors
news_corpus = NewsCorpus(DATA_PATH, id2word_news)
vector = next(iter(news_corpus))

In [7]:
# store corpus
%time gensim.corpora.MmCorpus.serialize('../data/news_bow.mm', news_corpus)

# store dictionary
id2word_news.save('../data/news.dict')

Wall time: 18.8 s


In [8]:
# load dictionary
id2word_news = gensim.corpora.Dictionary.load('../data/news.dict')

# load corpus
mm_corpus = gensim.corpora.MmCorpus('../data/news_bow.mm')
print(mm_corpus)

MmCorpus(200853 documents, 14366 features, 2548042 non-zero entries)


In [9]:
%time tfidf = gensim.models.TfidfModel(mm_corpus, id2word=id2word_news)
tfidf.save('../data/tfidf_news.model')

Wall time: 3.12 s


In [10]:
%time lsi = gensim.models.lsimodel.LsiModel(corpus=tfidf[mm_corpus], id2word=id2word_news)

Wall time: 51.5 s


In [11]:
lsi.save('../data/lsi_news.model')

In [12]:
lsi = lsi.load('../data/lsi_news.model')

In [13]:
# build the index
from gensim import similarities
%time index = similarities.MatrixSimilarity(lsi[mm_corpus])
index.save('../data/lsi_news.index')

Wall time: 51.9 s


## Search example

In [30]:
import gensim

lsi = gensim.models.lsimodel.LsiModel.load('../data/lsi_news.model')
tfidf = gensim.models.TfidfModel.load('../data/tfidf_news.model')
index = gensim.similarities.MatrixSimilarity.load('../data/lsi_news.index')
dictionary = gensim.corpora.Dictionary.load('../data/news.dict')

In [31]:
# transform doc into lsi vector space (we need the model for this)
doc = "illness"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[tfidf[vec_bow]]

# query doc (we need the index for this)
sims = index[vec_lsi]

# sort by similarity
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [26]:
# print most similar docs
print(sims[:10])

[(104044, 0.8559768), (120903, 0.80635357), (199133, 0.80370283), (100521, 0.79226947), (92275, 0.7846557), (96541, 0.75075185), (74773, 0.73616123), (133521, 0.69531834), (125753, 0.69314253), (89698, 0.6643342)]


In [27]:
import json
sim_rank = sims[:10]
sim_ids = [i[0] for i in sim_rank]

fp = open(DATA_PATH)
result = []
for line_idx, line in enumerate(fp):
    if line_idx in sim_ids:
        sim_idx = sim_ids.index(line_idx)
        sim_val = sim_rank[sim_idx]
        line = json.loads(line)
        result.append(
            (sim_val, line['headline'], line['short_description'])
        )

result.sort(key=lambda tup:tup[0][1], reverse = True)

for r in result:
    print("| {} | {} | {} | {} |".format(r[0][1], r[0][0], r[1].replace("|"," "), r[2].replace("|", " ")))


| 0.8559768199920654 | 104044 | Staring Into the Abyss of the Criminalization of Persons With Mental Illness |  |
| 0.8063535690307617 | 120903 | Suspected Seattle Gunman Suffers From Severe Mental Illness: Lawyers |  |
| 0.8037028312683105 | 199133 | Does Depression Exist? | If you call your sadness, irritability, loneliness, disappointments, and overwhelm "the mental disorder of depression," does calling all that pain make it "the mental disorder of depression"? |
| 0.7922694683074951 | 100521 | 'There's No Shame' In Talking About Mental Illness |  |
| 0.7846556901931763 | 92275 | Mental Illness and Identity: Would I Shed My Bipolar Disorder Skin? |  |
| 0.7507518529891968 | 96541 | My Mental Collection |  |
| 0.7361612319946289 | 74773 | The Psychological Toll Of Racism In The Wake Of Mizzou | How writing about racism everyday interacts with my mental illness. |
| 0.695318341255188 | 133521 | How to Get Rid of Secrets? Tell Them | by guest blogger Cristina Negrón It took my own ment