<a href="https://radimrehurek.com/gensim/tutorial.html">Tutorials</a><br/>
<a href="https://radimrehurek.com/gensim/apiref.html">API</a>

In [None]:
#conda install -c conda-forge gensim
#for version 3.4.0

In [41]:
import logging

# Configure root logging: timestamp, severity, message.
# Threshold is CRITICAL, so lower-severity records (e.g. gensim's INFO training progress) are not emitted.
logging.basicConfig(
    level=logging.CRITICAL,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [1]:
from gensim.test.utils import common_texts

# Vectorize Corpora

Tokenization, stemming, etc. are not handled by the package; documents must already be preprocessed into lists of tokens.

In [2]:
from gensim import corpora

# Build the vocabulary from an iterable of tokenized documents
# (each document is a list of string tokens); every distinct token gets an integer id.
dictionary = corpora.Dictionary(common_texts)
print('token2id:', dictionary.token2id)  # token -> integer id

# Bag-of-words for one document: (token_id, count) pairs; tokens with zero count are omitted.
sample_bow = dictionary.doc2bow(common_texts[1])
print('doc2bow:', sample_bow)

# Persisting / restoring the vocabulary:
# dictionary.save(pathtofile)
# dictionary = corpora.Dictionary.load(pathtofile)

token2id: {'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}
doc2bow: [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]


Corpus = a list (or any iterable) of bag-of-words vectors, where each vector is a list of `(token_id, count)` tuples

In [3]:
# Convert every tokenized document into its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, common_texts))
print(corpus)

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


# Transformations

In [4]:
from gensim import models

## Tfidf (term frequency–inverse document frequency)

In [5]:
# Fit TF-IDF on the bag-of-words corpus; default settings are used (no hyperparameters to tune here).
# The fitted model maps (token_id, count) pairs to (token_id, tf-idf weight) pairs.
tfidf = models.TfidfModel(corpus)

# Indexing with a single bow vector transforms that one document.
first_doc_weights = tfidf[corpus[0]]
print('applied to one doc:', first_doc_weights)

# Indexing with the whole corpus transforms every document.
print('applied to whole:', '\n'.join(str(doc_weights) for doc_weights in tfidf[corpus]))

applied to one doc: [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
applied to whole: [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


## <a href="https://radimrehurek.com/gensim/models/lsimodel.html#module-gensim.models.lsimodel">lsi (Latent Semantic Indexing)</a>

In [27]:
# LSI accepts either a raw bow corpus or a tf-idf-weighted one; the tf-idf version is used here.
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

# Persisting the model:
# lsi.save(pathtofile)

# Show each topic as a signed, weighted combination of its most important words.
lsi.print_topics(num_topics=2)

[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

## <a href="https://radimrehurek.com/gensim/models/ldamodel.html">lda (Latent Dirichlet Allocation)</a>

In [25]:
# Train a 3-topic LDA model on the raw bag-of-words corpus (LDA takes bow input).
# LDA training is randomized; random_state pins it so Restart & Run All
# reproduces the same topics and document/topic distributions.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=3, random_state=0)

# Topic distribution of a single document: list of (topic_id, probability).
print(lda[corpus[1]])

# Topic distribution of every document in the corpus.
for doc_id, topic_dist in enumerate(lda[corpus]):
    print(doc_id, topic_dist)

# Online training: fold additional documents into the already-trained model.
# The same corpus is reused here only to demonstrate the call; pass new documents in practice.
# (A bare lda.print_topics() used to sit before this line, but its return value was
# discarded mid-cell and showed nothing; the topics are displayed in the next cell.)
lda.update(corpus)

[(0, 0.8963071), (1, 0.048678253), (2, 0.055014614)]
0 [(0, 0.82634884), (1, 0.08513247), (2, 0.08851871)]
1 [(0, 0.89627016), (1, 0.048678327), (2, 0.05505152)]
2 [(0, 0.8572984), (1, 0.06803836), (2, 0.07466328)]
3 [(0, 0.7829753), (1, 0.06865222), (2, 0.14837246)]
4 [(0, 0.81499547), (1, 0.08530275), (2, 0.09970176)]
5 [(0, 0.16784121), (1, 0.65781194), (2, 0.17434685)]
6 [(0, 0.11177107), (1, 0.7646773), (2, 0.123551615)]
7 [(0, 0.084061265), (1, 0.37287787), (2, 0.5430609)]
8 [(0, 0.08876538), (1, 0.09079955), (2, 0.8204351)]


In [21]:
# Display each LDA topic as a weighted combination of its top words (shown as the cell's output).
lda.print_topics()

[(0,
  '0.185*"user" + 0.184*"response" + 0.184*"time" + 0.116*"system" + 0.101*"survey" + 0.100*"computer" + 0.032*"trees" + 0.023*"eps" + 0.023*"human" + 0.018*"graph"'),
 (1,
  '0.284*"graph" + 0.268*"trees" + 0.195*"minors" + 0.106*"survey" + 0.019*"system" + 0.019*"user" + 0.018*"eps" + 0.018*"human" + 0.018*"interface" + 0.018*"response"'),
 (2,
  '0.230*"system" + 0.167*"interface" + 0.162*"human" + 0.162*"eps" + 0.092*"computer" + 0.091*"user" + 0.017*"trees" + 0.016*"time" + 0.016*"graph" + 0.016*"survey"')]

## <a href="https://radimrehurek.com/gensim/models/hdpmodel.html">hdp (Hierarchical Dirichlet Process)</a>

In [24]:
# HDP infers the number of topics from the data, so no num_topics is passed.
# Training is randomized; random_state pins it so Restart & Run All reproduces the same topics.
hdp = models.HdpModel(corpus, id2word=dictionary, random_state=0)

# Show the first 3 topics as weighted combinations of their top words.
hdp.print_topics(3)

[(0,
  '0.426*minors + 0.130*survey + 0.115*human + 0.070*interface + 0.063*computer + 0.052*time + 0.031*trees + 0.027*graph + 0.026*system + 0.025*user'),
 (1,
  '0.292*system + 0.116*user + 0.115*minors + 0.099*graph + 0.098*eps + 0.081*time + 0.077*response + 0.060*computer + 0.028*human + 0.021*survey'),
 (2,
  '0.383*eps + 0.215*interface + 0.061*human + 0.058*survey + 0.054*user + 0.040*trees + 0.036*response + 0.036*minors + 0.036*graph + 0.029*time')]

# Similarities

Goal: given a corpus and a query string, find the similarity of each document w.r.t. the query

In [36]:
from gensim import models
from gensim import similarities

# Fit a 2-topic LSI space directly on the bag-of-words corpus.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

# Embed the query in the LSI space: tokens -> bow -> LSI vector.
# NOTE: 'interaction' is not in the dictionary's token2id, so presumably only
# 'human' and 'computer' contribute to the bow vector - confirm against doc2bow docs.
query = ['human','computer','interaction']
query_bow = dictionary.doc2bow(query)
query_vec = lsi[query_bow]
print(query_vec)

# Project the whole corpus into the LSI space and build an in-memory similarity index.
# For a large corpus, use the similarities.Similarity class instead.
corpus_lsi = lsi[corpus]
index = similarities.MatrixSimilarity(corpus_lsi)
# Persisting / restoring the index:
# index.save()
# index = similarities.MatrixSimilarity.load()

# Query the index: one similarity score per corpus document, in corpus order.
sims = index[query_vec]
print(sims)

[(0, 0.4618210045327153), (1, -0.0700276652789999)]
[ 0.998093    0.93748635  0.9984453   0.9865886   0.90755945 -0.12416792
 -0.10639259 -0.09879464  0.05004176]
