<a href="https://radimrehurek.com/gensim/tutorial.html">Tutorials</a><br/>
<a href="https://radimrehurek.com/gensim/apiref.html">API</a>

In [1]:
# !pip install gensim
import gensim
gensim.__version__

'3.4.0'

In [41]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.CRITICAL)

In [1]:
from gensim.test.utils import common_texts

# Preprocess

## Tokenize

In [25]:
import gensim
gensim.utils.simple_preprocess('This is a sentence, right?', deacc=True)# deacc=True removes punctuations

['this', 'is', 'sentence', 'right']

## Vectorize Corpora

In [54]:
from gensim import corpora
dictionary = corpora.Dictionary(common_texts) #inputs iterator (list, file object, ...) of words
print('dictionary[0]:', dictionary[0]) #word mapped to just id
print('token2id:', dictionary.token2id) #word mapped to just id
print('doc2bow:', dictionary.doc2bow(common_texts[1])) #tuple (id,freq), ignoring those with 0 freq
# dictionary.save(pathtofile)
# dictionary=corpora.Dictionary.load(pathtofile)

dictionary[0]: computer
token2id: {'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}
doc2bow: [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]


Corpus = list (iterator) of bow (list of tuples)

In [3]:
corpus=[dictionary.doc2bow(linelist) for linelist in common_texts]
print(corpus)

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


# Transformations

In [4]:
from gensim import models

## Tfidf (term frequency–inverse document frequency)

In [5]:
tfidf = models.TfidfModel(corpus) #transforms from bow (id,freq) to (id,tf x idf)  -- has no params
print('applied to one doc:', tfidf[corpus[0]])
print('applied to whole:', '\n'.join(map(str,tfidf[corpus])))

applied to one doc: [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
applied to whole: [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


## <a href="https://radimrehurek.com/gensim/models/lsimodel.html#module-gensim.models.lsimodel">lsi (Latent Semantic Indexing)</a>

In [27]:
lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2) #inputs either bow or tfidf model
lsi.print_topics(2) #print words 'belonging' to each topic
# lsi.save(pathtofile)

[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

## <a href="https://radimrehurek.com/gensim/models/ldamodel.html">lda (Latent Dirichlet Allocation)</a>

In [25]:
lda=models.LdaModel(corpus,id2word=dictionary,num_topics=3) #inputs bow
print(lda[corpus[1]]) #distribution of topics
for i,l in enumerate(lda[corpus]): #distribution of topics for all doc
    print(i,l)
lda.print_topics()
lda.update(corpus) #update (online training) with more data

[(0, 0.8963071), (1, 0.048678253), (2, 0.055014614)]
0 [(0, 0.82634884), (1, 0.08513247), (2, 0.08851871)]
1 [(0, 0.89627016), (1, 0.048678327), (2, 0.05505152)]
2 [(0, 0.8572984), (1, 0.06803836), (2, 0.07466328)]
3 [(0, 0.7829753), (1, 0.06865222), (2, 0.14837246)]
4 [(0, 0.81499547), (1, 0.08530275), (2, 0.09970176)]
5 [(0, 0.16784121), (1, 0.65781194), (2, 0.17434685)]
6 [(0, 0.11177107), (1, 0.7646773), (2, 0.123551615)]
7 [(0, 0.084061265), (1, 0.37287787), (2, 0.5430609)]
8 [(0, 0.08876538), (1, 0.09079955), (2, 0.8204351)]


In [21]:
lda.print_topics()

[(0,
  '0.185*"user" + 0.184*"response" + 0.184*"time" + 0.116*"system" + 0.101*"survey" + 0.100*"computer" + 0.032*"trees" + 0.023*"eps" + 0.023*"human" + 0.018*"graph"'),
 (1,
  '0.284*"graph" + 0.268*"trees" + 0.195*"minors" + 0.106*"survey" + 0.019*"system" + 0.019*"user" + 0.018*"eps" + 0.018*"human" + 0.018*"interface" + 0.018*"response"'),
 (2,
  '0.230*"system" + 0.167*"interface" + 0.162*"human" + 0.162*"eps" + 0.092*"computer" + 0.091*"user" + 0.017*"trees" + 0.016*"time" + 0.016*"graph" + 0.016*"survey"')]

## <a href="https://radimrehurek.com/gensim/models/hdpmodel.html">hdp (Hierarchical Dirichlet Process)</a>

In [24]:
hdp = models.HdpModel(corpus, id2word=dictionary)
hdp.print_topics(3)

[(0,
  '0.426*minors + 0.130*survey + 0.115*human + 0.070*interface + 0.063*computer + 0.052*time + 0.031*trees + 0.027*graph + 0.026*system + 0.025*user'),
 (1,
  '0.292*system + 0.116*user + 0.115*minors + 0.099*graph + 0.098*eps + 0.081*time + 0.077*response + 0.060*computer + 0.028*human + 0.021*survey'),
 (2,
  '0.383*eps + 0.215*interface + 0.061*human + 0.058*survey + 0.054*user + 0.040*trees + 0.036*response + 0.036*minors + 0.036*graph + 0.029*time')]

# Similarities

Goal: with a courpus and a query string, find the simiarities of each doc w.r.t. the query

In [36]:
from gensim import models
from gensim import similarities

#build lsi space from the corpus
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

#embed query in lsi space
query = ['human','computer','interaction']
query_vec = lsi[dictionary.doc2bow(query)]  
print(query_vec)

# transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus]) #or use similarities.Similarity class for large corpus
# index.save()
# index=similarities.MatrixSimilarity.load()

# perform a similarity query against the corpus
sims = index[query_vec]
print(sims)

[(0, 0.4618210045327153), (1, -0.0700276652789999)]
[ 0.998093    0.93748635  0.9984453   0.9865886   0.90755945 -0.12416792
 -0.10639259 -0.09879464  0.05004176]


# <a href="https://radimrehurek.com/gensim/models/phrases.html">Phrase Modelling</a> 

In [139]:
import nltk,gensim

sentences = sum([list(nltk.corpus.webtext.sents(fileid)) for fileid in nltk.corpus.webtext.fileids()],[])

bigram = gensim.models.Phrases(sentences, min_count=5, threshold=100)  # bigram model, higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[sentences], min_count=5, threshold=100)  # trigram model
   
bigram = gensim.models.phrases.Phraser(bigram)
trigram = gensim.models.phrases.Phraser(trigram)

print(sentences[0])
print(bigram[sentences[0]])
print(trigram[bigram[sentences[0]]])
# bigram[sentences] # can also apply to whole corpus (list of list)

# phrases.add_vocab([["hello", "world"], ["meow"]])  # update model with new sentences


['Cookie', 'Manager', ':', '"', 'Don', "'", 't', 'allow', 'sites', 'that', 'set', 'removed', 'cookies', 'to', 'set', 'future', 'cookies', '"', 'should', 'stay', 'checked', 'When', 'in', 'full', 'screen', 'mode', 'Pressing', 'Ctrl', '-', 'N', 'should', 'open', 'a', 'new', 'browser', 'when', 'only', 'download', 'dialog', 'is', 'left', 'open', 'add', 'icons', 'to', 'context', 'menu', 'So', 'called', '"', 'tab', 'bar', '"', 'should', 'be', 'made', 'a', 'proper', 'toolbar', 'or', 'given', 'the', 'ability', 'collapse', '/', 'expand', '.']
['Cookie_Manager', ':', '"', 'Don', "'", 't', 'allow', 'sites', 'that', 'set', 'removed', 'cookies', 'to', 'set', 'future', 'cookies', '"', 'should', 'stay', 'checked', 'When', 'in', 'full_screen', 'mode', 'Pressing', 'Ctrl', '-', 'N', 'should', 'open', 'a', 'new', 'browser', 'when', 'only', 'download', 'dialog', 'is', 'left', 'open', 'add', 'icons', 'to', 'context_menu', 'So', 'called', '"', 'tab', 'bar', '"', 'should', 'be', 'made', 'a', 'proper', 'toolba

# Word2Vec

In [34]:
from gensim.models import Word2Vec
from nltk.corpus import brown
model = Word2Vec(brown.sents(),size=10) #training

In [35]:
print(model.wv['human'])  #get vector representation
print(model.wv.similarity('university','school'))
print(len(model.wv.vocab)) #vocab size
print(model.wv.most_similar(positive=['university'], topn = 3)) #'add' university
print(model.wv.doesnt_match('breakfast cereal dinner lunch'.split())) #Finding a word that is not in a list

0.882792481859794
15173
[('membership', 0.9912745952606201), ('combination', 0.987086296081543), ('treatment', 0.9851009845733643)]
dinner


In [36]:
model.wv.most_similar(positive=['woman','king'], negative=['man'], topn = 1)

[('Steel', 0.9899702072143555)]

## pretrained by nltk

In [11]:
import gensim
from nltk.data import find
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
modelnltk = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

In [38]:
# print(model.wv['human'])  #get vector representation
print(modelnltk.similarity('university','school'))
print(len(modelnltk.vocab)) #vocab size
print(modelnltk.most_similar(positive=['university'], topn = 3))
print(modelnltk.doesnt_match('breakfast cereal dinner lunch'.split())) #Finding a word that is not in a list

0.5080746061254015
43981
[('universities', 0.7003918886184692), ('faculty', 0.6780906915664673), ('undergraduate', 0.6587096452713013)]
cereal


In [42]:
modelnltk.most_similar(positive=['woman','king'], negative=['man'], topn = 1) # king-man+woman
modelnltk.most_similar(positive=['Tokyo','Germany'], negative=['Berlin'], topn = 1) # Germany-Berlin+Tokyo

[('Japan', 0.7969787120819092)]

## pretrained by Google

In [44]:
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM
# https://github.com/mmihaltz/word2vec-GoogleNews-vectors
# http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/
modelgoog=gensim.models.KeyedVectors.load_word2vec_format('~/GoogleNews-vectors-negative300.bin', binary=True) 

In [45]:
modelgoog.most_similar(positive=['Beijing','Germany'], negative=['Berlin'], topn = 1)

[('China', 0.7755854725837708)]

## pretrained GloVe by Stanford

In [8]:
import urllib.request,zipfile
import os,tempfile
os.chdir(tempfile.gettempdir())
urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip','glove.6B.zip')
zipfile.ZipFile('glove.6B.zip').extractall()

In [13]:
gensim.scripts.glove2word2vec.glove2word2vec('glove.6B.100d.txt', 'glove2vec')
glove=gensim.models.KeyedVectors.load_word2vec_format('glove2vec', binary=False)

In [17]:
print(glove.most_similar(positive=['woman','king'], negative=['man'], topn = 1)) # king-man+woman
print(glove.most_similar(positive=['tokyo','germany'], negative=['berlin'], topn = 1)) # Germany-Berlin+Tokyo

[('queen', 0.7698541283607483)]
[('japan', 0.8432861566543579)]


## self-trained Word2Vec

https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

In [6]:
from gensim.models import Word2Vec
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
             ['this', 'is', 'the', 'second', 'sentence'],
             ['yet', 'another', 'sentence'],
             ['one', 'more', 'sentence'],
             ['and', 'the', 'final', 'sentence']]

model = Word2Vec(sentences, size=10, window=5, min_count=1, workers=4)# train
print(model)# summarize

print(list(model.wv.vocab)) # summarize vocabulary
print(model.wv['sentence'])   # access vector for one word

# model.save('model.bin')# save model
# new_model = Word2Vec.load('model.bin')# load model

Word2Vec(vocab=14, size=10, alpha=0.025)
['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec', 'second', 'yet', 'another', 'one', 'more', 'and', 'final']
[ 0.02409168 -0.02793393  0.03854181 -0.01606321  0.0270295   0.0039328
  0.01337573 -0.04209328 -0.02356309  0.01950325]


# Topic Modelling

https://radimrehurek.com/gensim/models/ldamodel.html

https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

## Internal LDA

In [147]:
import nltk, gensim
stopwords=nltk.corpus.stopwords.words('english')+['from', 'subject', 're', 'edu', 'use']

In [148]:
corp = [nltk.corpus.webtext.raw(fileid) for fileid in nltk.corpus.webtext.fileids()] #list of strings (docs)
corp = [gensim.utils.simple_preprocess(doc,deacc=True) for doc in corp] #list of list of words
corp = [[w for w in d if w not in stopwords] for d in corp] #remove stopwords
vocab = gensim.corpora.dictionary.Dictionary(corp) #Dictionary(~set) of unique words
corpbow = [vocab.doc2bow(line) for line in corp] # list of list of tuples (wordid, number of occurences)

In [149]:
#training
lda = gensim.models.LdaModel(corpbow, num_topics=10, id2word=vocab) #id2word necessary for printing topics in terms of words
# lda.save(fn)
# lda = gensim.models.LdaModel.load(fn)

In [150]:
# load unseen corpus 
corp2 = [nltk.corpus.nps_chat.raw(fileid) for fileid in nltk.corpus.nps_chat.fileids()]
corp2 = [gensim.utils.simple_preprocess(doc,deacc=True) for doc in corp2]
corp2 = [[w for w in d if w not in stopwords] for d in corp2]
corp2bow = [vocab.doc2bow(line) for line in corp2]
#incremental training
lda.update(corp2bow)

In [152]:
print(lda.log_perplexity(corpbow)) #lower better
print(gensim.models.CoherenceModel(model=lda, texts=corp, dictionary=vocab, coherence='c_v').get_coherence()) #higher better

-9.329627165212187
0.4024744707366629


$\uparrow$ Choose num_topics as the "elbow" in a plot of Coherence vs. num_topics

In [163]:
print(lda.print_topics()) # word distribution of a topic
print(lda.show_topics(formatted=False)) # same as above as list of tuples
print(lda[corpbow[3]]) # find topic distribution of a doc

[(0, '0.011*"word" + 0.010*"girl" + 0.006*"guy" + 0.005*"like" + 0.004*"post" + 0.003*"get" + 0.003*"yeah" + 0.003*"man" + 0.003*"woman" + 0.003*"know"'), (1, '0.012*"girl" + 0.010*"guy" + 0.009*"word" + 0.005*"like" + 0.004*"woman" + 0.004*"know" + 0.004*"post" + 0.004*"get" + 0.003*"yeah" + 0.003*"class"'), (2, '0.284*"word" + 0.086*"class" + 0.058*"post" + 0.038*"uh" + 0.031*"lol" + 0.026*"user" + 0.021*"emotion" + 0.015*"hi" + 0.010*"join" + 0.010*"part"'), (3, '0.015*"word" + 0.008*"girl" + 0.004*"like" + 0.004*"firefox" + 0.004*"guy" + 0.003*"post" + 0.003*"page" + 0.003*"class" + 0.003*"get" + 0.003*"bookmarks"'), (4, '0.390*"word" + 0.081*"class" + 0.081*"post" + 0.047*"uh" + 0.035*"user" + 0.027*"statement" + 0.017*"join" + 0.014*"system" + 0.009*"part" + 0.008*"hi"'), (5, '0.009*"girl" + 0.006*"guy" + 0.006*"page" + 0.005*"firefox" + 0.004*"window" + 0.004*"new" + 0.004*"bookmarks" + 0.004*"tab" + 0.004*"browser" + 0.004*"like"'), (6, '0.023*"word" + 0.015*"guy" + 0.010*"girl

## umass Mallet

In [164]:
import urllib.request,zipfile
import os,tempfile
os.chdir(tempfile.gettempdir())
urllib.request.urlretrieve('http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip','mallet.zip')
zipfile.ZipFile('mallet.zip').extractall()
path=tempfile.gettempdir()+'/mallet-2.0.8/bin/mallet'
os.chmod(path,0777)

/private/var/folders/r7/n9dny1wj46q8njz2gds66kqr0000gp/T


In [None]:
ldamallet = gensim.models.wrappers.LdaMallet(path, corpus=corpbow, num_topics=10, id2word=vocab)