## Topics and Transformation

In [2]:
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
import tempfile
import os.path

TEMP_FOLDER = 'D:\OneDrive\ANLY580\datasets'
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))

Folder "D:\OneDrive\ANLY580\datasets" will be used to save temporary dictionary and corpus.


### Transformation interface

In [4]:
from gensim import corpora, models, similarities
if os.path.isfile(os.path.join(TEMP_FOLDER, 'deerwester.dict')):
    dictionary = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, 'deerwester.dict'))
    corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'deerwester.mm'))
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

2017-10-25 12:59:30,486 : INFO : 'pattern' package not found; tag filters are not available for English
2017-10-25 12:59:30,524 : INFO : loading Dictionary object from D:\OneDrive\ANLY580\datasets\deerwester.dict
2017-10-25 12:59:30,530 : INFO : loaded D:\OneDrive\ANLY580\datasets\deerwester.dict
2017-10-25 12:59:30,536 : INFO : loaded corpus index from D:\OneDrive\ANLY580\datasets\deerwester.mm.index
2017-10-25 12:59:30,537 : INFO : initializing corpus reader from D:\OneDrive\ANLY580\datasets\deerwester.mm
2017-10-25 12:59:30,542 : INFO : accepted corpus with 9 documents, 12 features, 28 non-zero entries


Used files generated from first tutorial


In [5]:
print(dictionary[0])
print(dictionary[1])
print(dictionary[2])

human
interface
computer


### Creating a transformation

In [6]:
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

2017-10-25 12:59:30,557 : INFO : collecting document frequencies
2017-10-25 12:59:30,558 : INFO : PROGRESS: processing document #0
2017-10-25 12:59:30,560 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


### Transforming vectors

In [7]:
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [8]:
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(2, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.3244870206138555), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.44424552527467476)]
[(1, 0.5710059809418182), (4, 0.4170757362022777), (5, 0.4170757362022777), (8, 0.5710059809418182)]
[(0, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(4, 0.45889394536615247), (6, 0.6282580468670046), (7, 0.6282580468670046)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(3, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [9]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

2017-10-25 12:59:30,585 : INFO : using serial LSI version on this node
2017-10-25 12:59:30,586 : INFO : updating model with new documents
2017-10-25 12:59:30,588 : INFO : preparing a new chunk of documents
2017-10-25 12:59:30,589 : INFO : using 100 extra samples and 2 power iterations
2017-10-25 12:59:30,590 : INFO : 1st phase: constructing (12, 102) action matrix
2017-10-25 12:59:30,592 : INFO : orthonormalizing (12, 102) action matrix
2017-10-25 12:59:30,618 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2017-10-25 12:59:30,631 : INFO : computing the final decomposition
2017-10-25 12:59:30,632 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)
2017-10-25 12:59:30,635 : INFO : processed documents up to #9
2017-10-25 12:59:30,636 : INFO : topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"time" + -0.060*"response" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
2017-10-25 12:59:30,638 : INFO

In [10]:
lsi.print_topics(2)

2017-10-25 12:59:30,643 : INFO : topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"time" + -0.060*"response" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
2017-10-25 12:59:30,645 : INFO : topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"


[(0,
  '-0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"time" + -0.060*"response" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [11]:
for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)

[(0, -0.066007833960903567), (1, -0.52007033063618602)]
[(0, -0.19667592859142471), (1, -0.76095631677000464)]
[(0, -0.089926399724463701), (1, -0.72418606267525165)]
[(0, -0.075858476521781376), (1, -0.63205515860034356)]
[(0, -0.10150299184980109), (1, -0.57373084830029519)]
[(0, -0.70321089393783154), (1, 0.16115180214025795)]
[(0, -0.87747876731198393), (1, 0.16758906864659415)]
[(0, -0.90986246868185883), (1, 0.14086553628719023)]
[(0, -0.61658253505692873), (1, -0.0539290756638936)]


In [12]:
model = models.TfidfModel(corpus, normalize=True)

2017-10-25 13:00:39,355 : INFO : collecting document frequencies
2017-10-25 13:00:39,357 : INFO : PROGRESS: processing document #0
2017-10-25 13:00:39,358 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [13]:
model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)

NameError: name 'tfidf_corpus' is not defined