In [1]:
from gensim import corpora, models, similarities



In [7]:
documents = ["Human machine interface for lab computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

In [8]:
print(documents)

['Human machine interface for lab computer applications', 'A survey of user opinion of computer system response time', 'The EPS user interface management system', 'System and human system engineering testing of EPS', 'Relation of user perceived response time to error measurement', 'The generation of random binary unordered trees', 'The intersection graph of paths in trees', 'Graph minors IV Widths of trees and well quasi ordering', 'Graph minors A survey']


In [9]:
# remove common words and tokenize them
stoplist = set('for a of the and to in'.split())

In [10]:
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

In [11]:
print(texts)

[['human', 'machine', 'interface', 'lab', 'computer', 'applications'], ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'management', 'system'], ['system', 'human', 'system', 'engineering', 'testing', 'eps'], ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'], ['generation', 'random', 'binary', 'unordered', 'trees'], ['intersection', 'graph', 'paths', 'trees'], ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'], ['graph', 'minors', 'survey']]


In [12]:
# remove words those appear only once
all_tokens = sum(texts, [])

print(all_tokens)

['human', 'machine', 'interface', 'lab', 'computer', 'applications', 'survey', 'user', 'opinion', 'computer', 'system', 'response', 'time', 'eps', 'user', 'interface', 'management', 'system', 'system', 'human', 'system', 'engineering', 'testing', 'eps', 'relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement', 'generation', 'random', 'binary', 'unordered', 'trees', 'intersection', 'graph', 'paths', 'trees', 'graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering', 'graph', 'minors', 'survey']


In [13]:
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) ==1)

print(tokens_once)

set(['generation', 'random', 'iv', 'engineering', 'relation', 'measurement', 'unordered', 'binary', 'management', 'ordering', 'machine', 'quasi', 'testing', 'paths', 'lab', 'applications', 'intersection', 'perceived', 'widths', 'well', 'error', 'opinion'])


In [14]:
texts = [[word for word in text if word not in tokens_once]
         for text in texts]

print(texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [15]:
dictionary = corpora.Dictionary(texts)

print(dictionary)

Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)


In [16]:
dictionary.save('deerwester.dict')  # save as binary file at the dictionary at local directory

In [17]:
dictionary.save_as_text('deerwester_text.dict')  # save as text file at the local directory

In [18]:
print(dictionary.token2id) # show pairs of "word : word-ID number" 

{u'minors': 11, u'graph': 10, u'system': 6, u'trees': 9, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}


In [19]:
new_doc = "Human computer interaction" # temporary data to see role of below function

new_vec = dictionary.doc2bow(new_doc.lower().split()) # return "word-ID : Frequency of appearance""
print(new_vec)

[(1, 1), (2, 1)]


In [20]:
corpus = [dictionary.doc2bow(text) for text in texts]

print(corpus)

[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(0, 1), (6, 1), (7, 1), (8, 1)], [(2, 1), (6, 2), (8, 1)], [(3, 1), (4, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(5, 1), (10, 1), (11, 1)]]


In [21]:
corpora.MmCorpus.serialize('deerwester.mm', corpus) # save corpus at local directory

In [22]:
corpus = corpora.MmCorpus('deerwester.mm') # try to load the saved corpus from local

print(list(corpus)) # to show corpus which was read above, need to print(list( )) 

[[(0, 1.0), (1, 1.0), (2, 1.0)], [(1, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)], [(0, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)], [(2, 1.0), (6, 2.0), (8, 1.0)], [(3, 1.0), (4, 1.0), (7, 1.0)], [(9, 1.0)], [(9, 1.0), (10, 1.0)], [(9, 1.0), (10, 1.0), (11, 1.0)], [(5, 1.0), (10, 1.0), (11, 1.0)]]


In [23]:
dictionary = corpora.Dictionary.load('deerwester.dict') # try to load saved dic.from local

print(dictionary)

Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)


In [24]:
print(corpus)

MmCorpus(9 documents, 12 features, 28 non-zero entries)


In [25]:
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

print(tfidf)

TfidfModel(num_docs=9, num_nnz=28)


In [29]:
corpus_tfidf = tfidf[corpus]  # map corpus object into tfidf space

print(corpus_tfidf)

<gensim.interfaces.TransformedCorpus object at 0x0000000009BA2CF8>


In [30]:
for doc in corpus_tfidf: # show tfidf-space mapped words
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(1, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(0, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(2, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(5, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [31]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize LSI 
print(lsi)

LsiModel(num_terms=12, num_topics=2, decay=1.0, chunksize=20000)


In [32]:
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus
print(corpus_lsi)

<gensim.interfaces.TransformedCorpus object at 0x0000000009BA2D30>


In [33]:
topic = lsi.print_topics(2)

In [34]:
print(topic)

[(0, u'0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'), (1, u'-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]


In [35]:
for doc in corpus_lsi:
    print(doc)

[(0, 0.066007833960903692), (1, -0.52007033063618491)]
[(0, 0.19667592859142474), (1, -0.76095631677000508)]
[(0, 0.089926399724464118), (1, -0.7241860626752511)]
[(0, 0.075858476521781251), (1, -0.63205515860034311)]
[(0, 0.10150299184980116), (1, -0.57373084830029575)]
[(0, 0.70321089393783121), (1, 0.16115180214025779)]
[(0, 0.87747876731198327), (1, 0.16758906864659406)]
[(0, 0.90986246868185805), (1, 0.14086553628719031)]
[(0, 0.61658253505692828), (1, -0.053929075663893447)]


In [36]:
lsi.save('model.lsi')  # save output model at local directory

In [37]:
lsi = models.LsiModel.load('model.lsi') # try to load above saved model
print(lsi)

LsiModel(num_terms=12, num_topics=2, decay=1.0, chunksize=20000)


In [38]:
doc = "Human computer interaction"  # give new document to calculate similarity degree with already obtained topics

vec_bow = dictionary.doc2bow(doc.lower().split())  # put newly obtained document to existing dictionary object
print(vec_bow)  # show result of above 

[(1, 1), (2, 1)]


In [39]:
vec_lsi = lsi[vec_bow] # convert new document (henceforth, call it "query") to LSI space
print(vec_lsi)

[(0, 0.07910475117444854), (1, -0.57328352430794027)]


In [40]:
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and indexize it
print(index)

MatrixSimilarity<9 docs, 2 features>


In [41]:
index.save('deerwester.index') # save index object at local directory

In [42]:
index = similarities.MatrixSimilarity.load('deerwester.index')

In [43]:
print(index)

MatrixSimilarity<9 docs, 2 features>


In [44]:
sims = index[vec_lsi] # calculate degree of similarity of the query to existing corpus
print(sims)

[ 0.99994081  0.99467081  0.99994278  0.999879    0.99935204 -0.08804217
 -0.0515742  -0.02366471  0.1938726 ]


In [45]:
print(list(enumerate(sims))) # output (document_number , document similarity)

[(0, 0.99994081), (1, 0.99467081), (2, 0.99994278), (3, 0.999879), (4, 0.99935204), (5, -0.08804217), (6, -0.0515742), (7, -0.023664713), (8, 0.1938726)]


In [46]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])  # sort output object as per similarity ( largest similarity document comes first )
print(sims)

[(2, 0.99994278), (0, 0.99994081), (3, 0.999879), (4, 0.99935204), (1, 0.99467081), (8, 0.1938726), (7, -0.023664713), (6, -0.0515742), (5, -0.08804217)]
