In [1]:
# import and setup modules we'll be using in this notebook
import logging
import itertools

import numpy as np
import gensim

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

def head(stream, n=10):
    """Convenience fnc: return the first `n` elements of the stream, as plain list."""
    return list(itertools.islice(stream, n))



In [2]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
from xml.etree.cElementTree import iterparse

def my_extract_pages(f):
    elems = (elem for _, elem in iterparse(f, events=("end",)))
    page_tag = "rev"
    for elem in elems:
        if elem.tag == page_tag and elem.text != None:
            text = elem.text
            yield text
            elem.clear()

def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for text in my_extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50:
            continue  # ignore short articles and various meta-articles
        yield tokens

In [3]:
doc_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\AllTopics.xml'
stream = iter_wiki(doc_path)

for tokens in itertools.islice(iter_wiki(doc_path), 8):
    print (tokens[:10])
doc_stream = (tokens for tokens in iter_wiki(doc_path))
%time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
print(id2word_wiki)

INFO : adding document #0 to Dictionary(0 unique tokens: [])


['artificial', 'intelligence', 'ai', 'intelligence', 'exhibited', 'machines', 'science', 'field', 'ai', 'research']
['association', 'computing', 'machinery', 'acm', 'international', 'learned', 'society', 'computing', 'founded', 'world']
['user', 'interacts', 'application', 'software', 'typical', 'desktop', 'application', 'software', 'layer', 'interfaces']
['link', 'programming', 'language', 'theory', 'link', 'computational', 'complexity', 'theory', 'link', 'graphics']
['computational', 'linguistics', 'field', 'concerned', 'statistical', 'rule', 'based', 'modeling', 'natural', 'language']
['language', 'hello', 'world', 'source', 'code', 'known', 'hello', 'world', 'snippet', 'seminal']
['computational', 'chemistry', 'branch', 'chemistry', 'uses', 'simulation', 'assist', 'solving', 'chemical', 'problems']
['diagram', 'complexity', 'classes', 'provided', 'np', 'existence', 'problems', 'np', 'outside', 'np']


INFO : built Dictionary(125701 unique tokens: ['azimov', 'soibelman', 'nuovo', 'brit', 'clarify']...) from 3919 documents (total 2112796 corpus positions)


Wall time: 14.5 s
Dictionary(125701 unique tokens: ['azimov', 'soibelman', 'nuovo', 'brit', 'clarify']...)


In [4]:
# ignore words that appear in less than 20 documents or more than 10% documents
id2word_wiki.filter_extremes(no_below=10, no_above=0.1)
print(id2word_wiki)


INFO : discarding 111000 tokens: [('beginning', 407), ('applied', 475), ('anytime', 8), ('outsource', 1), ('enormity', 1), ('st', 409), ('kinect', 3), ('case', 718), ('neats', 1), ('unanswered', 6)]...
INFO : keeping 14701 tokens which were in no less than 10 and no more than 391 (=10.0%) documents
INFO : resulting dictionary: Dictionary(14701 unique tokens: ['anticipating', 'yoga', 'inherent', 'zeta', 'euclidean']...)


Dictionary(14701 unique tokens: ['anticipating', 'yoga', 'inherent', 'zeta', 'euclidean']...)


In [6]:

class WikiCorpus(object):
    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Parse the first `clip_docs` Wikipedia documents from file `dump_file`.
        Yield each document in turn, as a list of tokens (unicode strings).
        
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
    
    def __iter__(self):
        for tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            yield self.dictionary.doc2bow(tokens)
    
    def __len__(self):
        return self.clip_docs

# create a stream of bag-of-words vectors
wiki_corpus = WikiCorpus(doc_path, id2word_wiki)
vector = next(iter(wiki_corpus))
# print(vector)  # print the first vector in the stream

In [7]:
# what is the most common word in that first article?
most_index, most_count = max(vector, key=lambda x: x[1])
print(id2word_wiki[most_index], most_count)

print(id2word_wiki[68])

import heapq
print(heapq.nlargest(3, vector, key=lambda x: x[1]))


ai 136
cyprus
[(4462, 136), (11654, 107), (5247, 81)]


In [8]:
wiki_bow_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm'
%time gensim.corpora.MmCorpus.serialize(wiki_bow_path, wiki_corpus)

INFO : storing corpus in Matrix Market format to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm
INFO : saving sparse matrix to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : saved 3919x14701 matrix, density=1.257% (724187/57613219)
INFO : saving MmCorpus index to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm.index


Wall time: 15.1 s


In [9]:
mm_corpus = gensim.corpora.MmCorpus(wiki_bow_path)
print(mm_corpus)

print(len([ x for x in iter(mm_corpus)]))

INFO : loaded corpus index from D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm.index
INFO : initializing corpus reader from D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm
INFO : accepted corpus with 3919 documents, 14701 features, 724187 non-zero entries


MmCorpus(3919 documents, 14701 features, 724187 non-zero entries)
3919


In [24]:
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000)  # use fewer documents during training, LDA is slow
# ClippedCorpus new in gensim 0.10.1
# copy&paste it from https://github.com/piskvorky/gensim/blob/0.10.1/gensim/utils.py#L467 if necessary (or upgrade your gensim)
%time lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=3, id2word=id2word_wiki, passes=6)

INFO : using symmetric alpha at 0.3333333333333333
INFO : using symmetric eta at 6.802258349772124e-05
INFO : using serial LDA version on this node
INFO : running online LDA training, 3 topics, 6 passes over the supplied corpus of 3919 documents, updating model once every 2000 documents, evaluating perplexity every 3919 documents, iterating 50x with a convergence threshold of 0.001000
INFO : PROGRESS: pass 0, at document #2000/3919
INFO : merging changes from 2000 documents into a model of 3919 documents
INFO : topic #0 (0.333): 0.005*"conference" + 0.004*"film" + 0.002*"computational" + 0.002*"award" + 0.002*"engineering" + 0.001*"poet" + 0.001*"king" + 0.001*"genre" + 0.001*"born" + 0.001*"million"
INFO : topic #1 (0.333): 0.002*"conference" + 0.002*"computational" + 0.002*"narrative" + 0.002*"film" + 0.002*"award" + 0.002*"acm" + 0.001*"engineering" + 0.001*"programming" + 0.001*"learning" + 0.001*"criticism"
INFO : topic #2 (0.333): 0.003*"computational" + 0.003*"algorithm" + 0.002

Wall time: 1min 41s


In [11]:
_ = lda_model.print_topics()  # print a few most important words for each LDA topic

INFO : topic #0 (0.100): 0.008*"models" + 0.007*"computational" + 0.005*"programming" + 0.004*"equations" + 0.004*"distribution" + 0.004*"engineering" + 0.004*"design" + 0.004*"modeling" + 0.003*"code" + 0.003*"energy"
INFO : topic #1 (0.100): 0.054*"px" + 0.012*"triangle" + 0.010*"regular" + 0.008*"circle" + 0.007*"cell" + 0.007*"square" + 0.006*"symmetry" + 0.006*"angle" + 0.006*"truncated" + 0.006*"tiling"
INFO : topic #2 (0.100): 0.008*"students" + 0.006*"award" + 0.005*"women" + 0.005*"prize" + 0.005*"conference" + 0.004*"college" + 0.004*"http" + 0.003*"literacy" + 0.003*"tales" + 0.003*"www"
INFO : topic #3 (0.100): 0.012*"notation" + 0.011*"composite" + 0.006*"prime" + 0.005*"arithmetic" + 0.005*"symbols" + 0.005*"base" + 0.004*"table" + 0.004*"decimal" + 0.004*"sequence" + 0.004*"symbol"
INFO : topic #4 (0.100): 0.013*"theorem" + 0.008*"proof" + 0.008*"algorithm" + 0.007*"algebra" + 0.007*"logic" + 0.006*"matrix" + 0.005*"groups" + 0.005*"finite" + 0.005*"graph" + 0.004*"conje

In [15]:
%time tfidf_model = gensim.models.TfidfModel(mm_corpus, id2word=id2word_wiki)

INFO : collecting document frequencies
INFO : PROGRESS: processing document #0
INFO : calculating IDF weights for 3919 documents and 14700 features (724187 matrix non-zeros)


Wall time: 1.8 s


In [16]:
%time lsi_model = gensim.models.LsiModel(tfidf_model[mm_corpus], id2word=id2word_wiki, num_topics=3)

INFO : using serial LSI version on this node
INFO : updating model with new documents
INFO : preparing a new chunk of documents
INFO : using 100 extra samples and 2 power iterations
INFO : 1st phase: constructing (14701, 103) action matrix
INFO : orthonormalizing (14701, 103) action matrix
INFO : 2nd phase: running dense svd on (103, 3919) matrix
INFO : computing the final decomposition
INFO : keeping 3 factors (discarding 84.997% of energy spectrum)
INFO : processed documents up to #3919
INFO : topic #0(6.822): 0.129*"conference" + 0.110*"computational" + 0.102*"theorem" + 0.095*"algorithm" + 0.092*"programming" + 0.090*"award" + 0.089*"born" + 0.074*"proof" + 0.073*"usa" + 0.068*"logic"
INFO : topic #1(5.062): -0.335*"born" + -0.279*"usa" + 0.214*"theorem" + 0.160*"algorithm" + -0.141*"conference" + 0.128*"proof" + 0.103*"algebra" + 0.100*"notation" + 0.099*"vector" + -0.098*"germany"
INFO : topic #2(4.607): -0.533*"born" + -0.471*"usa" + -0.140*"germany" + -0.124*"theorem" + -0.109*

Wall time: 3.44 s


In [17]:
# cache the transformed corpora to disk, for use in later notebooks
wiki_tfidf_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_tfidf.mm'
wiki_lsa_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_lsa.mm'
%time gensim.corpora.MmCorpus.serialize(wiki_tfidf_path, tfidf_model[mm_corpus])
%time gensim.corpora.MmCorpus.serialize(wiki_lsa_path, lsi_model[tfidf_model[mm_corpus]])
# gensim.corpora.MmCorpus.serialize('./data/wiki_lda.mm', lda_model[mm_corpus])

INFO : storing corpus in Matrix Market format to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_tfidf.mm
INFO : saving sparse matrix to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_tfidf.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : saved 3919x14701 matrix, density=1.257% (724187/57613219)
INFO : saving MmCorpus index to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_tfidf.mm.index
INFO : storing corpus in Matrix Market format to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_lsa.mm
INFO : saving sparse matrix to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_lsa.mm


Wall time: 4.28 s


INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : saved 3919x3 matrix, density=100.000% (11757/11757)
INFO : saving MmCorpus index to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_lsa.mm.index


Wall time: 3.66 s


In [22]:
# evaluate on 1k documents **not** used in LDA training
test1_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_convexhull.xml'
test2_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_Allegory.xml'
test3_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_Ethics.xml'

# doc_stream = [tokens for tokens in iter_wiki(test1_path))  # generator
test_doc_1 = [tokens for tokens in iter_wiki(test1_path)]
#print(lda_model.print_topics(max(lda_model[id2word_wiki.doc2bow(test_doc_1[0])], key=lambda item: item[1])[0]))

#for x in heapq.nlargest(3, id2word_wiki.doc2bow(test_doc_1[0]), key=lambda x: x[1]):
#    print(x[0], id2word_wiki[x[0]], x[1])

#print(max(lda_model[id2word_wiki.doc2bow(test_doc_1[0])], key=lambda item: item[1]))
#print(max(lda_model[id2word_wiki.doc2bow(test_doc_1[0])], key=lambda item: item[1])[0])

print(lda_model[id2word_wiki.doc2bow(test_doc_1[0])])

lda_model.print_topics()

part1 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_1]
#print(part1)

# doc_stream = [tokens for tokens in iter_wiki(test1_path))  # generator
test_doc_2 = [tokens for tokens in iter_wiki(test2_path)]
part2 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_2]

print(lda_model[id2word_wiki.doc2bow(test_doc_2[0])])

# doc_stream = [tokens for tokens in iter_wiki(test1_path))  # generator
test_doc_3 = [tokens for tokens in iter_wiki(test3_path)]
part3 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_3]
print(lda_model[id2word_wiki.doc2bow(test_doc_3[0])])

INFO : topic #0 (0.100): 0.008*"models" + 0.007*"computational" + 0.005*"programming" + 0.004*"equations" + 0.004*"distribution" + 0.004*"engineering" + 0.004*"design" + 0.004*"modeling" + 0.003*"code" + 0.003*"energy"
INFO : topic #1 (0.100): 0.054*"px" + 0.012*"triangle" + 0.010*"regular" + 0.008*"circle" + 0.007*"cell" + 0.007*"square" + 0.006*"symmetry" + 0.006*"angle" + 0.006*"truncated" + 0.006*"tiling"
INFO : topic #2 (0.100): 0.008*"students" + 0.006*"award" + 0.005*"women" + 0.005*"prize" + 0.005*"conference" + 0.004*"college" + 0.004*"http" + 0.003*"literacy" + 0.003*"tales" + 0.003*"www"
INFO : topic #3 (0.100): 0.012*"notation" + 0.011*"composite" + 0.006*"prime" + 0.005*"arithmetic" + 0.005*"symbols" + 0.005*"base" + 0.004*"table" + 0.004*"decimal" + 0.004*"sequence" + 0.004*"symbol"
INFO : topic #4 (0.100): 0.013*"theorem" + 0.008*"proof" + 0.008*"algorithm" + 0.007*"algebra" + 0.007*"logic" + 0.006*"matrix" + 0.005*"groups" + 0.005*"finite" + 0.005*"graph" + 0.004*"conje

[(1, 0.14286066754093796), (4, 0.85569766503756939)]
[(1, 0.018565152002948199), (5, 0.24466592756443808), (6, 0.27926708632394187), (7, 0.33510336781368494), (8, 0.12169899118677541)]
[(0, 0.1017609445581847), (2, 0.11228334793916089), (7, 0.76585487779527406), (8, 0.019575307072415293)]


In [23]:
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part3)]))
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part2, part3)]))


0.00593754613919
0.0
0.643950721481


In [None]:
part1 = [lsi_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_1]

part2 = [lsi_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_2]

part3 = [lsi_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_3]

print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part3)]))
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part2, part3)]))

In [18]:
# evaluate on 1k documents **not** used in LDA training
test1_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_Ethics.xml'
test2_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_Allegory.xml'
test3_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_AI.xml'

# doc_stream = [tokens for tokens in iter_wiki(test1_path))  # generator
test_doc_1 = [tokens for tokens in iter_wiki(test1_path)]
#print(lda_model.print_topics(max(lda_model[id2word_wiki.doc2bow(test_doc_1[0])], key=lambda item: item[1])[0]))

#for x in heapq.nlargest(3, id2word_wiki.doc2bow(test_doc_1[0]), key=lambda x: x[1]):
#    print(x[0], id2word_wiki[x[0]], x[1])

#print(max(lda_model[id2word_wiki.doc2bow(test_doc_1[0])], key=lambda item: item[1]))
#print(max(lda_model[id2word_wiki.doc2bow(test_doc_1[0])], key=lambda item: item[1])[0])

print(lsi_model[id2word_wiki.doc2bow(test_doc_1[0])])

lsi_model.print_topics()

part1 = [lsi_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_1]
#print(part1)

# doc_stream = [tokens for tokens in iter_wiki(test1_path))  # generator
test_doc_2 = [tokens for tokens in iter_wiki(test2_path)]
part2 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_2]

print(lsi_model[id2word_wiki.doc2bow(test_doc_2[0])])

# doc_stream = [tokens for tokens in iter_wiki(test1_path))  # generator
test_doc_3 = [tokens for tokens in iter_wiki(test3_path)]
part3 = [lsi_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_3]
print(lsi_model[id2word_wiki.doc2bow(test_doc_3[0])])

INFO : topic #0(6.822): 0.129*"conference" + 0.110*"computational" + 0.102*"theorem" + 0.095*"algorithm" + 0.092*"programming" + 0.090*"award" + 0.089*"born" + 0.074*"proof" + 0.073*"usa" + 0.068*"logic"
INFO : topic #1(5.062): -0.335*"born" + -0.279*"usa" + 0.214*"theorem" + 0.160*"algorithm" + -0.141*"conference" + 0.128*"proof" + 0.103*"algebra" + 0.100*"notation" + 0.099*"vector" + -0.098*"germany"
INFO : topic #2(4.607): -0.533*"born" + -0.471*"usa" + -0.140*"germany" + -0.124*"theorem" + -0.109*"russia" + -0.096*"france" + 0.090*"narrative" + -0.089*"britain" + -0.087*"algorithm" + -0.087*"soviet"


[(0, 15.968758136814536), (1, -1.7760275323027801), (2, 3.2291864003224635)]
[(0, 8.1312391723860191), (1, -2.5683564885463861), (2, 4.5365893948980007)]
[(0, 1.3649376216191202), (1, 1.5611714875775313), (2, -0.64951863492836948)]


In [19]:
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part3)]))
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part2, part3)]))

-0.00390474844191
0.475280566786
0.0258858146634


In [None]:
import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
     numerator = sum([vec1[x] * vec2[x] for x in intersection])

     sum1 = sum([vec1[x]**2 for x in vec1.keys()])
     sum2 = sum([vec2[x]**2 for x in vec2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

def text_to_vector(text):
     words = WORD.findall(text)
     return Counter(words)

text1 = test1_path
text2 = test2_path

vector1 = text_to_vector(text1)
vector2 = text_to_vector(text2)

cosine = get_cosine(vector1, vector2)

print ('Cosine:', cosine)