In [1]:
# import and setup modules we'll be using in this notebook
import heapq
import itertools
import logging

import gensim
import numpy as np

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# ipython sometimes messes up the logging setup; restore the root level.
# Go through the public setter instead of assigning `logging.root.level`
# directly, so the logging module can keep its internal state consistent.
logging.getLogger().setLevel(logging.INFO)

def head(stream, n=10):
    """Collect at most `n` leading items of `stream` into a plain list.

    `stream` may be any iterable; only the items actually returned are consumed.
    """
    leading = itertools.islice(stream, n)
    return [item for item in leading]



In [2]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
from xml.etree.cElementTree import iterparse

def my_extract_pages(f):
    """Stream the text of every non-empty ``<rev>`` element found in `f`.

    Parameters
    ----------
    f : file-like object or path
        XML source handed straight to `iterparse`.

    Yields
    ------
    str
        The text content of each ``<rev>`` element, in document order.
        Elements without text (``elem.text is None``) are skipped.
    """
    page_tag = "rev"
    for _, elem in iterparse(f, events=("end",)):
        # Identity comparison is the correct test for the `None` sentinel.
        if elem.tag == page_tag and elem.text is not None:
            yield elem.text
        # Release every finished element, not only the matching ones, so the
        # partially built tree does not keep the whole dump in memory.
        elem.clear()

def tokenize(text):
    """Run `simple_preprocess` on `text` and drop every token listed in `STOPWORDS`.

    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for token in simple_preprocess(text):
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

def iter_wiki(dump_file, min_tokens=50):
    """Yield the token list of each sufficiently long article in the dump.

    Each ``<rev>`` text extracted by `my_extract_pages` is passed through
    `filter_wiki` and `tokenize`; the resulting list of tokens is yielded.

    Parameters
    ----------
    dump_file : str
        Path of the XML dump, opened with `smart_open`.
    min_tokens : int, optional
        Articles with fewer tokens than this are skipped (short articles and
        various meta-articles). Defaults to 50.

    Yields
    ------
    list of str
        The tokens of one article. No title is yielded.
    """
    # The context manager closes the dump once the generator is exhausted or
    # discarded, instead of leaving the handle open.
    with smart_open(dump_file) as dump:
        for text in my_extract_pages(dump):
            tokens = tokenize(filter_wiki(text))
            if len(tokens) < min_tokens:
                continue  # ignore short articles and various meta-articles
            yield tokens

In [3]:
doc_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\AllTopics.xml'
stream = iter_wiki(doc_path)

for tokens in itertools.islice(iter_wiki(doc_path), 8):
    print (tokens[:10])
doc_stream = (tokens for tokens in iter_wiki(doc_path))
%time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
print(id2word_wiki)

INFO : adding document #0 to Dictionary(0 unique tokens: [])


['artificial', 'intelligence', 'ai', 'intelligence', 'exhibited', 'machines', 'science', 'field', 'ai', 'research']
['association', 'computing', 'machinery', 'acm', 'international', 'learned', 'society', 'computing', 'founded', 'world']
['user', 'interacts', 'application', 'software', 'typical', 'desktop', 'application', 'software', 'layer', 'interfaces']
['link', 'programming', 'language', 'theory', 'link', 'computational', 'complexity', 'theory', 'link', 'graphics']
['computational', 'linguistics', 'field', 'concerned', 'statistical', 'rule', 'based', 'modeling', 'natural', 'language']
['language', 'hello', 'world', 'source', 'code', 'known', 'hello', 'world', 'snippet', 'seminal']
['computational', 'chemistry', 'branch', 'chemistry', 'uses', 'simulation', 'assist', 'solving', 'chemical', 'problems']
['diagram', 'complexity', 'classes', 'provided', 'np', 'existence', 'problems', 'np', 'outside', 'np']


INFO : built Dictionary(125701 unique tokens: ['fractionibus', 'michiyo', 'veṇvāroha', 'daikichi', 'ecstatic']...) from 3919 documents (total 2112796 corpus positions)


Wall time: 14.7 s
Dictionary(125701 unique tokens: ['fractionibus', 'michiyo', 'veṇvāroha', 'daikichi', 'ecstatic']...)


In [4]:
# ignore words that appear in fewer than 10 documents or in more than 10% of the documents
# NOTE: filter_extremes is called for its effect on `id2word_wiki` itself (no result is
# assigned), so re-running this cell operates on the already-filtered dictionary.
id2word_wiki.filter_extremes(no_below=10, no_above=0.1)
print(id2word_wiki)


INFO : discarding 111000 tokens: [('source', 395), ('superhumanly', 2), ('culture', 404), ('known', 1244), ('level', 484), ('bowel', 2), ('devalues', 2), ('beginning', 407), ('sensorimotor', 3), ('fodor', 4)]...
INFO : keeping 14701 tokens which were in no less than 10 and no more than 391 (=10.0%) documents
INFO : resulting dictionary: Dictionary(14701 unique tokens: ['worldview', 'desert', 'slow', 'army', 'captain']...)


Dictionary(14701 unique tokens: ['worldview', 'desert', 'slow', 'army', 'captain']...)


In [5]:

class WikiCorpus(object):
    """Stream a Wikipedia dump as bag-of-words vectors.

    Iterating yields ``dictionary.doc2bow(tokens)`` for each article produced
    by `iter_wiki`, limited to the first `clip_docs` articles when set.
    """

    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Remember where the corpus lives; nothing is parsed until iteration.

        Parameters
        ----------
        dump_file : str
            Path of the dump handed to `iter_wiki`.
        dictionary : object with a ``doc2bow(tokens)`` method
            Converts each token list into the vector that is yielded.
        clip_docs : int or None, optional
            Use only the first `clip_docs` documents; `None` means all.
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
        self._length = None  # document count, computed lazily by __len__

    def _iter_tokens(self):
        """Return an iterator over the (optionally clipped) token lists."""
        return itertools.islice(iter_wiki(self.dump_file), self.clip_docs)

    def __iter__(self):
        """Yield one ``doc2bow`` vector per document."""
        for tokens in self._iter_tokens():
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return the number of documents that iteration yields.

        Returning `clip_docs` directly would raise TypeError for the default
        `None` and over-report when the dump holds fewer documents, so the
        documents are counted once and the result is cached.
        """
        if getattr(self, '_length', None) is None:
            self._length = sum(1 for _ in self._iter_tokens())
        return self._length

# create a stream of bag-of-words vectors
# (no clip_docs is passed, so the corpus is not limited to a document count)
wiki_corpus = WikiCorpus(doc_path, id2word_wiki)
# take only the first vector from a fresh iterator over the corpus
vector = next(iter(wiki_corpus))
# print(vector)  # print the first vector in the stream

In [6]:
# what is the most common word in that first article?
# `vector` holds (token_id, count) pairs, so compare on the count.
most_index, most_count = max(vector, key=lambda entry: entry[1])
print(id2word_wiki[most_index], most_count)

# spot-check which token an arbitrary dictionary id maps to
print(id2word_wiki[68])

# the three (token_id, count) pairs with the highest counts;
# `heapq` is imported with the other modules at the top of the notebook
print(heapq.nlargest(3, vector, key=lambda entry: entry[1]))


ai 136
detection
[(3096, 136), (4793, 107), (12118, 81)]


In [7]:
wiki_bow_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm'
%time gensim.corpora.MmCorpus.serialize(wiki_bow_path, wiki_corpus)

INFO : storing corpus in Matrix Market format to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm
INFO : saving sparse matrix to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : saved 3919x14701 matrix, density=1.257% (724187/57613219)
INFO : saving MmCorpus index to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm.index


Wall time: 16.8 s


In [8]:
# Open the corpus stored at `wiki_bow_path` and show its summary.
mm_corpus = gensim.corpora.MmCorpus(wiki_bow_path)
print(mm_corpus)

# Count the documents by walking the corpus once.
print(sum(1 for _ in mm_corpus))

INFO : loaded corpus index from D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm.index
INFO : initializing corpus reader from D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm
INFO : accepted corpus with 3919 documents, 14701 features, 724187 non-zero entries


MmCorpus(3919 documents, 14701 features, 724187 non-zero entries)
3919


In [9]:
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000)  # use fewer documents during training, LDA is slow
# ClippedCorpus new in gensim 0.10.1
# copy&paste it from https://github.com/piskvorky/gensim/blob/0.10.1/gensim/utils.py#L467 if necessary (or upgrade your gensim)
%time lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=3, id2word=id2word_wiki, passes=6)

INFO : using symmetric alpha at 0.3333333333333333
INFO : using symmetric eta at 6.802258349772124e-05
INFO : using serial LDA version on this node
INFO : running online LDA training, 3 topics, 6 passes over the supplied corpus of 3919 documents, updating model once every 2000 documents, evaluating perplexity every 3919 documents, iterating 50x with a convergence threshold of 0.001000
INFO : PROGRESS: pass 0, at document #2000/3919
INFO : merging changes from 2000 documents into a model of 3919 documents
INFO : topic #0 (0.333): 0.003*"film" + 0.002*"award" + 0.002*"computational" + 0.002*"prize" + 0.002*"genre" + 0.002*"narrative" + 0.001*"poet" + 0.001*"algorithm" + 0.001*"tales" + 0.001*"canto"
INFO : topic #1 (0.333): 0.004*"conference" + 0.003*"film" + 0.002*"programming" + 0.002*"genre" + 0.002*"computational" + 0.002*"algorithm" + 0.002*"students" + 0.002*"machine" + 0.001*"tales" + 0.001*"code"
INFO : topic #2 (0.333): 0.003*"computational" + 0.002*"conference" + 0.002*"enginee

Wall time: 1min 51s


In [10]:
# The topics appear as INFO log lines; the return value is bound to `_` so it is
# not echoed a second time as the cell's output.
_ = lda_model.print_topics()  # print a few most important words for each LDA topic

INFO : topic #0 (0.333): 0.003*"gospels" + 0.003*"film" + 0.003*"greece" + 0.002*"genre" + 0.002*"women" + 0.002*"france" + 0.002*"narrative" + 0.002*"british" + 0.002*"award" + 0.002*"gr"
INFO : topic #1 (0.333): 0.004*"conference" + 0.004*"models" + 0.004*"logic" + 0.003*"algorithm" + 0.003*"computational" + 0.003*"students" + 0.003*"composite" + 0.003*"programming" + 0.003*"proof" + 0.003*"machine"
INFO : topic #2 (0.333): 0.011*"born" + 0.010*"px" + 0.008*"theorem" + 0.007*"usa" + 0.005*"notation" + 0.005*"algebra" + 0.005*"vector" + 0.004*"matrix" + 0.004*"groups" + 0.003*"equations"


In [26]:
# Project the articles of three single-subject dumps into LDA topic space and
# compute one centroid (mean topic distribution) per dump.
# NOTE(review): the earlier comment claimed these documents were not used in
# LDA training — confirm that AllTopics.xml does not already contain them.
test1_path = r'D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\CompSci.xml'
test2_path = r'D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Literature.xml'
test3_path = r'D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Mathematics.xml'


def load_topic_vectors(dump_path):
    """Return ``(token_lists, topic_vectors)`` for every article in `dump_path`.

    `token_lists` is the materialised output of `iter_wiki`; `topic_vectors`
    holds, per article, the result of ``lda_model[id2word_wiki.doc2bow(tokens)]``.
    """
    token_lists = list(iter_wiki(dump_path))
    topic_vectors = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in token_lists]
    return token_lists, topic_vectors


def topic_centroid(topic_vectors, num_topics):
    """Print and return the mean topic weights of `topic_vectors`.

    Parameters
    ----------
    topic_vectors : list of list of (topic_id, weight)
        One sparse topic vector per document.
    num_topics : int
        Number of topics; ids are expected in ``range(num_topics)``.

    Returns
    -------
    list of (topic_id, mean_weight)
        One entry per topic id, in ascending id order.

    Raises
    ------
    ValueError
        If `topic_vectors` is empty (the mean would be undefined).
    """
    num_docs = len(topic_vectors)
    if num_docs == 0:
        raise ValueError("cannot compute a centroid of zero documents")

    # Topics missing from a document's vector simply contribute nothing.
    totals = dict.fromkeys(range(num_topics), 0)
    for doc in topic_vectors:
        for topic_id, weight in doc:
            totals[topic_id] += weight
    print(totals)

    centroid = [(topic_id, totals[topic_id] / num_docs) for topic_id in range(num_topics)]
    print("Centroid : (", " ,  ".join(str(weight) for _, weight in centroid), ")")
    return centroid


# Load everything first: the centroid code below needs all three `part*` lists,
# and computing them afterwards only worked with leftover kernel state.
test_doc_1, part1 = load_topic_vectors(test1_path)
test_doc_2, part2 = load_topic_vectors(test2_path)
test_doc_3, part3 = load_topic_vectors(test3_path)

centroid_1 = topic_centroid(part1, lda_model.num_topics)
centroid_2 = topic_centroid(part2, lda_model.num_topics)
centroid_3 = topic_centroid(part3, lda_model.num_topics)

# Show the topic vectors of the first two documents of each dump.
for part in (part1, part2, part3):
    print(part[0])
    print(part[1])

{0: 100.8841118872226, 1: 842.95956837281324, 2: 125.88467491466653}
Centroid : ( 0.0938456854765 ,  0.784148435696 ,  0.117102023176 )
{0: 1500.1150636054119, 1: 263.28190585544775, 2: 14.52272062251264}
Centroid : ( 0.840400595857 ,  0.147496866025 ,  0.00813597793978 )
{0: 115.72818486309917, 1: 319.27097212756991, 2: 620.47922085580171}
Centroid : ( 0.109280627822 ,  0.301483448657 ,  0.58591050128 )
[(0, 0.016995560656335944), (1, 0.98291475414915641)]
[(0, 0.15233463669467232), (1, 0.84725772453144954)]
[(0, 0.94624383725445882), (1, 0.048830811403771796)]
[(0, 0.96525813627786394), (1, 0.033040514970716951)]
[(1, 0.68621808255902161), (2, 0.31347616092018554)]
[(2, 0.99053389457929131)]


In [24]:
# Merge the distinct tokens of every document in `test_doc_1` into one
# pseudo-document and print the topic vector the LDA model assigns to it.
all_tokens = set().union(*test_doc_1)

pooled_bow = id2word_wiki.doc2bow(list(all_tokens))
print(lda_model[pooled_bow])

[(0, 0.46647961193421716), (1, 0.40096047952909852), (2, 0.13255990853668426)]


In [17]:
# Debugging aid: show which container type `part1` currently is.
print(type(part1))

<class 'list'>


In [16]:
# Mean cosine similarity between two sets of topic vectors. zip() pairs the
# documents by position and stops at the shorter of the two lists.
for left, right in ((part1, part2), (part1, part3), (part2, part3)):
    similarities = [gensim.matutils.cossim(vec_a, vec_b) for vec_a, vec_b in zip(left, right)]
    print(np.mean(similarities))


0.279016973723
0.437059622703
0.20983812019


In [27]:
# Load three test dumps and project their articles into LDA topic space.
# NOTE(review): presumably these articles were not part of the training dump — confirm.
# NOTE: this cell rebinds test*_path, test_doc_* and part*, replacing the
# values that an earlier cell assigned to the same names.
# Raw strings keep the backslashes of the Windows paths literal.
test1_path = r'D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_Ethics.xml'
test2_path = r'D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_Allegory.xml'
test3_path = r'D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_AI.xml'

# Token lists are materialised so they can be reused by later cells.
test_doc_1 = list(iter_wiki(test1_path))
part1 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_1]

test_doc_2 = list(iter_wiki(test2_path))
part2 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_2]

test_doc_3 = list(iter_wiki(test3_path))
part3 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_3]

In [33]:
print(centroid_1)


def mean_similarity(centroid, topic_vectors):
    """Return the mean cosine similarity between `centroid` and each vector.

    Every document in `topic_vectors` is compared. Pairing through
    ``zip([centroid], topic_vectors)`` would stop after the first document,
    because zip() ends with its shortest argument.
    """
    return np.mean([gensim.matutils.cossim(centroid, vec) for vec in topic_vectors])


# One group of three lines per centroid, groups separated by a blank line.
for position, centroid in enumerate((centroid_3, centroid_1, centroid_2)):
    if position:
        print()
    for part in (part1, part2, part3):
        print(mean_similarity(centroid, part))

[(0, 0.09384568547648614), (1, 0.78414843569564019), (2, 0.11710202317643398)]
0.479728631933
0.186654974109
0.877208213632

0.97161344392
0.168003631847
0.146674770281

0.463191073284
0.99249988287
0.00953489320499


In [None]:
# Build a TfidfModel from the bag-of-words corpus `mm_corpus`, timing the call.
%time tfidf_model = gensim.models.TfidfModel(mm_corpus, id2word=id2word_wiki)

In [None]:
# Train a 3-topic LsiModel. Its input is the TF-IDF-transformed corpus
# (`tfidf_model[mm_corpus]`), not the raw bag-of-words corpus.
%time lsi_model = gensim.models.LsiModel(tfidf_model[mm_corpus], id2word=id2word_wiki, num_topics=3)

In [None]:
# cache the transformed corpora to disk, for use in later notebooks
wiki_tfidf_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_tfidf.mm'
wiki_lsa_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_lsa.mm'
%time gensim.corpora.MmCorpus.serialize(wiki_tfidf_path, tfidf_model[mm_corpus])
%time gensim.corpora.MmCorpus.serialize(wiki_lsa_path, lsi_model[tfidf_model[mm_corpus]])
# gensim.corpora.MmCorpus.serialize('./data/wiki_lda.mm', lda_model[mm_corpus])

In [None]:
def lsi_vectors(token_lists):
    """Return the LSI vector of each token list in `token_lists`.

    `lsi_model` was trained on ``tfidf_model[mm_corpus]``, so each
    bag-of-words vector goes through `tfidf_model` before it is handed to
    `lsi_model`; feeding raw counts would put queries in a different input
    space than the training data.
    """
    return [lsi_model[tfidf_model[id2word_wiki.doc2bow(tokens)]] for tokens in token_lists]


# NOTE: rebinds part1..part3, replacing the LDA vectors with LSI vectors.
part1 = lsi_vectors(test_doc_1)

part2 = lsi_vectors(test_doc_2)

part3 = lsi_vectors(test_doc_3)

# Mean cosine similarity per pair of document sets; zip() pairs documents by
# position and stops at the shorter list.
for left, right in ((part1, part2), (part1, part3), (part2, part3)):
    print(np.mean([gensim.matutils.cossim(vec_a, vec_b) for vec_a, vec_b in zip(left, right)]))