In [102]:
# import and setup modules we'll be using in this notebook
import logging
import itertools

import numpy as np
import gensim

# Route library progress messages to the notebook output at INFO level.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# basicConfig() does nothing when the root logger already has handlers (ipython sometimes
# sets logging up first), so force the level explicitly. setLevel() is used instead of
# assigning `.level` so that cached isEnabledFor() results are invalidated as well.
logging.getLogger().setLevel(logging.INFO)

def head(stream, n=10):
    """Return a plain list holding at most the first `n` items of `stream`."""
    leading_items = itertools.islice(stream, n)
    return [item for item in leading_items]

In [13]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
from xml.etree.cElementTree import iterparse

def my_extract_pages(f, page_tag="rev"):
    """Stream the text of every `page_tag` element found in an XML source.

    Parameters
    ----------
    f : filename or file-like object
        Passed straight to `iterparse`.
    page_tag : str, optional
        Tag name of the elements whose text is wanted (default "rev").

    Yields
    ------
    The `.text` of each matching element; matching elements without text are skipped.
    """
    for _, elem in iterparse(f, events=("end",)):
        if elem.tag == page_tag and elem.text is not None:
            yield elem.text
        # Clear every finished element, not only the matching ones, so the text and
        # children of already-processed elements are released as parsing proceeds.
        elem.clear()

def tokenize(text):
    """Tokenize `text` with gensim's `simple_preprocess` and drop tokens found in `STOPWORDS`."""
    kept_tokens = []
    for candidate in simple_preprocess(text):
        if candidate in STOPWORDS:
            continue
        kept_tokens.append(candidate)
    return kept_tokens

def iter_wiki(dump_file, min_tokens=50):
    """Yield each sufficiently long article from the dump as a list of tokens.

    Parameters
    ----------
    dump_file : str
        Path of the XML dump, opened with `smart_open`.
    min_tokens : int, optional
        Articles with fewer tokens than this (after `tokenize`) are skipped
        (default 50).

    Yields
    ------
    list of str
        The tokens of one article.
    """
    # The context manager closes the dump once the generator is exhausted or discarded.
    with smart_open(dump_file) as dump:
        for text in my_extract_pages(dump):
            tokens = tokenize(filter_wiki(text))
            if len(tokens) < min_tokens:
                continue  # ignore short articles and various meta-articles
            yield tokens

In [59]:
doc_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\AllTopics.xml'
stream = iter_wiki(doc_path)

for tokens in itertools.islice(iter_wiki(doc_path), 8):
    print (tokens[:10])
doc_stream = (tokens for tokens in iter_wiki(doc_path))
%time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
print(id2word_wiki)

INFO : adding document #0 to Dictionary(0 unique tokens: [])


['artificial', 'intelligence', 'ai', 'intelligence', 'exhibited', 'machines', 'science', 'field', 'ai', 'research']
['association', 'computing', 'machinery', 'acm', 'international', 'learned', 'society', 'computing', 'founded', 'world']
['user', 'interacts', 'application', 'software', 'typical', 'desktop', 'application', 'software', 'layer', 'interfaces']
['link', 'programming', 'language', 'theory', 'link', 'computational', 'complexity', 'theory', 'link', 'graphics']
['computational', 'linguistics', 'field', 'concerned', 'statistical', 'rule', 'based', 'modeling', 'natural', 'language']
['language', 'hello', 'world', 'source', 'code', 'known', 'hello', 'world', 'snippet', 'seminal']
['computational', 'chemistry', 'branch', 'chemistry', 'uses', 'simulation', 'assist', 'solving', 'chemical', 'problems']
['diagram', 'complexity', 'classes', 'provided', 'np', 'existence', 'problems', 'np', 'outside', 'np']


INFO : built Dictionary(125701 unique tokens: ['governor', 'photographic', 'arima', 'ramona', 'micromechanical']...) from 3919 documents (total 2112796 corpus positions)


Wall time: 26.3 s
Dictionary(125701 unique tokens: ['governor', 'photographic', 'arima', 'ramona', 'micromechanical']...)


In [61]:
# ignore words that appear in fewer than 10 documents or in more than 10% of the documents
# NOTE: filter_extremes() changes id2word_wiki itself (compare the dictionary sizes printed
# before and after), so re-running this cell filters the already-filtered dictionary.
id2word_wiki.filter_extremes(no_below=10, no_above=0.1)
print(id2word_wiki)


INFO : discarding 0 tokens: []...
INFO : keeping 14701 tokens which were in no less than 10 and no more than 391 (=10.0%) documents
INFO : resulting dictionary: Dictionary(14701 unique tokens: ['governor', 'photographic', 'elementary', 'revered', 'clash']...)


Dictionary(14701 unique tokens: ['governor', 'photographic', 'elementary', 'revered', 'clash']...)


In [62]:

class WikiCorpus(object):
    """Streamed bag-of-words view of the articles in a dump file."""

    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Parse the first `clip_docs` documents from file `dump_file` (all of them
        when `clip_docs` is None).

        Iterating yields each document in turn as the bag-of-words vector returned
        by `dictionary.doc2bow`.

        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
        self._length = None  # document count, filled in lazily by __len__

    def __iter__(self):
        # islice(..., None) imposes no limit, so an unclipped corpus streams every article.
        for tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return `clip_docs` when given; otherwise count the documents once and cache it."""
        if self.clip_docs is not None:
            return self.clip_docs
        # Without a clip limit there is no stored size; returning None here made
        # len() raise TypeError. Counting costs one full pass over the dump.
        if getattr(self, '_length', None) is None:
            self._length = sum(1 for _ in self)
        return self._length

# Wrap the dump and the filtered dictionary into a streamed corpus (no document limit).
wiki_corpus = WikiCorpus(dump_file=doc_path, dictionary=id2word_wiki)
# Take only the first bag-of-words vector from a fresh iterator and show it.
vector = next(iter(wiki_corpus))
print(vector)

[(7, 3), (25, 2), (26, 1), (34, 1), (48, 1), (53, 1), (75, 8), (85, 1), (99, 3), (107, 6), (108, 1), (112, 1), (124, 1), (127, 1), (139, 4), (143, 1), (153, 1), (154, 4), (171, 1), (175, 1), (188, 1), (233, 8), (234, 1), (240, 2), (241, 1), (253, 1), (268, 2), (284, 3), (285, 1), (289, 1), (294, 1), (295, 3), (307, 1), (317, 2), (324, 4), (332, 4), (337, 1), (345, 2), (347, 1), (353, 2), (355, 4), (359, 1), (367, 1), (375, 3), (381, 2), (384, 1), (408, 1), (411, 2), (416, 2), (423, 1), (424, 1), (429, 1), (430, 107), (453, 1), (456, 2), (459, 1), (476, 1), (480, 6), (481, 3), (491, 1), (493, 2), (506, 1), (507, 2), (508, 2), (513, 1), (514, 2), (522, 1), (526, 3), (542, 1), (546, 1), (548, 1), (553, 1), (562, 2), (564, 1), (565, 2), (567, 1), (585, 1), (588, 1), (608, 2), (627, 2), (641, 3), (651, 1), (657, 14), (664, 1), (668, 3), (672, 2), (688, 1), (690, 5), (693, 1), (705, 4), (722, 2), (734, 3), (741, 1), (756, 1), (766, 4), (787, 1), (797, 1), (822, 1), (827, 1), (830, 1), (833, 

In [63]:
# Which dictionary id carries the highest count in the first article's vector?
most_index, most_count = max(vector, key=lambda entry: entry[1])
print(id2word_wiki[most_index], most_count)

# Look up the word stored under dictionary id 68.
print(id2word_wiki[68])

import heapq
# The three (id, count) pairs with the largest counts.
print(heapq.nlargest(3, vector, key=lambda entry: entry[1]))


ai 136
averaging
[(8142, 136), (430, 107), (5198, 81)]


In [64]:
wiki_bow_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm'
%time gensim.corpora.MmCorpus.serialize(wiki_bow_path, wiki_corpus)

INFO : storing corpus in Matrix Market format to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm
INFO : saving sparse matrix to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : PROGRESS: saving document #2000
INFO : PROGRESS: saving document #3000
INFO : saved 3919x14701 matrix, density=1.257% (724187/57613219)
INFO : saving MmCorpus index to D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm.index


Wall time: 22.9 s


In [65]:
# Re-open the serialized corpus from disk and show its summary line.
mm_corpus = gensim.corpora.MmCorpus(wiki_bow_path)
print(mm_corpus)

# Count the documents by streaming through the corpus once.
print(sum(1 for _ in mm_corpus))

INFO : loaded corpus index from D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm.index
INFO : initializing corpus reader from D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_bow.mm
INFO : accepted corpus with 3919 documents, 14701 features, 724187 non-zero entries


MmCorpus(3919 documents, 14701 features, 724187 non-zero entries)
3919


In [111]:
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000)  # use fewer documents during training, LDA is slow
# ClippedCorpus new in gensim 0.10.1
# copy&paste it from https://github.com/piskvorky/gensim/blob/0.10.1/gensim/utils.py#L467 if necessary (or upgrade your gensim)
%time lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=4, id2word=id2word_wiki, passes=5, alpha='auto')

INFO : using autotuned alpha, starting with [0.25, 0.25, 0.25, 0.25]
INFO : using symmetric eta at 6.802258349772124e-05
INFO : using serial LDA version on this node
INFO : running online LDA training, 4 topics, 5 passes over the supplied corpus of 3919 documents, updating model once every 2000 documents, evaluating perplexity every 3919 documents, iterating 50x with a convergence threshold of 0.001000
INFO : PROGRESS: pass 0, at document #2000/3919
INFO : optimized alpha [0.37842610491898376, 0.37857957629467054, 0.34583772796098211, 0.37048910359942355]
INFO : merging changes from 2000 documents into a model of 3919 documents
INFO : topic #0 (0.378): 0.004*"conference" + 0.003*"computational" + 0.003*"film" + 0.003*"algorithm" + 0.002*"algorithms" + 0.002*"design" + 0.002*"web" + 0.002*"genre" + 0.002*"programming" + 0.001*"models"
INFO : topic #1 (0.379): 0.003*"film" + 0.003*"tales" + 0.003*"award" + 0.002*"computational" + 0.002*"prize" + 0.002*"engineering" + 0.002*"conference" +

Wall time: 2min 33s


In [112]:
# The topics appear as INFO log lines in the output; the return value is bound to `_`,
# presumably to keep it from being echoed as the cell result.
_ = lda_model.print_topics()  # print a few most important words for each LDA topic

INFO : topic #0 (0.100): 0.006*"theorem" + 0.004*"algorithm" + 0.004*"notation" + 0.004*"proof" + 0.003*"algebra" + 0.003*"logic" + 0.003*"vector" + 0.003*"models" + 0.003*"matrix" + 0.003*"equations"
INFO : topic #1 (0.053): 0.015*"px" + 0.012*"gospels" + 0.009*"greece" + 0.005*"gr" + 0.005*"latin" + 0.005*"france" + 0.004*"epistles" + 0.004*"greek" + 0.004*"acts" + 0.003*"monastery"
INFO : topic #2 (0.085): 0.005*"film" + 0.003*"genre" + 0.003*"narrative" + 0.002*"tv" + 0.002*"love" + 0.002*"king" + 0.001*"person" + 0.001*"television" + 0.001*"released" + 0.001*"game"
INFO : topic #3 (0.085): 0.014*"born" + 0.010*"usa" + 0.006*"conference" + 0.005*"students" + 0.005*"germany" + 0.004*"award" + 0.004*"france" + 0.003*"million" + 0.003*"england" + 0.003*"institute"


In [None]:
# Fit a TF-IDF model on the serialized bag-of-words corpus, timing the call with %time.
%time tfidf_model = gensim.models.TfidfModel(mm_corpus, id2word=id2word_wiki)

In [None]:
# Train a 3-topic LSI model on the TF-IDF-transformed corpus (tfidf_model[mm_corpus]), not on raw counts.
%time lsi_model = gensim.models.LsiModel(tfidf_model[mm_corpus], id2word=id2word_wiki, num_topics=3)

In [None]:
# cache the transformed corpora to disk, for use in later notebooks
wiki_tfidf_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_tfidf.mm'
wiki_lsa_path = 'D:\\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\wiki_lsa.mm'
%time gensim.corpora.MmCorpus.serialize(wiki_tfidf_path, tfidf_model[mm_corpus])
%time gensim.corpora.MmCorpus.serialize(wiki_lsa_path, lsi_model[tfidf_model[mm_corpus]])
# gensim.corpora.MmCorpus.serialize('./data/wiki_lda.mm', lda_model[mm_corpus])

In [113]:
# Apply the trained LDA model to three separate test dumps (the file names suggest
# ethics, allegory and AI articles).
# Raw strings: keep the Windows backslashes literal instead of relying on unrecognized
# escape sequences such as "\T" (same resulting paths as before).
test1_path = r'D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_Ethics.xml'
test2_path = r'D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_Allegory.xml'
test3_path = r'D:\Study\Winter-2017\Machine Learning\Project\DistractionBuster\Simplex\Dump\Test_doc_AI.xml'

# Each dump is materialized as a list because it is both indexed and iterated below.
test_doc_1 = list(iter_wiki(test1_path))

# Topic mixture of the first article in the first dump.
print(lda_model[id2word_wiki.doc2bow(test_doc_1[0])])

# Log the topics again so the mixtures can be read against them.
lda_model.print_topics()

# partN holds one topic-mixture vector per article of test dump N.
part1 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_1]

test_doc_2 = list(iter_wiki(test2_path))
part2 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_2]

print(lda_model[id2word_wiki.doc2bow(test_doc_2[0])])

test_doc_3 = list(iter_wiki(test3_path))
part3 = [lda_model[id2word_wiki.doc2bow(tokens)] for tokens in test_doc_3]
print(lda_model[id2word_wiki.doc2bow(test_doc_3[0])])

INFO : topic #0 (0.100): 0.006*"theorem" + 0.004*"algorithm" + 0.004*"notation" + 0.004*"proof" + 0.003*"algebra" + 0.003*"logic" + 0.003*"vector" + 0.003*"models" + 0.003*"matrix" + 0.003*"equations"
INFO : topic #1 (0.053): 0.015*"px" + 0.012*"gospels" + 0.009*"greece" + 0.005*"gr" + 0.005*"latin" + 0.005*"france" + 0.004*"epistles" + 0.004*"greek" + 0.004*"acts" + 0.003*"monastery"
INFO : topic #2 (0.085): 0.005*"film" + 0.003*"genre" + 0.003*"narrative" + 0.002*"tv" + 0.002*"love" + 0.002*"king" + 0.001*"person" + 0.001*"television" + 0.001*"released" + 0.001*"game"
INFO : topic #3 (0.085): 0.014*"born" + 0.010*"usa" + 0.006*"conference" + 0.005*"students" + 0.005*"germany" + 0.004*"award" + 0.004*"france" + 0.003*"million" + 0.003*"england" + 0.003*"institute"


[(0, 0.14421276005823253), (2, 0.53135314206025475), (3, 0.32438737447065991)]
[(1, 0.36331121809631484), (2, 0.63352242745552212)]
[(0, 0.99779825993637672)]


In [114]:
# Mean cosine similarity between position-wise paired documents of each two test sets,
# printed in the order (1, 2), (1, 3), (2, 3).
for left, right in ((part1, part2), (part1, part3), (part2, part3)):
    print(np.mean([gensim.matutils.cossim(doc_a, doc_b) for doc_a, doc_b in zip(left, right)]))


0.721322526286
0.225673774229
0.0


In [None]:
# lsi_model was trained on tfidf_model[mm_corpus], so each query vector has to pass through
# the same TF-IDF transformation before being projected into LSI space.
# NOTE: this rebinds part1..part3, which held the LDA vectors until this cell runs.
part1 = [lsi_model[tfidf_model[id2word_wiki.doc2bow(tokens)]] for tokens in test_doc_1]

part2 = [lsi_model[tfidf_model[id2word_wiki.doc2bow(tokens)]] for tokens in test_doc_2]

part3 = [lsi_model[tfidf_model[id2word_wiki.doc2bow(tokens)]] for tokens in test_doc_3]

# Mean cosine similarity between position-wise paired documents of each two test sets.
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part3)]))
print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part2, part3)]))

In [None]:
import re, math
from collections import Counter

# Matches each maximal run of word characters; text_to_vector uses it to split text into words.
WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse vectors given as key -> weight mappings.

    The result is 0.0 when either vector has zero length.
    """
    norm_product = (
        math.sqrt(sum(vec1[key] ** 2 for key in vec1.keys()))
        * math.sqrt(sum(vec2[key] ** 2 for key in vec2.keys()))
    )
    # Guard against dividing by zero for empty or all-zero vectors.
    if not norm_product:
        return 0.0

    # Only keys present in both vectors contribute to the dot product.
    shared_keys = set(vec1.keys()).intersection(set(vec2.keys()))
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)
    return float(dot_product) / norm_product

def text_to_vector(text):
    """Count how often each `WORD` match occurs in `text` and return the counts as a Counter."""
    return Counter(WORD.findall(text))

def _article_text(dump_path):
    """Return the text of every article in `dump_path`, each passed through `filter_wiki`, joined by spaces."""
    with smart_open(dump_path) as dump:
        return ' '.join(filter_wiki(raw) for raw in my_extract_pages(dump))

# Compare the documents' contents. Passing test1_path / test2_path straight to
# text_to_vector() only compared the words of the two file path strings.
text1 = _article_text(test1_path)
text2 = _article_text(test2_path)

vector1 = text_to_vector(text1)
vector2 = text_to_vector(text2)

cosine = get_cosine(vector1, vector2)

print('Cosine:', cosine)