In [None]:
# imports:
import urllib.request
import csv
import os
import itertools
import logging
import json
import numpy as np
import gensim
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
from xml.etree.cElementTree import iterparse


In [None]:

# Categories to model, kept in alphabetical order.
category_list = sorted(["Mathematics", "Technology", "Music"])

# Folder that is scanned for the XML documents to classify.
testFolder = "./Simplex1/test/"

# Working folder named after the initials of the sorted categories (here "./MMT/").
root_folder = "./{}/".format("".join(name[0] for name in category_list))

if os.path.exists(root_folder) is False:
    os.mkdir(root_folder)

# Location of the serialized bag-of-words corpus.
wiki_bow_path = "{}wiki_bow.mm".format(root_folder)

In [None]:
# Download Page Ids:
# One PetScan query per category; the CSV answer is stored as <root_folder><category>.csv.
# Example query:
# https://petscan.wmflabs.org/?language=en&project=wikipedia&depth=1&format=csv&categories=mathematics&doit=Do it!
for cat in category_list:
    # Percent-encode the category so that names containing spaces or other
    # reserved characters still produce a valid query string.
    url = ("https://petscan.wmflabs.org/?language=en&project=wikipedia&depth=1"
           "&format=csv&doit=Do%20it!&categories=" + urllib.parse.quote(cat, safe=""))
    print(url)
    urllib.request.urlretrieve(url, root_folder+cat+".csv")

    print(cat+".csv")
    

In [None]:
# CSV to XML Download data:

def getData(ids,outputFile):
    """Query the Wikipedia API for the revision content of a batch of pages.

    Parameters
    ----------
    ids : str
        Page ids separated by "|". Leading and trailing separators are ignored.
    outputFile : file-like
        Open text stream; every ``<page>`` element of the response is written to it.

    Nothing is requested or written when `ids` holds no id, and nothing is written
    when the server does not answer with HTTP status 200.
    """
    ids = ids.strip("|")
    if not ids:
        return

    url="https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xml&pageids="+ids
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as response:
        if response.getcode() != 200:
            return
        payload = response.read()

    soup = BeautifulSoup(payload, 'html.parser')
    for page in soup.find_all('page'):
        outputFile.write(str(page))

def batchTrain(file, batch_size=50):
    """Build the XML dump that corresponds to one page-id CSV file.

    The page ids are read from the third column of `file`, fetched in batches
    through `getData`, and written inside a single ``<pages>`` element to a file
    whose name is `file` with ".csv" replaced by ".xml".

    Parameters
    ----------
    file : str
        Path of the CSV file.
    batch_size : int, optional
        Number of page ids sent per request (default 50).

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Read the CSV once instead of re-opening it for every batch.
    pageIds = []
    with open(file, 'r', encoding="UTF-8", newline='') as csvFile:
        for row in csv.reader(csvFile):
            # Keep numeric ids only: this drops a header row, blank lines and
            # rows that are too short to have a third column.
            if len(row) > 2 and row[2].strip().isdigit():
                pageIds.append(row[2].strip())

    totalRecords = len(pageIds)
    print (totalRecords)

    # Append mode is kept from the original; the driver loop only calls this
    # function when the XML file does not exist yet.
    with open(file.replace(".csv",".xml"), 'a', encoding="utf8") as outputFile:
        outputFile.write("<pages>")
        # Consecutive, non-overlapping slices so that no id is skipped between batches.
        for start in range(0, totalRecords, batch_size):
            getData("|".join(pageIds[start:start + batch_size]), outputFile)
        outputFile.write("</pages>")
    

# Build the XML dump for every downloaded CSV that does not have one yet.
for path, subdirs, files in os.walk(root_folder):
    # Only the top level of root_folder is scanned, like the later cells that
    # read the XML files back.
    del subdirs[:]
    for file in files:
        if file.endswith('.csv'):
            csv_path = os.path.join(path, file)
            # Look for the XML next to the CSV, i.e. exactly where batchTrain writes it.
            if not os.path.exists(csv_path.replace(".csv",".xml")):
                batchTrain(csv_path)

print ("Done")



In [None]:
# Train Model
# Emit INFO-level log records in a compact "LEVEL : message" format.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# ipython sometimes messes up the logging setup; restore the level through
# setLevel() so the loggers' cached isEnabledFor() results are invalidated too.
logging.root.setLevel(logging.INFO)

def head(stream, n=10):
    """Collect at most the first `n` items of `stream` into a new list and return it."""
    first_items = []
    for item in itertools.islice(stream, n):
        first_items.append(item)
    return first_items

def my_extract_pages(f):
    """Yield the text of every ``rev`` element of the XML source `f` that has text.

    The source is parsed incrementally; an element is cleared once its text has
    been handed to the consumer.
    """
    wanted_tag = "rev"
    for _event, node in iterparse(f, events=("end",)):
        if node.tag != wanted_tag or node.text is None:
            continue
        yield node.text
        node.clear()

def tokenize(text):
    """Return the tokens `simple_preprocess` produces for `text`, minus those in STOPWORDS."""
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

def iter_wiki(dump_file, min_tokens=50):
    """Yield the token list of every sufficiently long article in `dump_file`.

    Each text found by `my_extract_pages` is passed through `filter_wiki` and
    `tokenize`; the resulting list of tokens is yielded on its own (no title).

    Parameters
    ----------
    dump_file : str
        Path of the XML file to read.
    min_tokens : int, optional
        Articles with fewer tokens than this are skipped (default 50).
    """
    # The context manager closes the dump once the generator finishes or is closed.
    with smart_open(dump_file) as dump:
        for text in my_extract_pages(dump):
            tokens = tokenize(filter_wiki(text))
            if len(tokens) < min_tokens:
                continue  # ignore short articles and various meta-articles
            yield tokens
        

In [None]:

#print(id2word_wiki)


In [None]:
class WikiCorpus(object):
    """Streamed bag-of-words corpus over the articles of one XML dump file."""

    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Parse the first `clip_docs` Wikipedia documents from file `dump_file`
        (every document when `clip_docs` is None).
        Iterating yields each document in turn, as the bag-of-words vector
        returned by `dictionary.doc2bow`.

        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
        # Document count, computed on the first len() call when clip_docs is None.
        self._length = None

    def __iter__(self):
        for tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return `clip_docs` when it is set, otherwise the number of documents.

        Without a clip the documents are counted by iterating once over the
        corpus; the count is cached so the dump is not parsed on every call.
        """
        if self.clip_docs is not None:
            return self.clip_docs
        if self._length is None:
            self._length = sum(1 for _ in self)
        return self._length
    
if not os.path.exists(wiki_bow_path):
    for path, subdirs, files in os.walk(root_folder):
        del subdirs[:]
        for file in files:
            if file.endswith('.xml'):
                doc_path = path + file
                print(doc_path)
                stream = iter_wiki(doc_path)
                for tokens in itertools.islice(iter_wiki(doc_path), 8):
                    print (tokens[:10])
                doc_stream = (tokens for tokens in iter_wiki(doc_path))
                %time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
                print(id2word_wiki)

    id2word_wiki.filter_extremes(no_below=10, no_above=0.1)
    
    # create a stream of bag-of-words vectors
    wiki_corpus = WikiCorpus(doc_path, id2word_wiki)

    #wiki_bow_path = root_folder+'wiki_bow.mm'
    %time gensim.corpora.MmCorpus.serialize(wiki_bow_path, wiki_corpus)

mm_corpus = gensim.corpora.MmCorpus(wiki_bow_path)
print(mm_corpus)
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000) 
%time lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=len(category_list), id2word=id2word_wiki, passes=10, alpha='asymmetric')


In [None]:
def calculateCentroid(topic_docs):
    """Return the mean LDA topic distribution of the articles in `topic_docs`.

    Parameters
    ----------
    topic_docs : str
        Path of the XML file whose articles are read with `iter_wiki`.

    Returns
    -------
    list of (int, float)
        One ``(topic_id, mean_weight)`` pair for each index in
        ``range(len(category_list))``.

    Raises
    ------
    ValueError
        If `iter_wiki` yields no article for the file, since a mean is then undefined.
    """
    topic_totals = dict.fromkeys(range(len(category_list)), 0)
    doc_count = 0

    # Accumulate while streaming instead of keeping every document in memory.
    for tokens in iter_wiki(topic_docs):
        for topic_id, weight in lda_model[id2word_wiki.doc2bow(tokens)]:
            topic_totals[topic_id] += weight
        doc_count += 1

    if doc_count == 0:
        raise ValueError("no usable article found in " + str(topic_docs))

    return [(topic_id, topic_totals[topic_id] / doc_count)
            for topic_id in range(len(category_list))]
    
# Centroid of every top-level XML dump, keyed by the file name without ".xml".
centroids_dict = {}
for path, subdirs, files in os.walk(root_folder):
    subdirs.clear()  # do not descend below the top-level folder
    for file in (name for name in files if name.endswith('.xml')):
        doc_path = "".join((path, file))
        print(doc_path)
        centroid = calculateCentroid(doc_path)
        centroids_dict.update({file.replace(".xml",""): centroid})

print(centroids_dict)

In [None]:
def drawgraph(x_label,y,file,text_data):
    """Save a bar chart of topic weights as a PNG file.

    Parameters
    ----------
    x_label : sequence
        One entry per bar; only its length is used, to position the bars.
        NOTE(review): the tick labels are taken from the global `category_list`
        -- confirm whether they should come from `x_label` instead.
    y : sequence of float
        Bar heights, one per bar.
    file : str
        Output name; every ".xml" in it is removed and ".png" is appended.
    text_data : str
        Text displayed in a box to the right of the plot.
    """
    import matplotlib.pyplot as plt

    x = np.arange(len(x_label))  # the x locations for the groups
    width = 0.3       # the width of the bars

    fig, ax = plt.subplots()
    try:
        rects1 = ax.bar(x, y, width, color='blue')
        ax.set_ylabel('Weights')
        ax.set_title('Topic Distribution')
        # Pin the ticks to the bar positions before labelling them; otherwise
        # the labels are attached to whatever ticks the locator picked.
        ax.set_xticks(x)
        ax.set_xticklabels(category_list, rotation=90)
        # Axes-fraction coordinates keep the box beside the plot whatever the data range is.
        ax.text(1.05, 0.5, text_data, style='italic',
                transform=ax.transAxes, va='center',
                bbox={'facecolor':'green', 'alpha':0.5, 'pad':10})

        # Attach a text label above each bar displaying its height. Two decimals
        # are shown so that fractional heights are not all displayed as 0.
        for rect in rects1:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                    '%.2f' % height,
                    ha='center', va='bottom')

        # bbox_inches='tight' keeps the text box that sits outside the axes in the image.
        fig.savefig(file.replace(".xml","")+'.png', bbox_inches='tight')
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

In [None]:
# Test 
def getPart(testFile):
    """Return the `lda_model` output for every article `iter_wiki` yields from `testFile`.

    The result is a list with one entry per article, in file order.
    """
    distributions = []
    for tokens in iter_wiki(testFile):
        bow = id2word_wiki.doc2bow(tokens)
        distributions.append(lda_model[bow])
    return distributions


# Compare every test document with each category centroid and plot the result.
for path, subdirs, files in os.walk(testFolder):
    for file in files:
        if file.endswith('.xml'):
            # Join with the directory being walked so files in sub-folders resolve too.
            doc_path = os.path.join(path, file)
            print(doc_path)
            doc_topics = getPart(doc_path)
            if not doc_topics:
                # Nothing to average over; skip instead of plotting NaN values.
                print("no usable article in " + doc_path + ", skipped")
                continue
            graph_data=[]
            text_data=""
            for topic in category_list:
                # Mean cosine similarity between the category centroid and
                # EVERY article of the test file.
                cos_dis=np.mean([gensim.matutils.cossim(centroids_dict[topic], doc) for doc in doc_topics])
                text_data = text_data+ topic+":"+str(cos_dis)+"\n"
                graph_data.append(cos_dis)
            print(graph_data)
            # The labels follow category_list, the order in which graph_data was filled.
            drawgraph(category_list,graph_data,root_folder+file,text_data)
            
    