In [1]:
import requests
from bs4 import BeautifulSoup
import os
import time
from keras.utils import get_file
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve
import xml.sax

import subprocess
import mwparserfromhell
import json


Using TensorFlow backend.


In [2]:
import threading

In [44]:
import os
base_dir = '/home/ubuntu/.keras/datasets/wiki_partitions/'
tasks = [base_dir + s for s in os.listdir(base_dir)]
len(tasks)

16

In [9]:
import re


class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Used to handle the XML wiki dump. Copied 
    directly from the book and only edited self._books (from self._movies)"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._books = []
        self._curent_tag = None

    def characters(self, content):
        if self._curent_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text'):
            self._curent_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._curent_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            book = process_article(**self._values)
            if book:
                self._books.append(book)

def process_article(title, text, return_wikicode = False):
    """Process a wikipedia article looking for books"""
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    if return_wikicode:
        return wikicode
    
    # Search through templates for the book template
    book = next((template for template in wikicode.filter_templates() 
                 if template.name.strip().lower() in ['infobox book']), None)
    if book:
        properties = {param.name.strip_code().strip(): param.value.strip_code().strip() 
                      for param in book.params
                      if param.value.strip_code().strip()
                     }
        links = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
        return (title, properties, links)

In [66]:
tasks

['/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_13.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_14.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_5.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_3.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_7.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_4.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_1.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_8.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_9.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_16.bz2',
 '/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_6.bz2',
 '/home

In [77]:
data_path = tasks[6]

# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)


for i, line in enumerate(subprocess.Popen(['bzcat'], 
                             stdin = open(data_path), stdout = subprocess.PIPE).stdout):
    try:
        parser.feed(line)
    except StopIteration:
        break
    if len(handler._books) > 1:
        break

In [78]:
data_path

'/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_1.bz2'

In [79]:
len(handler._books)

2

In [54]:
def service(data_path):
    # Object for handling xml
    handler = WikiXmlHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    
    for i, line in enumerate(subprocess.Popen(['bzcat'], 
                             stdin = open(data_path), stdout = subprocess.PIPE).stdout):
        try:
            parser.feed(line)
        except StopIteration:
            break
        if handler._books:
            return handler._values
    
    return handler._values

In [49]:
from threading import Thread

threads = [Thread(target = service_function, kwargs={'data_path': path}) for path in tasks]
[t.start() for t in threads]
[t.join() for t in threads]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [55]:
from multiprocessing.dummy import Pool as ThreadPool

pool = ThreadPool(16)
results = pool.map(service, tasks)
pool.close()
pool.join()
results

[{},
 {},
 {},
 {},
 {},
 {},
 {'title': 'Animalia (book)',
  'text': "{{Use dmy dates|date=June 2013}} \n {{Infobox book| < !-- See Wikipedia:WikiProject_Novels or Wikipedia:WikiProject_Books -- > \n | name          = '''Animalia''' \n | image         = Animalia (book cover).jpg \n | caption       =  \n | alt           = Book cover: a larger picture framed by smaller pictures, all of which contain different animals, and title with author at the top \n | author        = [[Graeme Base]] \n | illustrator   = Graeme Base \n | country       = Australia \n | language      = English  \n | genre         = [[Picture books]] \n | publisher     = [[Harcourt Brace Jovanovich]] \n | release_date  = 1986 \n | pages         = 32 \n | isbn          = 0-810-91868-4 \n | oclc          =  \n }} \n < !--{{Infobox publication|image=Animalia.jpg|caption=Book cover}}-- > \n '''''Animalia''''' is an illustrated [[Children's literature|children's book]] by [[Graeme Base]]. It was originally published in 1986,

In [22]:
from concurrent.futures.thread import ThreadPoolExecutor
x = []
with ThreadPoolExecutor(max_workers = 16) as executor:
    for path in tasks:
        x.append(executor.submit(service, path))

/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_13.bz2/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_14.bz2
/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_5.bz2

/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_3.bz2
/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_7.bz2
/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_4.bz2
/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_1.bz2
/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_8.bz2
/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_9.bz2
/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_16.bz2
/home/ubuntu/.keras/datasets/wiki_partitions/enwiki-20180901-pages-articles.xml_6.bz2
/home/ubuntu/.keras/datasets/wiki_partitions/enwiki