# Introduction: Book Recommendation Using Word Embeddings

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import time
from keras.utils import get_file
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve
import xml.sax

import subprocess
import mwparserfromhell
import json

Using TensorFlow backend.


## Download Wikipedia Dump

In [2]:
index = requests.get('https://dumps.wikimedia.org/enwiki/').text
soup_index = BeautifulSoup(index, 'html.parser')
soup_index.contents

[<html>
 <head><title>Index of /enwiki/</title></head>
 <body bgcolor="white">
 <h1>Index of /enwiki/</h1><hr/><pre><a href="../">../</a>
 <a href="20180601/">20180601/</a>                                          21-Jul-2018 01:33                   -
 <a href="20180620/">20180620/</a>                                          02-Aug-2018 01:28                   -
 <a href="20180701/">20180701/</a>                                          22-Aug-2018 01:25                   -
 <a href="20180720/">20180720/</a>                                          02-Sep-2018 01:27                   -
 <a href="20180801/">20180801/</a>                                          11-Aug-2018 08:29                   -
 <a href="20180820/">20180820/</a>                                          23-Aug-2018 15:32                   -
 <a href="20180901/">20180901/</a>                                          13-Sep-2018 00:25                   -
 <a href="latest/">latest/</a>                                  

The next line of code finds the most recent dump.

In [4]:
# Find the links that are dates of dumps
dumps = [a['href'] for a in soup_index.find_all('a') if 
         a.has_attr('href') and a.text[:-1].isdigit()]
dumps[-1]

'20180901/'

Now we need to find the url extension for the actual XML data dump.

In [7]:
# Retrieve the html
dump_html = requests.get('https://dumps.wikimedia.org/enwiki/' + dumps[-1]).text

# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')

# Find the XML pages
pages_xml = [a['href'] for a in soup_dump.find_all('a') if 
             a.has_attr('href') and a['href'].endswith('-pages-articles.xml.bz2')]
pages_xml

['/enwiki/20180901/enwiki-20180901-pages-articles.xml.bz2']

The next cell finds the name of the file and the url. 

In [11]:
wikipedia_dump = pages_xml[0].rsplit('/')[-1]
url = 'https://dumps.wikimedia.org/' + pages_xml[0]
url

'https://dumps.wikimedia.org//enwiki/20180901/enwiki-20180901-pages-articles.xml.bz2'

## Retrieve Data

Now we need to actually download the data. This can be done using the keras `get_file` utility which downloads the specified file at the specified url. If we already have the entire dataset downloaded, then we don't want to download it again! For that reason we first use a check to see if the data exists.

In [24]:
import sys

keras_home = '/data/wiki'
data_path = keras_home + wikipedia_dump

if not os.path.exists(data_path):
    print('Downloading')
    data_path = get_file(wikipedia_dump, url)
else:
    print(f'Already downloaded. File Size: {os.stat(data_path).st_size / 1e9} GB')

Already downloaded. File Size: 15.398410099 GB


# Sorting Through the Data

Now we need to write a number of helper functions to extract the information we need from the data. A lot of these functions are copied directly from the book.

In [23]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Used to handle the XML wiki dump. Copied 
    directly from the book and only edited self._books (from self._movies)"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._books = []
        self._curent_tag = None

    def characters(self, content):
        if self._curent_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text'):
            self._curent_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._curent_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            book = process_article(**self._values)
            if book:
                self._books.append(book)

In [25]:
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

First we'll get one example of a book to try and understand what's going on.

### Function to  Process Article Text

This function is also taken directly from the book with minor modifications. The biggest change is in the line `book = ` where the `infobox` has been changed to `infobox book` to reflect that we are searching for books!

In [37]:
import re

def process_article(title, text, return_wikicode = False):
    """Process a wikipedia article looking for books"""
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    if return_wikicode:
        return wikicode
    
    # Search through templates for the book template
    book = next((template for template in wikicode.filter_templates() 
                 if template.name.strip().lower() in ['infobox book']), None)
    if book:
        properties = {param.name.strip_code().strip(): param.value.strip_code().strip() 
                      for param in book.params
                      if param.value.strip_code().strip()
                     }
        links = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
        return (title, properties, links)

We can use the `bzcat` utility http://www.qnx.com/developers/docs/6.5.0SP1.update/com.qnx.doc.neutrino_utilities/b/bzcat.html which decompresses a bz2 compressed file and sends the contents to standard out. Effectively what this code is doing is decompressing the file one line at a time and sending the line through the `parser`. This gets around the need to load the entire file into memory at once since it is probably too large in its uncompressed state. 

The first time, we set the code to break if the handler encouters any books so we can look at the output. 

In [28]:
for i, line in enumerate(subprocess.Popen(['bzcat'], 
                                         stdin = open(data_path), stdout = subprocess.PIPE).stdout):
    x = line
    try:
        parser.feed(line)
    except StopIteration:
        break
    if handler._books:
        break

The first entry is simply the title.

In [32]:
handler._books[0][0]

'Animalia (book)'

The second is all the parameters that are in the `infobox book` template on the wikipedia page.

In [33]:
handler._books[0][1]

{'1': '< !-- See Wikipedia:WikiProject_Novels or Wikipedia:WikiProject_Books -- >',
 'name': 'Animalia',
 'image': 'Animalia (book cover).jpg',
 'alt': 'Book cover: a larger picture framed by smaller pictures, all of which contain different animals, and title with author at the top',
 'author': 'Graeme Base',
 'illustrator': 'Graeme Base',
 'country': 'Australia',
 'language': 'English',
 'genre': 'Picture books',
 'publisher': 'Harcourt Brace Jovanovich',
 'release_date': '1986',
 'pages': '32',
 'isbn': '0-810-91868-4'}

The third entry is all of the Wikipedia links that occur on the page. These are links that go to __other wikipedia pages__ as opposed to external sources.

In [34]:
handler._books[0][2]

['Graeme Base',
 'Picture books',
 'Harcourt Brace Jovanovich',
 "Children's literature",
 'Graeme Base',
 'alliteration',
 'alphabet',
 'alligator',
 'butterfly',
 'colouring book',
 'Abrams Books',
 'Animalia (TV series)',
 'Venezuela',
 'Minimax (TV channel)',
 'Czech Republic',
 'Slovakia',
 'Greece',
 'ET1 (Greece)',
 "Australian Children's Television Foundation",
 'iPad',
 'iPhone',
 'iPod Touch',
 "Children's Book Council of Australia",
 "Children's Book of the Year Award: Picture Book",
 'Category:Alphabet books',
 "Category:1986 children's books",
 'Category:Picture books by Graeme Base',
 'Category:Puzzle books',
 "Category:Australian children's books"]

We'll gather this information for every article on Wikipedia that has an `infobox book` template on the page (this should be around 40,000). While this won't capture every book, it will give us a large selection to work with for making recommendations! 

In [45]:
processed = process_article(**handler._values, return_wikicode=True)
processed.filter_templates()[1]

"{{Infobox book| < !-- See Wikipedia:WikiProject_Novels or Wikipedia:WikiProject_Books -- > \n | name          = '''Animalia''' \n | image         = Animalia (book cover).jpg \n | caption       =  \n | alt           = Book cover: a larger picture framed by smaller pictures, all of which contain different animals, and title with author at the top \n | author        = [[Graeme Base]] \n | illustrator   = Graeme Base \n | country       = Australia \n | language      = English  \n | genre         = [[Picture books]] \n | publisher     = [[Harcourt Brace Jovanovich]] \n | release_date  = 1986 \n | pages         = 32 \n | isbn          = 0-810-91868-4 \n | oclc          =  \n }}"

### Retrieve Every Book on Wikipedia

In [None]:
from timeit import default_timer as timer


# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

start = timer()
recorded_count = 0

for i, line in enumerate(subprocess.Popen(['bzcat'], 
                                         stdin = open(data_path), stdout = subprocess.PIPE).stdout):
    # Process the line (entry)
    try:
        parser.feed(line)
    except StopIteration:
        break
    
    # Print progress information
    n_books = len(handler._books)
    if (n_books % 10 == 0) and (n_books != recorded_count):
        print(f'{n_books} books found. {round(timer() - start)} seconds elapsed.', end = '\r')
        # Make sure to only report found books once
        recorded_count = n_books

530 books found. 2787 seconds elapsed.

In [50]:
with open('generated/books.ndjson', 'wt') as fout:
    for book in handler._books:
         fout.write(json.dumps(book) + '\n')