In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup


In [2]:
class Character:
    """
    Simple record for one character.

    The constructor stores its three arguments unchanged as the attributes
    ``url``, ``name`` and ``books``.
    """

    def __init__(self, url, name, books):
        self.url, self.name, self.books = url, name, books

In [3]:
class Scraper:
    """
    Scrapes the wiki's list of Pratchett characters.

    For every entry in the "Ankh-Morpork and The Watch" section of the list
    page, the character's own page is fetched and the links found in its
    Bibliography row are collected as that character's books.
    """

    # Seconds to wait for the server; without a timeout ``requests.get`` can
    # block indefinitely on a stalled connection.
    DEFAULT_TIMEOUT = 30

    def __init__(self, timeout=DEFAULT_TIMEOUT):
        """
        :param timeout: seconds passed to ``requests.get`` for every request.
        """
        self.url = 'http://wiki.lspace.org/mediawiki/List_of_Pratchett_characters'
        self.base_url = 'http://wiki.lspace.org'
        self.timeout = timeout

    def make_soup(self, url):
        """
        Download a page and parse it with the built-in HTML parser.
        :param url: absolute url of the page to fetch
        :return: BeautifulSoup object for the page.
        :raises requests.HTTPError: if the server answers with an error status.
        """
        page = requests.get(url, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        page.raise_for_status()
        return BeautifulSoup(page.content, 'html.parser')

    def scrap_character_books(self, url):
        """
        Scrapes books from character wiki page.
        :param url: url to character wiki page
        :return: list of names of books for a character; an empty list when
                 the page has no Bibliography row.
        """
        character = self.make_soup(url)
        anchor = character.find('a', title='Bibliography')
        if anchor is None:
            # Page has no Bibliography link at all.
            return []
        # The link's grandparent is the element whose next <td> sibling holds
        # the book links.
        wrapper = anchor.parent
        label_cell = wrapper.parent if wrapper is not None else None
        if label_cell is None:
            return []
        books_cell = label_cell.find_next_sibling('td')
        if books_cell is None:
            return []
        return [book.get_text() for book in books_cell.find_all('a')]

    def scrap(self):
        """
        Main part of the scraper.
        :return: List of Character objects.
        :raises ValueError: if the expected section of the list page is missing.
        """
        wiki_page = self.make_soup(self.url)
        heading = wiki_page.find('span', id="Ankh-Morpork_and_The_Watch")
        listing = heading.parent.find_next_sibling('ul') if heading is not None else None
        if listing is None:
            raise ValueError(
                'Could not find the "Ankh-Morpork_and_The_Watch" character list on ' + self.url)

        characters = []
        for character in listing.find_all('li'):
            link = character.a
            char_url = link.get('href') if link is not None else None
            if char_url is None:
                # List item without a usable link: nothing to follow.
                continue
            char_name = link.get_text()
            print(char_name, char_url)
            full_url = self.base_url + char_url
            books = self.scrap_character_books(full_url)
            characters.append(Character(full_url, char_name, books))
        return characters

In [4]:
def make_dataframe(characters):
    """
    Construct pandas dataframe from list of Characters
    :param characters: list of Characters (objects with ``name`` and ``books``)
    :return: pandas dataframe with one row per (book, character) pair and
             columns ``book`` and ``character_name``.
    """
    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copying the frame per row is quadratic.
    records = [
        {'book': book, 'character_name': character.name}
        for character in characters
        for book in character.books
    ]
    # Passing columns explicitly keeps both columns present for empty input.
    return pd.DataFrame(records, columns=['book', 'character_name'])


In [5]:
# Run the whole pipeline: scrape the characters, flatten them into one
# (book, character_name) row per pair, and print the full frame as text.
scraper = Scraper()
# scrap() fetches one page per character and prints each name and href as it goes.
chars = scraper.scrap()
data_f = make_dataframe(chars)
print(data_f.to_string())


Mrs. Cake /mediawiki/Mrs._Cake
Fred Colon /mediawiki/Fred_Colon
Mrs. Marietta Cosmopilite /mediawiki/Mrs._Marietta_Cosmopilite
Detritus /mediawiki/Detritus
Cut-Me-Own-Throat Dibbler /mediawiki/Cut-Me-Own-Throat_Dibbler
Dorfl /mediawiki/Dorfl
Rufus Drumknott /mediawiki/Rufus_Drumknott
Gaspode /mediawiki/Gaspode
Carrot Ironfoundersson /mediawiki/Carrot_Ironfoundersson
Leonard of Quirm /mediawiki/Leonard_of_Quirm
Cheery Littlebottom /mediawiki/Cheery_Littlebottom
Nobby Nobbs /mediawiki/Nobby_Nobbs
Lady Sybil Ramkin /mediawiki/Lady_Sybil_Ramkin
Foul Ole Ron /mediawiki/Foul_Ole_Ron
Reg Shoe /mediawiki/Reg_Shoe
Mr. Slant /mediawiki/Mr._Slant
Angua von Überwald /mediawiki/Angua_von_%C3%9Cberwald
Lord Havelock Vetinari /mediawiki/Havelock_Vetinari
Samuel Vimes /mediawiki/Samuel_Vimes
Visit-the-Infidel-with-Explanatory-Pamphlets /mediawiki/Visit-the-Infidel-with-Explanatory-Pamphlets
Willikins /mediawiki/Willikins
                   book                                character_name
0          

Count the number of scraped characters that appear in *Men at Arms*.

In [6]:
# Each True in the mask is one row whose book is 'Men at Arms'; summing counts them.
print(int((data_f['book'] == 'Men at Arms').sum()))


12
