In [2]:
import requests
import lxml.html

In [3]:
response = requests.get("https://gutenberg.net.au/ebooks01/0100251h.html")

In [10]:
tree = lxml.html.fromstring(response.text)
tree.xpath('/html/body/h1')[0].text_content()

'Anne of Windy Poplars'

In [13]:
lxml.html.tostring(tree.xpath('/html/body/h1')[0])

b'<h1>Animal Farm</h1>\n\n'

In [37]:
import re
from abc import ABC
from abc import abstractmethod

import lxml.html
import requests


class Scraper(ABC):
    def __init__(self, url):
        self.url = url
        self.response = requests.get(url)
        self.tree = lxml.html.fromstring(self.response.text)

    @abstractmethod
    def get_title(self):
        pass

    @abstractmethod
    def get_language(self):
        pass

    @abstractmethod
    def get_text(self):
        pass

    @abstractmethod
    def get_author(self):
        pass

    @abstractmethod
    def get_metadata(self):
        pass

    @abstractmethod
    def get_chapters(self):
        pass

    @abstractmethod
    def get_chapter(self, chapter):
        pass

    @abstractmethod
    def get_chapter_text(self, chapter):
        pass


class Gutenberg(ABC):
    def __init__(self, url):
        self.url = url
        self.response = requests.get(url)
        self.tree = lxml.html.fromstring(self.response.text)

    @staticmethod
    def _parse_ebook_metadata(text):
        metadata = {}
        for line in text.splitlines():
            match = re.match(r"(.*?):\s*(.*)", line)
            if match:
                key, value = match.groups()
                metadata[key] = value
        return metadata

    def _get_meta_data(self):
        return self.tree.xpath("/html/body/p[1]")[0].text_content()

    def get_title(self):
        return self._parse_ebook_metadata(self._get_meta_data())["Title"]

    def get_language(self):
        return self._parse_ebook_metadata(self._get_meta_data())["Language"]

    def get_author(self):
        return self._parse_ebook_metadata(self._get_meta_data())["Author"]

    def get_chapters(self):
        return [
            chapter.text_content()
            for chapter in self.tree.xpath("/html/body/h2")
            if chapter.text_content() != "Contents"
        ]  # First one is the table of contents

In [46]:
g = Gutenberg("https://gutenberg.net.au/ebooks05/0500141h.html")

In [38]:
g = Gutenberg("https://gutenberg.net.au/ebooks01/0100011h.html")

In [49]:
g._get_meta_data()

'Cigars had burned low, and we were beginning to sample the\n  disillusionment that usually afflicts old school friends who have met again\n  as men and found themselves with less in common than they had believed they\n  had. Rutherford wrote novels; Wyland was one of the Embassy secretaries; he\n  had just given us dinner at Tempelhof—not very cheerfully, I fancied,\n  but with the equanimity which a diplomat must always keep on tap for such\n  occasions. It seemed likely that nothing but the fact of being three celibate\n  Englishmen in a foreign capital could have brought us together, and I had\n  already reached the conclusion that the slight touch of priggishness which I\n  remembered in Wyland Tertius had not diminished with years and an M.V.O.\n  Rutherford I liked more; he had ripened well out of the skinny, precocious\n  infant whom I had once alternately bullied and patronized. The probability\n  that he was making much more money and having a more interesting life than\n  ei

In [47]:
g.get_title()

KeyError: 'Title'

In [48]:
g.get_language()

KeyError: 'Language'

In [36]:
g.get_author()

'George Orwell'

In [43]:
g.get_chapters()

['Chapter I',
 'Chapter II',
 'Chapter III',
 'Chapter IV',
 'Chapter V',
 'Chapter VI',
 'Chapter VII',
 'Chapter VIII',
 'Chapter IX',
 'Chapter X',
 'THE END']