In [11]:
import requests
import lxml.html

In [12]:
response = requests.get("https://gutenberg.net.au/ebooks01/0100011h.html")

In [13]:
tree = lxml.html.fromstring(response.text)

In [14]:
tree.xpath('/html/body/p[1]')[0].text_content()

'Title: Animal Farm\nAuthor: George Orwell\neBook No.: 0100011h.html\nLanguage: English\nDate first posted: August 2001\nMost recent update: March 2008\n\nThis eBook was produced by: Colin Choat'

In [15]:
tree = lxml.html.fromstring(response.text)
tree.xpath('/html/body/h1')[0].text_content()

'Animal Farm'

In [16]:
tree.xpath("/html/body/h2")[0].text_content()

'Contents'

In [17]:
from lxml import etree

def parse_content(tree):
  """
  コンテンツをパースします。

  Args:
    tree: HTMLツリー

  Returns:
    コンテンツのリスト
  """
  content = []
  for element in tree.xpath('/html/body/*'):
    if element.tag == "p":
      content.append({"type": "paragraph", "text": element.text})
    elif element.tag == "pre":
      content.append({"type": "code", "text": element.text})
    elif element.tag == "h1":
      content.append({"type": "heading", "text": element.text})
  return content

# 例
html = """
<html>
<body>
<h1>タイトル</h1>
<p>本文</p>
<pre>コード</pre>
</body>
</html>
"""

tree = etree.fromstring(html)

content = parse_content(tree)

print(content)

[{'type': 'heading', 'text': 'タイトル'}, {'type': 'paragraph', 'text': '本文'}, {'type': 'code', 'text': 'コード'}]


In [18]:
tree.xpath('/html/body/p[6]')[0].text_content()

IndexError: list index out of range

In [19]:
tree.xpath('/html/body/pre[1]')[0].text_content()

AttributeError: 'lxml.etree._Element' object has no attribute 'text_content'

In [13]:
lxml.html.tostring(tree.xpath('/html/body/p[5]/br[1])[0])

b'<h1>Animal Farm</h1>\n\n'

In [20]:
from abc import ABC
from abc import abstractmethod
import re

import lxml.html
import requests


class Scraper(ABC):
    def __init__(self, url):
        self.url = url
        self.response = requests.get(url)
        self.tree = lxml.html.fromstring(self.response.text)

    @abstractmethod
    def get_title(self):
        pass

    @abstractmethod
    def get_language(self):
        pass

    @abstractmethod
    def get_text(self):
        pass

    @abstractmethod
    def get_author(self):
        pass

    @abstractmethod
    def get_metadata(self):
        pass

    @abstractmethod
    def get_chapters(self):
        pass

    @abstractmethod
    def get_chapter(self, chapter):
        pass

    @abstractmethod
    def get_chapter_text(self, chapter):
        pass


class Gutenberg(ABC):
    def __init__(self, url):
        self.url = url
        self.response = requests.get(url)
        self.tree = lxml.html.fromstring(self.response.text)

    @staticmethod
    def _parse_ebook_metadata(text):
        metadata = {}
        for line in text.splitlines():
            match = re.match(r"(.*?):\s*(.*)", line)
            if match:
                key, value = match.groups()
                metadata[key] = value
        return metadata

    def _get_meta_data(self):
        return self.tree.xpath("/html/body/p[1]")[0].text_content()

    def get_title(self):
        return self._parse_ebook_metadata(self._get_meta_data())["Title"]
        # return self.tree.xpath("/html/body/h1")[0].text_content()

    def get_language(self):
        return self.tree.xpath("/html/body/h2")[0].text_content()

    def get_text(self):
        return self.tree.xpath("/html/body/pre")[0].text_content()

    def get_author(self):
        return self.tree.xpath("/html/body/p[4]/text()[3]")[0].strip()

    def get_metadata(self):
        return {"title": self.get_title(), "author": self.get_author()}

    def get_chapters(self):
        return self.tree.xpath("/html/body/h3")

    def get_chapter(self, chapter):
        return self.tree.xpath("/html/body/h3")[chapter].text_content()

    def get_chapter_text(self, chapter):
        return self.tree.xpath("/html/body/h3")[chapter].tail

In [21]:
g = Gutenberg("https://gutenberg.net.au/ebooks01/0100011h.html")

In [22]:
g.get_title()

'Animal Farm'

In [24]:
g.get_text()

"\nBeasts of England, beasts of Ireland,\nBeasts of every land and clime,\nHearken to my joyful tidings\nOf the golden future time.\n\nSoon or late the day is coming,\nTyrant Man shall be o'erthrown,\nAnd the fruitful fields of England\nShall be trod by beasts alone.\n\nRings shall vanish from our noses,\nAnd the harness from our back,\nBit and spur shall rust forever,\nCruel whips no more shall crack.\n\nRiches more than mind can picture,\nWheat and barley, oats and hay,\nClover, beans, and mangel-wurzels\nShall be ours upon that day.\n\nBright will shine the fields of England,\nPurer shall its waters be,\nSweeter yet shall blow its breezes\nOn the day that sets us free.\n\nFor that day we all must labour,\nThough we die before it break;\nCows and horses, geese and turkeys,\nAll must toil for freedom's sake.\n\nBeasts of England, beasts of Ireland,\nBeasts of every land and clime,\nHearken well and spread my tidings\nOf the golden future time.\n\n"

In [23]:
g.get_author().strip()

'George Orwell'

In [None]:
class TestGutenberg:
    url = "https://gutenberg.net.au/ebooks01/0100011h.html"
    title = 'Animal Farm'
    author = 'George Orwell'

    def test_var_positive(self) -> None:
        assert self.VAR >= 0

In [None]:
g = Gutenberg("https://gutenberg.net.au/ebooks01/0100011h.html")