In [1]:
import requests
import lxml.html

In [4]:
from lxml.html import fromstring, HTMLParser

In [5]:
def parse_content(tree):
  """
  コンテンツをパースします。

  Args:
    tree: HTMLツリー

  Returns:
    コンテンツのリスト
  """
  content = []
  for element in tree.xpath('/html/body/*'):
    if element.tag == "p":
      content.append({"type": "paragraph", "text": element.text})
    elif element.tag == "pre":
      content.append({"type": "code", "text": element.text})
    elif element.tag == "h1":
      content.append({"type": "part", "text": element.text})
    elif element.tag == "h2":
      # すべてのh2要素のテキストを取得
      texts = element.xpath('./text()')
      if texts:
        content.append({"type": "chapter", "text": " ".join(texts)})
  return content

In [6]:
response = requests.get("https://gutenberg.net.au/ebooks01/0100011h.html")
html = response.text
tree = fromstring(html, parser=HTMLParser()) 

content = parse_content(tree)
#print(content)

In [7]:
content

[{'type': 'paragraph', 'text': 'Title: Animal Farm'},
 {'type': 'paragraph', 'text': None},
 {'type': 'paragraph', 'text': '*'},
 {'type': 'part', 'text': 'Animal Farm'},
 {'type': 'paragraph', 'text': 'by'},
 {'type': 'chapter', 'text': 'Contents'},
 {'type': 'paragraph', 'text': '\n'},
 {'type': 'chapter', 'text': 'Chapter I'},
 {'type': 'paragraph',
  'text': 'Mr. Jones, of the Manor Farm, had locked the hen-houses for the\nnight, but was too drunk to remember to shut the pop-holes. With\nthe ring of light from his lantern dancing from side to side, he\nlurched across the yard, kicked off his boots at the back door,\ndrew himself a last glass of beer from the barrel in the scullery,\nand made his way up to bed, where Mrs. Jones was already\nsnoring.'},
 {'type': 'paragraph',
  'text': "As soon as the light in the bedroom went out there was a\nstirring and a fluttering all through the farm buildings. Word had\ngone round during the day that old Major, the prize Middle White\nboar, ha

In [10]:
tree = lxml.html.fromstring(response.text)
tree.xpath('/html/body/h1')[0].text_content()

'Anne of Windy Poplars'

In [13]:
lxml.html.tostring(tree.xpath('/html/body/h1')[0])

b'<h1>Animal Farm</h1>\n\n'

In [11]:
from abc import ABC
from abc import abstractmethod
import re

import lxml.html
import requests


class Scraper(ABC):
    def __init__(self, url):
        self.url = url
        self.response = requests.get(url)
        self.tree = lxml.html.fromstring(self.response.text)

    @abstractmethod
    def get_title(self):
        pass

    @abstractmethod
    def get_language(self):
        pass

    @abstractmethod
    def get_text(self):
        pass

    @abstractmethod
    def get_author(self):
        pass

    @abstractmethod
    def get_metadata(self):
        pass

    @abstractmethod
    def get_chapters(self):
        pass

    @abstractmethod
    def get_chapter(self, chapter):
        pass

    @abstractmethod
    def get_chapter_text(self, chapter):
        pass


class Gutenberg(ABC):
    def __init__(self, url):
        self.url = url
        self.response = requests.get(url)
        self.tree = lxml.html.fromstring(self.response.text)

    @staticmethod
    def _parse_ebook_metadata(text):
        metadata = {}
        for line in text.splitlines():
            match = re.match(r"(.*?):\s*(.*)", line)
            if match:
                key, value = match.groups()
                metadata[key] = value
        return metadata

    def _get_meta_data(self):
        return self.tree.xpath("/html/body/p[1]")[0].text_content()

    def get_title(self):
        return self._parse_ebook_metadata(self._get_meta_data())["Title"]
        # return self.tree.xpath("/html/body/h1")[0].text_content()

    def get_language(self):
        return self.tree.xpath("/html/body/h2")[0].text_content()

    def get_text(self):
        return self.tree.xpath("/html/body/pre")[0].text_content()

    def get_published_year(self):
        #return self.tree.xpath("/html/body/h3[1]")
        return self.tree.xpath("/html/body/p")
        #/html/body/p[277]

    def get_author(self):
        return self.tree.xpath("/html/body/p[4]/text()[3]")[0].strip()

    def get_metadata(self):
        return {"title": self.get_title(), "author": self.get_author()}

    def get_chapters(self):
        return self.tree.xpath("/html/body/h3")

    def get_chapter(self, chapter):
        return self.tree.xpath("/html/body/h3")[chapter].text_content()

    def get_chapter_text(self, chapter):
        return self.tree.xpath("/html/body/h3")[chapter].tail

In [12]:
g = Gutenberg("https://gutenberg.net.au/ebooks01/0100011h.html")

In [13]:
a = g.get_published_year()

In [14]:
a[20].text_content()

'"I have little more to say. I merely repeat, remember always\nyour duty of enmity towards Man and all his ways. Whatever goes\nupon two legs is an enemy. Whatever goes upon four legs, or has\nwings, is a friend. And remember also that in fighting against Man,\nwe must not come to resemble him. Even when you have conquered him,\ndo not adopt his vices. No animal must ever live in a house, or\nsleep in a bed, or wear clothes, or drink alcohol, or smoke\ntobacco, or touch money, or engage in trade. All the habits of Man\nare evil. And, above all, no animal must ever tyrannise over his\nown kind. Weak or strong, clever or simple, we are all brothers. No\nanimal must ever kill any other animal. All animals are equal.'

In [140]:
g.get_author().strip()

'George Orwell'

In [None]:
class TestGutenberg:
    url = "https://gutenberg.net.au/ebooks01/0100011h.html"
    title = 'Animal Farm'
    author = 'George Orwell'

    def test_var_positive(self) -> None:
        assert self.VAR >= 0

In [None]:
g = Gutenberg("https://gutenberg.net.au/ebooks01/0100011h.html")