In [4]:
import base64
import xml.etree.ElementTree as ET
from xml import etree

import lxml.etree as ltree
import requests
import structlog
from lxml.etree import XMLParser


def get_file_info(url):
    """URLを引数にして、ファイルの情報をdictにして返す."""
    response = requests.get(url)

    if response.status_code == 200:
        tree_info = response.json()
        file_info_dict: dict[str, bytes] = {}
        for file_info in tree_info["tree"]:
            file_name = file_info["path"]
            file_type = file_info["type"]

            # ファイルの内容を取得
            if file_type != "blob":
                msg = f"Error: {file_type} is not supported."
                raise Exception(msg)

            url = (
                "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/blobs/"
                + file_info["sha"]
            )
            response = requests.get(url)
            if response.status_code == 200:
                file_content = base64.b64decode(response.json()["content"])
                file_info_dict[file_name] = file_content
            else:
                msg = f"Error: {response.status_code}"
                raise Exception(msg)
        return file_info_dict
    else:
        msg = f"Error: {response.status_code}"
        raise Exception(msg)


# 結果を出力
# print(file_info_dict)


class TextComponent:
    def __init__(self, title, sections):
        self.title = title
        self.sections = sections


class Section:
    def __init__(self, section_id, epub_type, hgroup_info, paragraphs):
        self.section_id = section_id
        self.epub_type = epub_type
        self.hgroup_info = hgroup_info
        self.paragraphs = paragraphs


class HgroupInfo:
    def __init__(self, ordinal, title):
        self.ordinal = ordinal
        self.title = title


def get_page_title(data_root):
    namespaces = {"xhtml": "http://www.w3.org/1999/xhtml"}
    return data_root.xpath("//xhtml:head/xhtml:title", namespaces=namespaces)[0].text


def parse_xhtml(data):
    """XHTMLデータを解析し、TextComponentインスタンスを生成します。.

    Args:
        data: XHTMLデータ

    Returns:
        TextComponentインスタンス
    """
    namespaces = {"xhtml": "http://www.w3.org/1999/xhtml"}
    root = ltree.fromstring(data, nsmap=namespaces)

    title = get_page_title(root)

    # セクション情報の取得
    sections = []
    for section in root.xpath("//xhtml:body/xhtml:section", namespaces=namespaces):
        # セクションID
        section_id = section.get("id")

        # hgroup情報の取得
        hgroup_info = HgroupInfo(
            ordinal=section.find("xhtml:h2", namespaces=namespaces).text
            if section.find("xhtml:h2", namespaces=namespaces) is not None
            else None,
            title=section.find("xhtml:p", namespaces=namespaces).text
            if section.find("xhtml:p", namespaces=namespaces) is not None
            else None,
        )

        # p要素の内容を取得
        paragraphs = []
        for paragraph in section.findall("xhtml:p", namespaces=namespaces):
            paragraphs.append(paragraph.text)

        # セクション情報に追加
        sections.append(Section(section_id, section.get("epub:type"), hgroup_info, paragraphs))

    # データ構造の生成
    text_component = TextComponent(title, sections)

    return text_component

In [7]:
url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/trees/master:src/epub/text"
file_info_dict = get_file_info(url)

Exception: Error: 403

In [6]:

xml_string = file_info_dict.get("chapter-1.xhtml")
parser = XMLParser(
    encoding="UTF-8", resolve_entities=False, strip_cdata=False, recover=True, ns_clean=True
)
root = ET.fromstring(xml_string, parser)
pretty_xml = etree.tostring(root, pretty_print=True, encoding=str)
print(pretty_xml)

Exception: Error: 403

In [2]:
url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/trees/master:src/epub/text"
file_info_dict = get_file_info(url)
content_raw = file_info_dict.get("chapter-1.xhtml").get("content")
xml_string = base64.b64decode(content_raw)
parser = XMLParser(
    encoding="UTF-8", resolve_entities=False, strip_cdata=False, recover=True, ns_clean=True
)
root = ET.fromstring(xml_string, parser)
pretty_xml = etree.tostring(root, pretty_print=True, encoding=str)
#print(pretty_xml)

Error: 403


AttributeError: 'NoneType' object has no attribute 'get'

In [None]:
root

In [None]:
import xml.etree.ElementTree as ET

parser = etree.XMLParser(encoding="UTF-8", resolve_entities=False, strip_cdata=False, recover=True, ns_clean=True)
root = ET.fromstring(xml_string, parser)

In [None]:
pretty_xml = etree.tostring(root, pretty_print=True, encoding=str)

In [None]:
import xml.dom.minidom

document = """\
<slideshow>
<title>Demo slideshow</title>
<slide><title>Slide title</title>
<point>This is a demo</point>
<point>Of a program for processing slides</point>
</slide>

<slide><title>Another demo slide</title>
<point>It is important</point>
<point>To have more than</point>
<point>one slide</point>
</slide>
</slideshow>
"""

dom = xml.dom.minidom.parseString(document)

In [None]:
dom.ge

In [None]:
print(pretty_xml[:1000])

In [None]:
data = pretty_xml

In [None]:
from lxml import etree

#root = pretty_xml
# タイトル取得
title = root.xpath("//xhtml:head/xhtml:title", namespaces=namespaces)[0].text

# セクション情報の取得
section_info = {}
for section in root.xpath("//xhtml:body/xhtml:section", namespaces=namespaces):
    # セクションID
    section_id = section.get("id")
    
    # hgroup情報の取得
    hgroup_info = {}
    hgroup = section.find("xhtml:hgroup", namespaces=namespaces)
    if hgroup is not None:
        # 章番号
        hgroup_info["ordinal"] = hgroup.find("xhtml:h2", namespaces=namespaces).text
        # タイトル
        hgroup_info["title"] = hgroup.find("xhtml:p", namespaces=namespaces).text
    
    # p要素の内容を取得
    paragraphs = []
    for paragraph in section.findall("xhtml:p", namespaces=namespaces):
        paragraphs.append(paragraph.text)
    
    # セクション情報に追加
    section_info[section_id] = hgroup_info, paragraphs

# 結果の出力
print(f"タイトル: {title}")
for section_id, (hgroup_info, paragraphs) in section_info.items():
    print(f"\nセクションID: {section_id} ({section.get('epub:type')})")
    print(f"  章番号: {hgroup_info['ordinal']}")
    print(f"  タイトル: {hgroup_info['title']}")
    print(f"  本文:")
    for paragraph in paragraphs:
        print(f"    {paragraph}")

In [None]:
class TextComponent:
    def __init__(self, title, sections):
        self.title = title
        self.sections = sections

class Section:
    def __init__(self, section_id, epub_type, hgroup_info, paragraphs):
        self.section_id = section_id
        self.epub_type = epub_type
        self.hgroup_info = hgroup_info
        self.paragraphs = paragraphs

class HgroupInfo:
    def __init__(self, ordinal, title):
        self.ordinal = ordinal
        self.title = title

In [None]:
def parse_xhtml(data):
    """
    XHTMLデータを解析し、TextComponentインスタンスを生成します。

    Args:
        data: XHTMLデータ

    Returns:
        TextComponentインスタンス
    """

    # ネームスペースを定義
    namespaces = {'xhtml': 'http://www.w3.org/1999/xhtml'}

    # ルート要素を取得
    root = etree.fromstring(data, nsmap=namespaces)

    # タイトル取得
    title = root.xpath("//xhtml:head/xhtml:title", namespaces=namespaces)[0].text

    # セクション情報の取得
    sections = []
    for section in root.xpath("//xhtml:body/xhtml:section", namespaces=namespaces):
        # セクションID
        section_id = section.get("id")

        # hgroup情報の取得
        hgroup_info = HgroupInfo(
            ordinal=section.find("xhtml:h2", namespaces=namespaces).text if section.find("xhtml:h2", namespaces=namespaces) is not None else None,
            title=section.find("xhtml:p", namespaces=namespaces).text if section.find("xhtml:p", namespaces=namespaces) is not None else None,
        )

        # p要素の内容を取得
        paragraphs = []
        for paragraph in section.findall("xhtml:p", namespaces=namespaces):
            paragraphs.append(paragraph.text)

        # セクション情報に追加
        sections.append(Section(section_id, section.get("epub:type"), hgroup_info, paragraphs))

    # データ構造の生成
    text_component = TextComponent(title, sections)

    return text_component

In [None]:
text_component.sections[0].paragraphs

In [None]:
title_element = root.xpath("//xhtml:head/xhtml:title", namespaces=namespaces)[0]

In [None]:
title_element.text

In [None]:
a = root.xpath('//xhtml:section//xhtml:p', namespaces=namespaces)

In [None]:
a[0]

In [None]:
for i in a:
    print(i.text)

In [None]:
a.text

In [None]:
title_element = html.xpath("/xhtml:head/xhtml:title")

In [None]:
title_element = root.xpath("//xhtml:head/xhtml:title", namespaces=namespaces)[0].text

In [None]:
title_element

In [None]:
namespaces = {'xhtml': 'http://www.w3.org/1999/xhtml'}
root = etree.fromstring(data, nsmap=namespaces)

a = root.xpath('//xhtml:section//xhtml:p/text()')[0].text_content()
print(a)  # 出力: hoge

In [None]:
html = lxml.html.fromstring(content_raw)

In [None]:
a = html.xpath('//xhtml:section//xhtml:p/text()')[0].text_content()

In [None]:
a

In [None]:
base64.b64decode(a).decode()

In [None]:
title = root.find(etree.XPath("/html/head/title")) 

In [None]:
# タイトル要素を取得
title = root.find(etree.XPath("/html/head/title"))

# タイトルテキストを出力
print(title.text)

# 各段落要素を取得
paragraphs = root.findall(etree.XPath("/html/body/p"))

# 各段落テキストを出力
for paragraph in paragraphs:
  print(paragraph.text)

In [None]:
r = response

if r.encoding is None or r.encoding == 'ISO-8859-1':
    r.encoding = r.apparent_encoding

In [None]:
import base64

In [None]:
import requests

url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/blobs/" + file_info["sha"]

response = requests.get(url)

if response.status_code == 200:
    file_content = base64.b64decode(response.json()["content"]).decode()
    #file_content = response.json()["content"].encode
    print(f"ファイル名: {file_name}")
    print(f"ファイル内容: {file_content}")
else:
    print(f"Error: {response.status_code}")

In [None]:
for d in file_info_dict.items():
    print(d[1]["content"])
    break

In [None]:
a = d[1]["content"]

In [None]:
import lxml

In [None]:
lxml.html.etree(a)

In [None]:
import requests
import base64

def get_file_info(url):
  """
  URLを引数にして、ファイルの情報をdictにして返す
  """

  response = requests.get(url)

  if response.status_code == 200:
    # ファイルツリーの情報
    tree_info = response.json()
    file_info_dict = {}
    for file_info in tree_info["tree"]:
      file_name = file_info["path"]
      file_type = file_info["type"]

      # ファイルの内容を取得
      if file_type == "blob":
        url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/blobs/" + file_info["sha"]
        response = requests.get(url)
        if response.status_code == 200:
          # ファイルの内容
          file_content = base64.b64decode(response.json()["content"]).decode()
          file_info_dict[file_name] = {
            "type": file_type,
            "content": file_content
          }
        else:
          print(f"Error: {response.status_code}")
      else:
        file_info_dict[file_name] = {
          "type": file_type
        }
    return file_info_dict
  else:
    print(f"Error: {response.status_code}")

# 例
url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/trees/master:src/epub/text"
file_info_dict = get_file_info(url)

# 結果を出力
print(file_info_dict)

In [None]:
import requests
import base64

def get_file_info(url):
  """
  URLを引数にして、ファイルの情報をdictにして返す
  """

  response = requests.get(url)

  if response.status_code == 200:
    # ファイルツリーの情報
    tree_info = response.json()
    file_info_dict = {}
    for file_info in tree_info["tree"]:
      file_name = file_info["path"]
      file_type = file_info["type"]

      # ファイルの内容を取得
      if file_type == "blob":
        url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/blobs/" + file_info["sha"]
        response = requests.get(url)
        if response.status_code == 200:
          # ファイルの内容
          file_content = base64.b64decode(response.json()["content"]).decode()
          file_info_dict[file_name] = {
            "type": file_type,
            "content": file_content
          }
        else:
          print(f"Error: {response.status_code}")
      else:
        file_info_dict[file_name] = {
          "type": file_type
        }
    return file_info_dict
  else:
    print(f"Error: {response.status_code}")

# 例
url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/trees/master:src/epub/text"
file_info_dict = get_file_info(url)

# 結果を出力
print(file_info_dict)

In [None]:
import requests

url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/trees/master:src/epub/text"

response = requests.get(url)

if response.status_code == 200:
    tree_info = response.json()
    for file_info in tree_info["tree"]:
        file_name = file_info["path"]
        file_type = file_info["type"]
        
        # ファイルの内容を取得
        if file_type == "blob":
            url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/blobs/" + file_info["sha"]
            response = requests.get(url)
            if response.status_code == 200:
                # ファイルの内容
                file_content = base64.b64decode(response.json()["content"]).decode()
                print(f"ファイル名: {file_name}")
                print(f"ファイル内容: {file_content}")
            else:
                print(f"Error: {response.status_code}")
        else:
            print(f"ファイル名: {file_name}")
            print(f"ファイルの種類: {file_type}")
else:
    print(f"Error: {response.status_code}")

In [None]:

import requests

url = "https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/trees/master:src/epub/text"

response = requests.get(url)

if response.status_code == 200:
    # ファイルツリーの情報
    tree_info = response.json()
    for file_info in tree_info["tree"]:
        print(file_info)
else:
    print(f"Error: {response.status_code}")

In [None]:

import requests

url = "https://github.com/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/tree/master/src/epub/text"

response = requests.get(url)

if response.status_code == 200:
    # ファイルの情報
    file_info = response.json()
    print(file_info)
else:
    print(f"Error: {response.status_code}")

In [None]:
import subprocess

def curl_hoge():
    result = subprocess.run(["curl", "https://github.com/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/tree/master/src/epub/text"], capture_output=True)
    if result.returncode == 0:
        return json.loads(result.stdout.decode("utf-8"))
    else:
        raise RuntimeError(f"curl hoge failed with code {result.returncode}")

response_dict = curl_hoge()
print(response_dict)

In [None]:
# URLの完全化
from urllib.parse import urljoin
base_url = "https://standardebooks.org"
page_url = urljoin(base_url, link_url)

In [None]:
# discription
url = get_url_from_xpath("//div[@class='downloads-container']//a[@property='schema:contentUrl' and contains(@class, 'page')]/@href")

In [None]:
# URLの完全化
from urllib.parse import urljoin
base_url = "https://standardebooks.org"
#page_url = urljoin(base_url, url)
base_url + "/" + url

In [None]:
url

In [None]:
# discription
git_url = get_url_from_xpath('//*[@id="details"]/ul/li[1]/p/a/@href')

In [None]:
text_list = git_url + "/tree/master/src/epub/text"

In [None]:
text_list

In [None]:
# discription
git_url = get_url_from_xpath('//*[@id="details"]/ul/li[1]/p/a')

In [None]:
response = requests.get('https://github.com/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/tree/master/src/epub/text')
tree = lxml.html.fromstring(response.text.encode('utf-8'))

In [None]:
link_element = tree.xpath('.//a[@class="Link--primary"]') 

In [None]:
link_element

In [None]:
# `a`要素を抽出
link_element = tree.xpath('.//a[@class="Link--primary"]')  

if link_element is not None:
    link = link_element.get('href')
    print("ファイルへのリンク:", link)  
else:
    print("Link was not found")


In [None]:
# `a`要素を抽出
link_element = tree.xpath('/html/body/div[1]/div[6]/div/main/turbo-frame/div/react-app/div/div/div[1]/div/div/div[2]/div[2]/div/div[3]/div[3]/div/table/tbody/tr[2]')  

In [None]:
link_element

In [None]:
# タイトルを取得
title = tree.findtext(".//title")

# 本文を取得
body = tree.find(".//main[@id='content']")

# 処理結果を出力
print(f"タイトル: {title}")
print(f"本文: {body}")

In [None]:
link = tree.xpath('//*[@id="folder-row-1"]/td[2]/div/div/h3/div/a')

In [None]:
link

In [None]:
//*[@id="folder-row-1"]/td[2]/div/div/h3/div/a

In [None]:
# discription
tree.xpath('/html/body/div[1]/div[6]/div/main/turbo-frame/div/react-app/div/div/div[1]/div/div/div[2]/div[2]/div/div[3]/div[3]/div/table/tbody/tr[2]/td[2]/div/div/h3/div/a/@href') 

In [None]:
//*[@id="folder-row-1"]/td[2]/div/div/h3/div/a