In [48]:
import glob
import os

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [49]:
from xml.dom import minidom
import re
def prettify(rough_string):
    """Return ``rough_string`` re-serialized as tab-indented XML without blank lines.

    The text is parsed with ``xml.dom.minidom`` and pretty-printed with one tab
    per nesting level.  Three clean-up passes then remove the whitespace-only
    and empty lines left in the pretty-printed text.

    Args:
        rough_string: A well-formed XML document as text.

    Returns:
        The indented XML text.

    Raises:
        xml.parsers.expat.ExpatError: If ``rough_string`` is not well-formed.
    """
    document = minidom.parseString(rough_string)
    indented = document.toprettyxml(indent="\t")

    # Pass 1: drop every run of tabs/spaces that sits right before a newline,
    # together with that newline (removes the stray lines left after indenting).
    without_trailing_ws = re.sub(r"[\t ]+\n", "", indented)

    # Pass 2: collapse an empty line between a closing ">" and an indented "<".
    without_gap = without_trailing_ws.replace(">\n\n\t<", ">\n\t<")

    # Pass 3: squeeze consecutive newlines (including whitespace-only lines)
    # down to a single newline.
    return re.sub(r"\n\s*\n", "\n", without_gap)

In [51]:
def fix_pb(soup):
    """Point each ``pb`` at the image region described by the zone it references.

    For every ``pb`` whose ``facs`` holds a ``#zone-id`` pointer, the zone's
    ``ulx``/``uly``/``lrx``/``lry`` corners are turned into an
    ``x,y,width,height`` region that replaces the ``/full/full/`` segment of
    the URL of the first ``graphic`` found under the zone's parent (this looks
    like an IIIF Image API URL -- confirm against the data).  The region URL
    becomes the new ``facs`` and the original pointer is kept in ``corresp``.

    ``pb`` elements without a ``#`` pointer in ``facs`` are left untouched, so
    page breaks without an image and already-converted documents are skipped
    instead of raising.

    Args:
        soup: Parsed document tree (BeautifulSoup); modified in place.

    Raises:
        ValueError: If a pointer names a zone that is not in the document, or
            the zone's parent holds no ``graphic`` with a ``url``.
    """
    # Index the zones once instead of searching the whole tree for every pb.
    # setdefault keeps the first zone per id, i.e. the one a tree search
    # in document order would return.
    zones_by_id = {}
    for zone in soup.find_all("zone"):
        zones_by_id.setdefault(zone.get("xml:id"), zone)

    for pb in soup.find_all("pb"):
        facs = pb.get("facs")
        if not facs or "#" not in facs:
            # Nothing to resolve for this page break.
            continue

        zone_id = facs.split("#")[1]
        zone = zones_by_id.get(zone_id)
        if zone is None:
            raise ValueError(f"pb facs={facs!r} points to a zone that does not exist")

        graphic = zone.parent.find("graphic")
        url = graphic.get("url") if graphic is not None else None
        if url is None:
            raise ValueError(f"no graphic url found next to zone {zone_id!r}")

        # The upper-left corner is copied verbatim; width and height are
        # derived from the lower-right corner.
        x = zone.get("ulx")
        y = zone.get("uly")
        w = int(zone.get("lrx")) - int(x)
        h = int(zone.get("lry")) - int(y)

        pb["facs"] = url.replace("/full/full/", f"/{x},{y},{w},{h}/full/")
        pb["corresp"] = f"#{zone_id}"

def fix_facs(soup, session=None, timeout=30):
    """Replace the ``surfaceGrp`` with flat, fully described ``surface`` elements.

    The ``facs`` value of ``surfaceGrp`` is copied to ``facsimile/@sameAs``.
    For each ``surface`` in the group a new ``surface`` is appended to
    ``facsimile`` with:

    * ``xml:id`` ``f001``, ``f002``, ... in document order,
    * a bounding box from ``0,0`` to the ``width``/``height`` reported by the
      image's ``info.json`` (fetched over HTTP from the graphic URL's prefix
      before ``/full/full/``; presumably an IIIF image service -- confirm),
    * ``sameAs`` taken from the old graphic's ``n`` attribute,
    * a new ``graphic`` carrying pixel dimensions, the original ``url`` and a
      ``sameAs`` holding the URL prefix,
    * all ``zone`` elements of the old surface, moved over.

    The old ``surfaceGrp`` is removed afterwards.  When the document has no
    ``surfaceGrp`` there is nothing to convert and the function returns.

    Args:
        soup: Parsed document tree (BeautifulSoup); modified in place.
        session: Optional object with a ``requests``-style ``get`` method
            (e.g. a ``requests.Session`` for connection reuse).  Defaults to
            the ``requests`` module.
        timeout: Timeout passed to each ``get`` call so that a stalled server
            cannot hang the conversion.

    Raises:
        ValueError: If a ``surfaceGrp`` exists but no ``facsimile`` does.
        KeyError: If a required attribute (``facs``, ``url``, ``n``) or an
            ``info.json`` field (``width``, ``height``) is missing.
        Exception: Whatever ``get`` / ``raise_for_status`` / ``json`` raise on
            network or HTTP failure.
    """
    surface_grp = soup.find("surfaceGrp")
    if surface_grp is None:
        return

    facsimile = soup.find("facsimile")
    if facsimile is None:
        raise ValueError("document has a surfaceGrp but no facsimile element")

    http = session if session is not None else requests

    facsimile["sameAs"] = surface_grp["facs"]

    for number, surface in enumerate(surface_grp.find_all("surface"), start=1):
        graphic = surface.find("graphic")
        image_url = graphic["url"]
        service_url = image_url.split("/full/full/")[0]

        response = http.get(service_url + "/info.json", timeout=timeout)
        # Fail on HTTP errors here rather than on an unparsable error page.
        response.raise_for_status()
        info = response.json()
        width = str(info["width"])
        height = str(info["height"])

        surface_new = soup.new_tag("surface")
        surface_new["xml:id"] = f"f{number:03d}"
        surface_new["ulx"] = "0"
        surface_new["uly"] = "0"
        surface_new["lrx"] = width
        surface_new["lry"] = height
        surface_new["sameAs"] = graphic["n"]

        graphic_new = soup.new_tag("graphic")
        graphic_new["width"] = width + "px"
        graphic_new["height"] = height + "px"
        graphic_new["url"] = image_url
        graphic_new["sameAs"] = service_url
        surface_new.append(graphic_new)

        # Appending an existing zone moves it out of the old surface.
        for zone in surface.find_all("zone"):
            surface_new.append(zone)

        facsimile.append(surface_new)

    surface_grp.decompose()

def add_revision(
    soup,
    when="2024-06-28",
    who="#snakamura",
    message="pb要素のfacs属性を修正しました。facsimile要素を修正しました。",
):
    """Record a ``change`` entry in the document's ``revisionDesc``.

    An existing ``revisionDesc`` inside ``teiHeader`` is reused so the header
    never ends up with two of them; otherwise a new one with
    ``status="published"`` is appended to ``teiHeader``.  The new ``change``
    is appended as the last child of the ``revisionDesc``.

    Args:
        soup: Parsed document tree (BeautifulSoup); modified in place.
        when: Value of the ``when`` attribute of the ``change`` element.
        who: Value of the ``who`` attribute of the ``change`` element.
        message: Text content of the ``change`` element.  The default states
            that the ``facs`` attribute of ``pb`` elements and the
            ``facsimile`` element were fixed.

    Raises:
        ValueError: If the document has no ``teiHeader``.
    """
    tei_header = soup.find("teiHeader")
    if tei_header is None:
        raise ValueError("document has no teiHeader to attach a revisionDesc to")

    revision_desc = tei_header.find("revisionDesc")
    if revision_desc is None:
        revision_desc = soup.new_tag("revisionDesc", status="published")
        tei_header.append(revision_desc)

    change = soup.new_tag("change", when=when, who=who)
    change.string = message
    revision_desc.append(change)

def fix_resp(soup, name="Satoru Nakamura", xml_id="snakamura"):
    """Give the responsibility statement mentioning ``name`` an ``xml:id``.

    Only the first ``respStmt`` (in document order) whose text contains
    ``name`` is tagged: an ``xml:id`` value must be unique within a document,
    so assigning the same id to several statements would make it invalid.

    Args:
        soup: Parsed document tree (BeautifulSoup); modified in place.
        name: Substring looked for in the text of each ``respStmt``.
        xml_id: Identifier stored in the ``xml:id`` attribute of the match.
    """
    for resp_stmt in soup.find_all("respStmt"):
        if name in resp_stmt.text:
            resp_stmt["xml:id"] = xml_id
            break

# Source TEI files and the directory that receives the converted copies.
SRC_GLOB = "../../tei/*.xml"
OUT_DIR = "../../xml/lw/"

# XML declaration plus the schema and stylesheet processing instructions that
# are put in front of every document before it is parsed.
XML_PROLOG = '''<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="https://kouigenjimonogatari.github.io/lw/tei_genji.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-stylesheet type="text/css" href="https://kouigenjimonogatari.github.io/lw/tei_genji.css"?>
'''

# Make sure the destination exists so the first write cannot fail on a
# fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

files = glob.glob(SRC_GLOB)
files.sort()

for file in tqdm(files):
    # basename() instead of splitting on "/" keeps this working with any
    # path separator glob returns.
    opath = os.path.join(OUT_DIR, os.path.basename(file))

    # Read with an explicit encoding: the prolog declares UTF-8, so the
    # platform's default encoding must not be used.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    # The input file is closed before the (network-bound) conversion starts.
    text = XML_PROLOG + text
    soup = BeautifulSoup(text, "xml")

    fix_pb(soup)
    fix_facs(soup)
    add_revision(soup)
    fix_resp(soup)

    with open(opath, "w", encoding="utf-8") as f:
        f.write(prettify(str(soup)))

100%|██████████| 54/54 [01:16<00:00,  1.42s/it]
