<a href="https://colab.research.google.com/github/imabari/ImabariScraping/blob/master/%E3%81%82%E3%82%8B%E5%A0%B4%E6%89%80%E3%81%AB%E6%BB%9E%E5%9C%A8%E3%81%97%E3%81%9F%E4%BA%BA%E3%80%85%E3%81%8C%E7%94%9F%E6%B6%AF%E3%81%AB%E6%BB%9E%E5%9C%A8%E3%81%97%E3%81%9F%E5%A0%B4%E6%89%80%E3%82%92%E8%A1%A8%E7%A4%BA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ダウンロード

In [1]:
# Fetch the TEI XML file into the working directory; -O keeps the remote file name (hobotei.xml)
!curl -O 'https://tripitaka.l.u-tokyo.ac.jp/hbgrn/hobotei.xml'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4472k  100 4472k    0     0  2125k      0  0:00:02  0:00:02 --:--:-- 2126k


In [2]:
from lxml import etree

In [3]:
import pandas as pd

In [4]:
# Prefix map handed to every XPath query as `namespaces=ns`, so that "tei:" resolves to the TEI namespace
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

In [5]:
# Parse the downloaded file and keep its root element; the fetch_* functions query it with XPath
tree = etree.parse("hobotei.xml")
root = tree.getroot()

In [6]:
# Inspect the namespace mapping declared on the root element
root.nsmap

{None: 'http://www.tei-c.org/ns/1.0'}

In [7]:
# Language codes whose names are kept; the list order is the order in which
# the names are joined with "/" when place and person labels are built
langs = ["zh", "zh-latn", "ja-latn", "sa-latn", "ko-latn", "bo-latn"]

# 場所

In [8]:
def fetch_place(root):
    """Build a table of geolocated places from the TEI document.

    One row is produced for every ``tei:geo`` element found under
    ``tei:place/tei:location``.  Uses the notebook-level ``ns`` (XPath prefix
    map) and ``langs`` (language codes in display order).

    Args:
        root: lxml element to search; the notebook passes the document root.

    Returns:
        pandas.DataFrame indexed by ``gid`` (the ``xml:id`` of the place) with
        the columns ``place`` (names joined with "/" in ``langs`` order),
        ``lat`` and ``lon`` (numeric, NaN when the text cannot be parsed).
        Places without an ``xml:id`` are left out, and an input without any
        matching element yields an empty frame with the same columns.
    """
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"

    records = []

    for geo in root.xpath("//tei:place/tei:location/tei:geo", namespaces=ns):

        # geo -> location -> place: the XPath above guarantees both ancestors
        place_el = geo.getparent().getparent()

        gid = place_el.get(xml_ns + "id")
        if gid is None:
            # A place without an id can never be matched to a stay, so skip it
            continue

        names = {}
        for name in place_el.xpath("./tei:placeName", namespaces=ns):
            lang = name.get(xml_ns + "lang")
            # A name without text or without a language tag cannot be placed
            if not name.text or not lang:
                continue

            key = lang.strip().lower()
            # Same spelling fix as in fetch_person: "zh-latin" means "zh-latn"
            if key == "zh-latin":
                key = "zh-latn"

            names[key] = name.text.strip()

        # "lat,lon"; a missing or malformed value becomes NaN below instead of
        # raising while the frame is assembled
        lat_text, _, lon_text = (geo.text or "").partition(",")

        records.append(
            {
                "gid": gid.strip(),
                "place": "/".join(names[code] for code in langs if code in names),
                "lat": lat_text.strip(),
                "lon": lon_text.strip(),
            }
        )

    # Explicit columns keep the frame well-formed even when nothing matched
    df = pd.DataFrame(records, columns=["gid", "place", "lat", "lon"])

    df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
    df["lon"] = pd.to_numeric(df["lon"], errors="coerce")

    return df.set_index("gid")


In [9]:
# Build the place table (indexed by gid) and display it
df_place = fetch_place(root)
df_place

Unnamed: 0_level_0,place,lat,lon
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pl0000,豳州/Bin zhou,35.501100,107.939500
pl0001,阿育王山/A yu wang shan,29.853941,121.740856
pl0002,愛知/Aichi,35.180270,136.906717
pl0003,An hui,30.511200,117.043500
pl0004,安沙府/An sha fu,27.628631,111.457414
...,...,...,...
vh0458,清澄寺/Anyōji,35.160944,140.151389
vh0459,横川定光院/Anyōji,35.099000,135.853861
vh0460,薬師寺/Anyōji,34.668356,135.784311
vh0461,五坊寂静院/Anyōji,34.216639,135.581417


# 人物

In [10]:
def fetch_person(root):
    """Build a table of person display names from the TEI document.

    One row is produced for every ``tei:person`` element.  Uses the
    notebook-level ``ns`` (XPath prefix map) and ``langs`` (language codes in
    display order).

    Args:
        root: lxml element to search; the notebook passes the document root.

    Returns:
        pandas.DataFrame indexed by ``pid`` (the ``xml:id`` of the person) with
        a single column ``name`` holding the person's names joined with "/" in
        ``langs`` order.  Persons without an ``xml:id`` are left out, and an
        input without any person yields an empty frame with the same column.
    """
    xml_ns = "{http://www.w3.org/XML/1998/namespace}"

    records = []

    for person in root.xpath("//tei:person", namespaces=ns):

        pid = person.get(xml_ns + "id")
        if pid is None:
            # A person without an id can never be matched to a stay, so skip it
            continue

        names = {}
        for name in person.xpath("./tei:name/tei:persName", namespaces=ns):
            lang = name.get(xml_ns + "lang")
            # A name without text or without a language tag cannot be placed
            if not name.text or not lang:
                continue

            key = lang.strip().lower()
            # The data also spells this code "zh-latin"; fold it into "zh-latn"
            if key == "zh-latin":
                key = "zh-latn"

            names[key] = name.text.strip()

        records.append(
            {
                "pid": pid.strip(),
                "name": "/".join(names[code] for code in langs if code in names),
            }
        )

    # Explicit columns keep the frame well-formed even when nothing matched
    return pd.DataFrame(records, columns=["pid", "name"]).set_index("pid")

In [11]:
# Build the person table (indexed by pid) and display it
df_person = fetch_person(root)
df_person

Unnamed: 0_level_0,name
pid,Unnamed: 1_level_1
TP1,愛同/Ai tong/AIDŌ
TP2,阿地瞿多/A di qu duo/AJIKUTA/Atigupta
TP3,阿謨伽/A mo jia/AMOGA
TP4,阿目佉/A mu qu/AMOKKYA
TP5,安澄/ANCHŌ
...,...
rp0047,武/Wu
rp0048,武則天/Wu Zetian
rp0049,武宗/Wu zong
rp0050,*Yuiginan/Vighna


In [12]:
# Spot check: list the places whose label contains "西安"
df_place[df_place["place"].str.contains("西安")]

Unnamed: 0_level_0,place,lat,lon
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pl0005,西安/An xi,34.297686,108.938603


# 滞在先

In [13]:
def fetch_event(root):
    """List (pid, gid) pairs for every event that references a place.

    Only ``tei:event`` elements carrying a ``corresp`` attribute are read.
    The attribute may hold several whitespace-separated "#id" references;
    each reference becomes its own row.  Uses the notebook-level ``ns``.

    Args:
        root: lxml element to search; the notebook passes the document root.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index and the columns ``pid``
        (``xml:id`` of the event's grandparent element; in this notebook's
        data these are the person ids) and ``gid`` (referenced id without the
        leading "#").  Events whose grandparent has no ``xml:id`` are left
        out, and an input without any matching event yields an empty frame.
    """
    records = []

    for event in root.xpath("//tei:event[@corresp]", namespaces=ns):

        owner_ids = event.xpath("../../@xml:id", namespaces=ns)
        if not owner_ids:
            # No id to attribute the stay to
            continue

        records.append({"pid": owner_ids[0].strip(), "gid": event.get("corresp")})

    # Explicit columns keep the steps below working when nothing matched
    df0 = pd.DataFrame(records, columns=["pid", "gid"])

    # One row per referenced id
    df0["gid"] = df0["gid"].str.split()
    df1 = df0.explode("gid").reset_index(drop=True)

    df1["gid"] = df1["gid"].str.lstrip("#")

    return df1

In [14]:
# Build the table of stays (one row per pid/gid pair) and display it
df_event = fetch_event(root)
df_event

Unnamed: 0,pid,gid
0,TP2,pl0163
1,TP2,pl0034
2,TP5,pl0299
3,TP5,vh0063
4,TP6,pl0002
...,...,...
3124,TP913,vh0375
3125,TP913,pl0220
3126,TP913,vh0227
3127,TP913,pl0097


# 抽出

In [15]:
# Attach the place label, the coordinates and the person name to every stay;
# stays whose place label or coordinates are missing are dropped
df0 = (
    df_event
    .join(df_place, on="gid")
    .join(df_person, on="pid")
    .dropna(subset=["place", "lat", "lon"])
)
df0

Unnamed: 0,pid,gid,place,lat,lon,name
1,TP2,pl0034,長安/Chang an,34.297686,108.938603,阿地瞿多/A di qu duo/AJIKUTA/Atigupta
2,TP5,pl0299,奈良/Nara,34.685232,135.832833,安澄/ANCHŌ
3,TP5,vh0063,大安寺/Daianji,34.668000,135.812722,安澄/ANCHŌ
4,TP6,pl0002,愛知/Aichi,35.180270,136.906717,安超/ANCHŌ
5,TP6,vh0007,安樂寺/Anrakuji,35.340877,136.976158,安超/ANCHŌ
...,...,...,...,...,...,...
3123,TP913,pl0225,京都/Kyōto,35.021229,135.755604,藏山順空/ZŌZAN JUNKŪ
3124,TP913,vh0375,東福寺/Tōfukuji,34.976035,135.773648,藏山順空/ZŌZAN JUNKŪ
3125,TP913,pl0220,熊本/Kumamoto,32.789812,130.741603,藏山順空/ZŌZAN JUNKŪ
3126,TP913,vh0227,高城寺/Kōjōji,33.326839,130.277912,藏山順空/ZŌZAN JUNKŪ


In [16]:
# People who stayed at Eiheiji: stays whose place label contains "永平寺"
df1 = df0[df0["place"].str.contains("永平寺")]
df1

Unnamed: 0,pid,gid,place,lat,lon,name
345,TP133,vh0081,永平寺/Eiheiji,36.053056,136.355556,道元/DŌGEN
972,TP293,vh0081,永平寺/Eiheiji,36.053056,136.355556,義雲/GIUN
1901,TP542,vh0081,永平寺/Eiheiji,36.053056,136.355556,孤雲懷弉/KOUN EJŌ
2004,TP569,vh0081,永平寺/Eiheiji,36.053056,136.355556,卍山道白/MANZAN DŌBYAKU


In [17]:
# Every stay (at any place) of the people selected above; copied so that
# later changes do not touch df0
df2 = df0.loc[df0["pid"].isin(df1["pid"])].copy()
df2

Unnamed: 0,pid,gid,place,lat,lon,name
336,TP133,pl0523,浙江/Zhe jiang,30.266582,120.153531,道元/DŌGEN
337,TP133,pl0391,太白山/Tai bai shan,29.81828,121.780698,道元/DŌGEN
338,TP133,vh0365,天童景德寺/Tian dong jing de si,29.806691,121.790918,道元/DŌGEN
340,TP133,pl0225,京都/Kyōto,35.021229,135.755604,道元/DŌGEN
341,TP133,vh0210,建仁寺/Kenninji,35.000986,135.773664,道元/DŌGEN
342,TP133,pl0411,宇治/Uji,34.893043,135.806342,道元/DŌGEN
343,TP133,vh0235,興聖寶林禪寺/Kōshō Hōrinzenji,34.890028,135.813736,道元/DŌGEN
345,TP133,vh0081,永平寺/Eiheiji,36.053056,136.355556,道元/DŌGEN
972,TP293,vh0081,永平寺/Eiheiji,36.053056,136.355556,義雲/GIUN
1901,TP542,vh0081,永平寺/Eiheiji,36.053056,136.355556,孤雲懷弉/KOUN EJŌ


In [18]:
# One row per place, with the names of everyone who stayed there as a list
df3 = (
    df2
    .groupby(["gid", "place", "lat", "lon"])["name"]
    .apply(list)
    .reset_index()
)

In [19]:
# Marker colour: red for Eiheiji, blue for every other place.
# The red rows are assigned through .loc because an in-place method called on
# the selected column is not guaranteed to write back into df3 (it does not
# when pandas Copy-on-Write is active).
df3["color"] = "blue"
df3.loc[df3["place"].str.contains("永平寺"), "color"] = "red"

In [20]:
# Display the grouped places together with their marker colour
df3

Unnamed: 0,gid,place,lat,lon,name,color
0,pl0168,石川/Ishikawa,36.594537,136.62578,[卍山道白/MANZAN DŌBYAKU],blue
1,pl0225,京都/Kyōto,35.021229,135.755604,"[道元/DŌGEN, 卍山道白/MANZAN DŌBYAKU]",blue
2,pl0391,太白山/Tai bai shan,29.81828,121.780698,[道元/DŌGEN],blue
3,pl0411,宇治/Uji,34.893043,135.806342,[道元/DŌGEN],blue
4,pl0523,浙江/Zhe jiang,30.266582,120.153531,[道元/DŌGEN],blue
5,vh0068,大乘寺/Daijōji,36.532556,136.658944,[卍山道白/MANZAN DŌBYAKU],blue
6,vh0081,永平寺/Eiheiji,36.053056,136.355556,"[道元/DŌGEN, 義雲/GIUN, 孤雲懷弉/KOUN EJŌ, 卍山道白/MANZAN...",red
7,vh0104,源光庵/Genkōan,35.054814,135.731722,[卍山道白/MANZAN DŌBYAKU],blue
8,vh0210,建仁寺/Kenninji,35.000986,135.773664,[道元/DŌGEN],blue
9,vh0235,興聖寶林禪寺/Kōshō Hōrinzenji,34.890028,135.813736,[道元/DŌGEN],blue


# 地図

In [21]:
import folium

In [22]:
# NOTE(review): the name `map` shadows Python's built-in map(); it is kept
# because the marker loop and the final display cell refer to it.
map = folium.Map(
    location=[35.5, 138.5],  # initial map centre
    zoom_start=2,  # initial zoom level
)

In [23]:
from html import escape

# One marker per place; the popup shows the place label and everyone who stayed there
for _, r in df3.iterrows():

    # The labels come from the downloaded XML and the popup text is rendered
    # as HTML, so escape them to keep characters such as "<" or "&" literal
    place_html = escape(r.place)
    names_html = "<br />".join(escape(person) for person in r["name"])

    folium.Marker(
        location=[r.lat, r.lon],
        popup=folium.Popup(f"<p>{place_html}</p><p>{names_html}</p>", max_width=300),
        icon=folium.Icon(color=r.color)
    ).add_to(map)

map