In [1]:
import xml.etree.ElementTree as et

xtree = et.parse('output.xml')
xroot = xtree.getroot()

In [3]:
people = [person for person in xroot.iter('{http://www.tei-c.org/ns/1.0}person')]

In [47]:
import pandas as pd
import numpy as np

In [74]:
altNames = set()
for p in people:
    # persName alt names
    n = p.findall('{http://www.tei-c.org/ns/1.0}persName')
    for _n in n:
        if _n.attrib.get('subtype') is not None:
            _k = 'name-' + _n.attrib.get('type') + '-' + _n.attrib.get('subtype')
            altNames.add(_k)
        elif _n.attrib.get('type') is not None:
            _k = 'name-'+_n.attrib.get('type')
            altNames.add(_k)
print(altNames)

{'name-standardEn', 'name-altName-书名', 'name-altName-别号', 'name-altName-pen name', 'name-altName-alias', 'name-altName-real name', 'name-altName-pinyin', 'name-altName-title', 'name-altName-别署', 'name-altName-字', 'name-altName-别名', 'name-altName-原名'}


In [75]:
idnos = set()
for p in people:
    n = p.findall('{http://www.tei-c.org/ns/1.0}idno')
    for _n in n:
        if _n.attrib.get('type') is not None:
            _k = 'id-'+_n.attrib.get('type')
            idnos.add(_k)
print(idnos)

{'id-VIAF', 'id-legacy1', 'id-NUS_lib'}


In [76]:
traits = set()
for p in people:
    n = p. findall('{http://www.tei-c.org/ns/1.0}trait')
    for m in n:
        k = m.find('{http://www.tei-c.org/ns/1.0}label')
        if k is not None:
            traits.add(k.text)
print(traits)

{'Ancestral Home 祖籍地', 'Dialect Group 方言群', 'Faith 信仰', 'Related Organization', 'Work Organization'}


In [77]:
bibls = set()
for p in people:
    n = p.findall('{http://www.tei-c.org/ns/1.0}listBibl')
    for m in n:
        k = m.attrib.get('type')
        bibls.add(k)
print(bibls)

{'sourcesOn', 'worksBy', 'imagesOf'}


In [78]:
cols = ['id', 'name-zh-hans', 'name-zh-hant'] + list(altNames) + list(idnos) + ['birth-date', 'birth-place-zh', 'birth-place-en', 'birth-ref', 'death-date', 'death-ref', 'floruit-date', 'floruit-place', 'floruit-ref', 'sex', ] + list(traits) + list(bibls) + ['occupation']
print(cols)

['id', 'name-zh-hans', 'name-zh-hant', 'name-standardEn', 'name-altName-书名', 'name-altName-别号', 'name-altName-pen name', 'name-altName-alias', 'name-altName-real name', 'name-altName-pinyin', 'name-altName-title', 'name-altName-别署', 'name-altName-字', 'name-altName-别名', 'name-altName-原名', 'id-VIAF', 'id-legacy1', 'id-NUS_lib', 'birth-date', 'birth-place-zh', 'birth-place-en', 'birth-ref', 'death-date', 'death-ref', 'floruit-date', 'floruit-place', 'floruit-ref', 'sex', 'Ancestral Home 祖籍地', 'Dialect Group 方言群', 'Faith 信仰', 'Related Organization', 'Work Organization', 'sourcesOn', 'worksBy', 'imagesOf', 'occupation']


In [86]:
ori_data = {c:[] for c in cols}
print(ori_data)

{'id': [], 'name-zh-hans': [], 'name-zh-hant': [], 'name-standardEn': [], 'name-altName-书名': [], 'name-altName-别号': [], 'name-altName-pen name': [], 'name-altName-alias': [], 'name-altName-real name': [], 'name-altName-pinyin': [], 'name-altName-title': [], 'name-altName-别署': [], 'name-altName-字': [], 'name-altName-别名': [], 'name-altName-原名': [], 'id-VIAF': [], 'id-legacy1': [], 'id-NUS_lib': [], 'birth-date': [], 'birth-place-zh': [], 'birth-place-en': [], 'birth-ref': [], 'death-date': [], 'death-ref': [], 'floruit-date': [], 'floruit-place': [], 'floruit-ref': [], 'sex': [], 'Ancestral Home 祖籍地': [], 'Dialect Group 方言群': [], 'Faith 信仰': [], 'Related Organization': [], 'Work Organization': [], 'sourcesOn': [], 'worksBy': [], 'imagesOf': [], 'occupation': []}


In [87]:
delimiter = ';'
df = pd.DataFrame()

dict_index = 0

for p in people:
    person = {}
    
    for key in ori_data:
        ori_data[key].append(None)
    
    # id
    _id = p.attrib.get('{http://www.w3.org/XML/1998/namespace}id')
    ori_data['id'][dict_index] = _id
    
    # persName with lang
    n = p.find('{http://www.tei-c.org/ns/1.0}persName[@{http://www.w3.org/XML/1998/namespace}lang="zh-Hans"]')
    _name_zh_Hans = n.text if n is not None else None
    ori_data['name-zh-hans'][dict_index] = _name_zh_Hans
    
    n = p.find('{http://www.tei-c.org/ns/1.0}persName[@{http://www.w3.org/XML/1998/namespace}lang="zh-Hant"]')
    _name_zh_Hant = n.text if n is not None else None
    ori_data['name-zh-hant'][dict_index] = _name_zh_Hant
    
    # persName alt names
    n = p.findall('{http://www.tei-c.org/ns/1.0}persName')
    for _n in n:
        if _n.attrib.get('subtype') is not None:
            _k = 'name-' + _n.attrib.get('type') + '-' + _n.attrib.get('subtype')
            if _k in person:
                person[_k] += delimiter + _n.text
            else:
                person[_k] = _n.text
        elif _n.attrib.get('type') is not None:
            _k = 'name-'+_n.attrib.get('type')
            if _k in person:
                person[_k] += delimiter + _n.text
            else:
                person[_k] = _n.text
    for person_key in person:
        ori_data[person_key][dict_index] = person[person_key]
    person.clear()
        
    # idno
    n = p.findall('{http://www.tei-c.org/ns/1.0}idno')
    for _n in n:
        if _n.attrib.get('type') is not None:
            _k = 'id-'+_n.attrib.get('type')
            person[_k] = _n.text
    for person_key in person:
        ori_data[person_key][dict_index] = person[person_key]
    person.clear()
    
    # birth
    n = p.find('{http://www.tei-c.org/ns/1.0}birth')
    m = n.find('{http://www.tei-c.org/ns/1.0}date')
    _birth_date = m.text if m is not None else None
    ori_data['birth-date'][dict_index] = _birth_date
    m = n.find('{http://www.tei-c.org/ns/1.0}placeName[@{http://www.w3.org/XML/1998/namespace}lang="zh"]')
    _birth_placeName_zh = m.text if m is not None else None
    ori_data['birth-place-zh'][dict_index] = _birth_placeName_zh
    m = n.find('{http://www.tei-c.org/ns/1.0}placeName[@{http://www.w3.org/XML/1998/namespace}lang="en"]')
    _birth_placeName_en = m.text if m is not None else None
    ori_data['birth-place-en'][dict_index] = _birth_placeName_en
    m = n.findall('{http://www.tei-c.org/ns/1.0}ref')
    _birth_ref = delimiter.join([_m.text if _m.text is not None else '' for _m in m] if m is not None else [])
    ori_data['birth-ref'][dict_index] = _birth_ref
    
    # death
    n = p.find('{http://www.tei-c.org/ns/1.0}death')
    m = n.find('{http://www.tei-c.org/ns/1.0}date')
    _death_date = m.text if m is not None else None
    ori_data['death-date'][dict_index] = _death_date
    m = n.findall('{http://www.tei-c.org/ns/1.0}ref')
    _death_ref = delimiter.join([_m.text if _m.text is not None else '' for _m in m] if m is not None else [])
    ori_data['death-ref'][dict_index] = _death_ref
    
    # floruit
    n = p.find('{http://www.tei-c.org/ns/1.0}floruit')
    m = n.find('{http://www.tei-c.org/ns/1.0}date')
    _floruit_date = m.text if m is not None else None
    ori_data['floruit-date'][dict_index] = _floruit_date
    m = n.find('{http://www.tei-c.org/ns/1.0}placeName')
    _floruit_placeName = m.text if m is not None else None
    ori_data['floruit-place'][dict_index] = _floruit_placeName
    m = n.findall('{http://www.tei-c.org/ns/1.0}ref')
    _floruit_ref = delimiter.join([_m.text if _m.text is not None else '' for _m in m] if m is not None else [])
    ori_data['floruit-ref'][dict_index] = _floruit_ref
    
    # sex
    n = p.find('{http://www.tei-c.org/ns/1.0}sex')
    _sex = n.text if n is not None else None
    ori_data['sex'][dict_index] = _sex
    
    # trait
    n = p. findall('{http://www.tei-c.org/ns/1.0}trait')
    for m in n:
        k = m. find('{http://www.tei-c.org/ns/1.0}label')
        d = m. find('{http://www.tei-c.org/ns/1.0}desc')
        if k is not None and k.text in person:
            person[k.text] += (delimiter + d.text) if d.text is not None else ''
        elif k is not None:
            person[k.text] = d.text if d.text is not None else ''
    for person_key in person:
        ori_data[person_key][dict_index] = person[person_key]
    person.clear()
    
    # list bibl
    n = p.findall('{http://www.tei-c.org/ns/1.0}listBibl')
    for m in n:
        k = m.attrib.get('type')
        l = []
        for d in m.findall('{http://www.tei-c.org/ns/1.0}bibl'):
            if d.text is None:
                continue
            l.append(d.text)
        v = delimiter.join(l)
        # hack for imagesOf, as it's always empty for now
        if k == 'imagesOf':
            v = ''
        ori_data[k][dict_index] = v

    # occupation
    l = []
    n = p.findall('{http://www.tei-c.org/ns/1.0}occupation')
    for m in n:
        if m.text is None:
            continue
        l.append(m.text)
    v = delimiter.join(l)
    ori_data['occupation'][dict_index] = v
        
    dict_index += 1
    


In [88]:
df = pd.DataFrame(ori_data)
df

Unnamed: 0,id,name-zh-hans,name-zh-hant,name-standardEn,name-altName-书名,name-altName-别号,name-altName-pen name,name-altName-alias,name-altName-real name,name-altName-pinyin,...,sex,Ancestral Home 祖籍地,Dialect Group 方言群,Faith 信仰,Related Organization,Work Organization,sourcesOn,worksBy,imagesOf,occupation
0,s000000,李大傻,,Lee Dai Soh,,,,李福鸿,,Li Dasha,...,1,"广东省东莞县/Dongguan County, Guangdong Province",粤/Canton,,东安会馆;广惠肇碧山亭,马来亚电台;丽的呼声,,,,讲古艺人
1,s000001,孙中山,,Sun Yat-sen,,,,,,Sun Zhongshan,...,1,"广东省东莞县/Dongguan County, Guangdong Province",粤/Canton,,,,,,,
2,s000002,蔡志民,,,,,,,,Cai Zhimin,...,1,"广东省丰顺县/Fengshun County, Guangdong Province",潮/Teochew,,南洋客属总会;丰永大公会;丰顺会馆;星洲鞋业商会董事,志和行、志祥行,,,,商人
3,s000003,伍伯胜,,,,,,,,Wu Bosheng,...,1,"广东省台山县/Taishan County, Guangdong Province",粤/Canton,,,中国驻新加坡领事馆,,,,外交官
4,s000004,伍连德,,Wu Lien Teh,,,,,,Wu Liande,...,1,"广东省台山县/Taishan County, Guangdong Province",粤/Canton,,,,,,,
5,s000005,陈梦桃,,,,,,,,Chen Mengtao,...,1,"广东省嘉应州/Jiaying Prefecture, Guangdong Province",客/Hakka,,应新学堂,,,,,
6,s000006,徐统雄,,,,,,,,Xu Tongxiong,...,1,"广东省大埔县/Dapu County, Guangdong Province",客/Hakka,,,,,,,
7,s000007,陈瑞麟,,,,,,,,Chen Ruilin,...,1,"广东省梅县/Mei County, Guangdong Province",客/Hakka,,星华筹赈会,松发园;陈奇珍号杂货店,,,,农场主;商人
8,s000008,杨缵文,,Yeo Chan Boon,,,,,,Yang Zuanwen,...,1,"广东省潮安县/Chao'an County, Guangdong Province",潮/Teochew,,,,,,,
9,s000009,林忠国,,,,,,林煌玲;林玲,,Lin Zhongguo,...,1,"广东省澄海县/Chenghai County, Guangdong Province",潮/Teochew,,,,,,,商人


In [89]:
df.to_excel("xml-to-excel.xlsx")  