# Combine XMLs

## Import Packages

Import modules from the standard library to work with the file system, XML files, and regular expressions.


*   glob - searches for filenames that match a specific pattern
*   ElementTree - for working with XML files
*   re - for regular expressions (patterns used to find specific parts of strings)

[Colab Markdown Cheat Sheet](https://colab.research.google.com/notebooks/markdown_guide.ipynb)


In [1]:
import glob
import xml.etree.ElementTree as ET
import re

In [2]:
# Collect the paths of the 1865 page files whose names start with "00".
# glob does not sort its results (see the unordered listing printed by the
# next cell), so the list has to be ordered explicitly before use.
pages = glob.glob("data/raw/1865/00*.xml")

In [3]:
pages

['data/raw/1865/0006_p014.xml',
 'data/raw/1865/0017_p025.xml',
 'data/raw/1865/0010_p018.xml',
 'data/raw/1865/0002_1865_letter.xml',
 'data/raw/1865/0003_1865_page_1.xml',
 'data/raw/1865/0014_p022.xml',
 'data/raw/1865/0011_p019.xml',
 'data/raw/1865/0019_p027.xml',
 'data/raw/1865/0026_p034.xml',
 'data/raw/1865/0012_p020.xml',
 'data/raw/1865/0018_p026.xml',
 'data/raw/1865/0013_p021.xml',
 'data/raw/1865/0023_p031.xml',
 'data/raw/1865/0007_p015.xml',
 'data/raw/1865/0016_p024.xml',
 'data/raw/1865/0024_p032.xml',
 'data/raw/1865/0021_p029.xml',
 'data/raw/1865/0025_p033.xml',
 'data/raw/1865/0001_1865_cover.xml',
 'data/raw/1865/0005_p013.xml',
 'data/raw/1865/0004_p011.xml',
 'data/raw/1865/0022_p030.xml',
 'data/raw/1865/0015_p023.xml',
 'data/raw/1865/0008_p016.xml',
 'data/raw/1865/0020_p028.xml',
 'data/raw/1865/0009_p017.xml']

In [10]:
int(pages[1].split("/")[-1].split("_")[0])

17

In [12]:
def _page_number(path):
    """Return the numeric prefix of a page file name (e.g. 17 for '0017_p025.xml')."""
    file_name = path.split("/")[-1]
    return int(file_name.split("_")[0])


# Order the page paths by their leading sequence number.
ordered_pages = sorted(pages, key=_page_number)
ordered_pages

['data/raw/1865/0001_1865_cover.xml',
 'data/raw/1865/0002_1865_letter.xml',
 'data/raw/1865/0003_1865_page_1.xml',
 'data/raw/1865/0004_p011.xml',
 'data/raw/1865/0005_p013.xml',
 'data/raw/1865/0006_p014.xml',
 'data/raw/1865/0007_p015.xml',
 'data/raw/1865/0008_p016.xml',
 'data/raw/1865/0009_p017.xml',
 'data/raw/1865/0010_p018.xml',
 'data/raw/1865/0011_p019.xml',
 'data/raw/1865/0012_p020.xml',
 'data/raw/1865/0013_p021.xml',
 'data/raw/1865/0014_p022.xml',
 'data/raw/1865/0015_p023.xml',
 'data/raw/1865/0016_p024.xml',
 'data/raw/1865/0017_p025.xml',
 'data/raw/1865/0018_p026.xml',
 'data/raw/1865/0019_p027.xml',
 'data/raw/1865/0020_p028.xml',
 'data/raw/1865/0021_p029.xml',
 'data/raw/1865/0022_p030.xml',
 'data/raw/1865/0023_p031.xml',
 'data/raw/1865/0024_p032.xml',
 'data/raw/1865/0025_p033.xml',
 'data/raw/1865/0026_p034.xml']

In [21]:
# Print the integers 0 through 4, one per line.
for i in range(5):
    print(i)

0
1
2
3
4


In [13]:
# Parse every page in order; trees[k] and roots[k] belong to ordered_pages[k].
trees = [ET.parse(page_path) for page_path in ordered_pages]
roots = [page_tree.getroot() for page_tree in trees]

In [14]:
# The first page's tree is reused as the container for the merged document,
# so anything appended to combined_root also changes roots[0] / trees[0].
combined_root, combined_tree = roots[0], trees[0]

In [15]:
for child in roots[0]:
    print(child)

<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7825838a7a10>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x7825838a7ce0>


In [16]:
for child in roots[2][1][5][7][2]:
    print(child.text)

of bringing the whole of the surveys under the Home Department, as proposed in Lieutenant


In [17]:
# Append the top-level elements of every later page to the first page's root.
for page_root in roots[1:]:
    combined_root.extend(page_root)

In [18]:
for child in combined_root:
    print(child.tag)

{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-

In [19]:
# Re-indent the merged tree in place, using four spaces per nesting level.
ET.indent(combined_tree, space="    ")
# Write the merged document to combined_pages.xml (relative to the current
# working directory), encoded as UTF-8.
combined_tree.write("combined_pages.xml", encoding="UTF-8")

### Parse credit sections

In [20]:
"structure {type:credit;}" in combined_root[1][5].attrib["custom"]

IndexError: child index out of range

In [None]:
# Collect every direct child of a Page element whose "custom" attribute
# marks it as a credit section.
credits = []
for child in combined_root:
    # Tags look like "{namespace}Page". rpartition yields the local name and,
    # unlike split("}")[1], does not raise on a tag that has no namespace.
    if child.tag.rpartition("}")[2] != "Page":
        continue
    for region in child:
        # Default to an empty string so the substring test always runs
        # against a str, even when the element has no "custom" attribute.
        if "structure {type:credit;}" in region.get("custom", ""):
            credits.append(region)

In [None]:
def parse_xml_attrib(s):
    """Split a ``custom`` attribute string into (name, value) pairs.

    The string is expected to look like
    ``"readingOrder {index:0;} structure {type:credit;}"``: a name, one
    whitespace character, then a brace-delimited value ending in ``;}``.

    Args:
        s: The attribute string to parse.

    Returns:
        A list of ``(attr_name, attr_value)`` tuples in order of appearance,
        where ``attr_value`` keeps its surrounding braces. The list is empty
        when nothing matches.
    """
    # Python spells named groups (?P<name>...); the previous (?<name>...)
    # form is rejected by the re module, so every call raised re.error.
    return re.findall(
        r"(?P<attr_name>\w*)\s(?P<attr_value>\{[\w:;\s\d]*;\})", s
    )

In [None]:
s = credits[0][3].attrib["custom"]
s

In [None]:
re.findall(r"(?P<attr_name>\w*)\s(?P<attr_value>\{[\w:;\s\d]*;\})", s)