<a href="https://colab.research.google.com/github/harrylloyd-bl/hr-coleridge/blob/hr/Extract_Entities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract Entities

## Import Packages

Import modules from the standard library to work with the file system, XML files, and regular expressions.


*   glob - searches for filenames that match a specific pattern
*   ElementTree - for working with XML files
*   re - for regular expressions (patterns used to find specific parts of strings)
*   os - os.remove for deleting files

[Colab Markdown Cheat Sheet](https://colab.research.google.com/notebooks/markdown_guide.ipynb)


In [None]:
import glob
import xml.etree.ElementTree as ET
import re
import os

## Combine pages

### Define path for data


In [None]:
# Collect every XML file in sample_data/1865 whose name starts with "00".
# The result is not sorted (see the listing below); ordering is handled in a later cell.
pages = glob.glob("sample_data/1865/00*.xml")

### Import data from path

In [None]:
# Display the matched file paths exactly as glob returned them (unsorted).
pages

['sample_data/1865/0024_p032.xml',
 'sample_data/1865/0013_p021.xml',
 'sample_data/1865/0009_p017.xml',
 'sample_data/1865/0017_p025.xml',
 'sample_data/1865/0018_p026.xml',
 'sample_data/1865/0023_p031.xml',
 'sample_data/1865/0006_p014.xml',
 'sample_data/1865/0004_p011.xml',
 'sample_data/1865/0016_p024.xml',
 'sample_data/1865/0014_p022.xml',
 'sample_data/1865/0003_1865_page_1.xml',
 'sample_data/1865/0011_p019.xml',
 'sample_data/1865/0022_p030.xml',
 'sample_data/1865/0025_p033.xml',
 'sample_data/1865/0008_p016.xml',
 'sample_data/1865/0026_p034.xml',
 'sample_data/1865/0002_1865_letter.xml',
 'sample_data/1865/0019_p027.xml',
 'sample_data/1865/0001_1865_cover.xml',
 'sample_data/1865/0010_p018.xml',
 'sample_data/1865/0015_p023.xml',
 'sample_data/1865/0020_p028.xml',
 'sample_data/1865/0007_p015.xml',
 'sample_data/1865/0005_p013.xml',
 'sample_data/1865/0012_p020.xml',
 'sample_data/1865/0021_p029.xml']

### Order pages correctly

In [None]:
# Sort the pages numerically by the sequence number that prefixes each filename
# (e.g. "0024_p032.xml" -> 24).
# os.path.basename is used instead of splitting on a hard-coded "/" so the key also
# works when glob returns paths that use a different separator.
ordered_pages = sorted(pages, key=lambda path: int(os.path.basename(path).split("_")[0]))
ordered_pages

['sample_data/1865/0001_1865_cover.xml',
 'sample_data/1865/0002_1865_letter.xml',
 'sample_data/1865/0003_1865_page_1.xml',
 'sample_data/1865/0004_p011.xml',
 'sample_data/1865/0005_p013.xml',
 'sample_data/1865/0006_p014.xml',
 'sample_data/1865/0007_p015.xml',
 'sample_data/1865/0008_p016.xml',
 'sample_data/1865/0009_p017.xml',
 'sample_data/1865/0010_p018.xml',
 'sample_data/1865/0011_p019.xml',
 'sample_data/1865/0012_p020.xml',
 'sample_data/1865/0013_p021.xml',
 'sample_data/1865/0014_p022.xml',
 'sample_data/1865/0015_p023.xml',
 'sample_data/1865/0016_p024.xml',
 'sample_data/1865/0017_p025.xml',
 'sample_data/1865/0018_p026.xml',
 'sample_data/1865/0019_p027.xml',
 'sample_data/1865/0020_p028.xml',
 'sample_data/1865/0021_p029.xml',
 'sample_data/1865/0022_p030.xml',
 'sample_data/1865/0023_p031.xml',
 'sample_data/1865/0024_p032.xml',
 'sample_data/1865/0025_p033.xml',
 'sample_data/1865/0026_p034.xml']

### Define tree and root

In [None]:
# Parse every page in sorted order, then collect the root element of each parsed tree.
# trees[k] and roots[k] always refer to the same page, ordered_pages[k].
trees = [ET.parse(page_path) for page_path in ordered_pages]
roots = [page_tree.getroot() for page_tree in trees]

### Specify combined trees and roots

In [None]:
# Use the first page as the container for the combined document.
# These are references, not copies: anything appended to combined_root is also
# visible through roots[0] and trees[0].
combined_root, combined_tree = roots[0], trees[0]

### Combine children into single list

In [None]:
# Append the top-level children of every later page to the first page's root.
# Track what is already attached (by object identity) so that re-running this cell
# does not append every page a second time and duplicate it in the saved file.
attached_ids = {id(element) for element in combined_root}
for root in roots[1:]:
    for child in root:
        if id(child) not in attached_ids:
            combined_root.append(child)
            attached_ids.add(id(child))

### Print combined list of children

In [None]:
# List the tag of each direct child of the combined root, one per line.
for element in combined_root:
    print(element.tag)

{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-

### Save combined tree in a new file

In [None]:
# Re-indent the combined tree in place, four spaces per nesting level, so the saved file is readable.
ET.indent(combined_tree, space="    ")
# Serialise the combined tree to combined_pages.xml (relative to the working directory) as UTF-8.
combined_tree.write("combined_pages.xml", encoding="UTF-8")

## Extract Attributes

### Parse report date

### Parse header sections

In [None]:
# Print the attributes and the first line of text of every TextRegion marked as a heading.
# `credits` stays empty in this cell: the code that would fill it is still commented out below.
credits = []
for i, child in enumerate(combined_root.iter("{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion")):
  # Default to "" rather than a list so the substring test always compares str with str.
  if "structure {type:heading;}" in child.attrib.get("custom", ""):
    print(child.attrib)
    try:
      # Positional lookup: second child of the region, its third child, then that element's first child.
      print(child[1][2][0].text)
    except IndexError:
      # f-string so the index is actually interpolated; i counts all TextRegions, not only headings.
      print(f"Error in Heading {i}")
    # TODO: unfinished credit extraction, kept for reference.
    # if child.tag.split("}")[1] == "Page":
    #     for region in child:
    #         if "structure {type:credit;}" in region.attrib.get("custom", []):
    #             credits.append(region)

{'id': 'r_2', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
EXECUTIVE SURVEYS.
{'id': 'r_22', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. 2 TOPOGRAPHICAL PARTY
{'id': 'r_25', 'custom': 'readingOrder {index:3;} structure {type:heading;}'}
No. 3 TOPOGRAPHICAL PARTY.
{'id': 'r_14', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. 4 TOPOGRAPHICAL PARTY.
{'id': 'r_37', 'custom': 'readingOrder {index:5;} structure {type:heading;}'}
No. 5 TOPOGRAPHICAL PARTY.
{'id': 'r_111', 'custom': 'readingOrder {index:6;} structure {type:heading;}'}
No. 6 TOPOGRAPHICAL SURVEY.
{'id': 'r_5', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. 7 TOPOGRAPHICAL PARTY.
{'id': 'r_58', 'custom': 'readingOrder {index:6;} structure {type:heading;}'}
No. 8 TOPOGRAPHICAL PARTY.


### Parse name

In [None]:
def parse_xml_attrib(s):
    """Split a ``custom`` attribute string into (name, value) pairs.

    The input is expected to look like
    ``"readingOrder {index:1;} structure {type:heading;}"``: a word, one
    whitespace character, then a brace-delimited value ending in ``;}``.

    Args:
        s: The attribute string to parse.

    Returns:
        A list of ``(attr_name, attr_value)`` tuples in order of appearance,
        with the braces kept in ``attr_value``. Returns an empty list when
        nothing matches.
    """
    # Python spells named groups (?P<name>...); the earlier (?<name>...) form raised re.error.
    # \w+ (rather than \w*) stops a brace block with no name from matching with an empty name.
    return re.findall(r"(?P<attr_name>\w+)\s(?P<attr_value>\{[\w:;\s\d]*;\})", s)

In [None]:
# Take the "custom" attribute of the fourth child of the first element in `credits`.
# NOTE(review): `credits` is initialised as an empty list in the header-parsing cell and the
# code that would append to it is commented out, so credits[0] raises IndexError as written.
s = credits[0][3].attrib["custom"]
s

In [None]:
# Ad-hoc check of the attribute pattern against the sample string `s`.
# With two named groups, findall returns a list of (attr_name, attr_value) tuples.
re.findall(r"(?P<attr_name>\w*)\s(?P<attr_value>\{[\w:;\s\d]*;\})", s)

# Redundant Cells

## Define pages to delete with path

In [None]:
# Match XML files directly inside sample_data/ (not the 1865 subfolder) whose names start with "00".
pages_to_delete = glob.glob("sample_data/00*.xml")

In [None]:
# Permanently remove each matched file from disk; this cannot be undone.
for stale_path in pages_to_delete:
  os.remove(stale_path)

## Split filename string demo

In [None]:
# Demo: take the filename of one path, keep the part before the first "_", and convert it to int.
# `pages` is the unsorted glob result, so which file sits at index 7 can differ between runs.
int(pages[7].split("/")[-1].split("_")[0])

19

## Print integer demo

In [None]:
# Print the integers 0 through 4, one per line.
for i in range(5):
  print(i)

0
1
2
3
4


## Print children demo

In [None]:
# Show the repr of each direct child element of the first page's root.
for element in roots[0]:
    print(element)

<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x79fb9c52ae80>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x79fb9c5d98a0>


## Print text line demo

In [None]:
# Demo: walk a fixed chain of child indices under the third page's root and print each child's text.
# NOTE(review): the recorded output of this cell is "IndexError: child index out of range".
# The roots printed earlier have only Metadata and Page children (indices 0 and 1), so
# roots[2][2] is presumably the failing step — confirm the intended index path.
for child in roots[2][2][5][7][2]:
    print(child.text)

IndexError: child index out of range