<a href="https://colab.research.google.com/github/harrylloyd-bl/hr-coleridge/blob/hr/Extract_Entities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract Entities

## Import Packages

Import modules from the standard library, plus pandas, to work with the file system, XML files, regular expressions and tables.


*   glob - searches for filenames that match a specific pattern
*   ElementTree - for working with xmls
*   re - for regular expressions (patterns used to find specific parts of strings)
*   os - os.remove for deleting files
*   pandas - for putting the extracted entities into a table (DataFrame)

[Colab Markdown Cheat Sheet](https://colab.research.google.com/notebooks/markdown_guide.ipynb)


In [None]:
import glob
import xml.etree.ElementTree as ET
import re
import os
import pandas as pd

## Combine pages

### Define path for data


In [None]:
# Year of the report to process. It selects the sample_data/<year>/ input folder
# and is used as the prefix of the combined output file name.
report_date = 1867

In [None]:
# Collect the page XML files for the chosen report. glob returns them in no
# particular order; they are sorted by page number in a later cell.
# NOTE(review): the "00*" pattern only matches file names that begin with "00" —
# confirm that no report has files numbered 0100 or above.
pages = glob.glob(f"sample_data/{report_date}/00*.xml")

### Import data from path

In [None]:
# Show the matched paths (still unsorted) as a quick check that the files were found.
pages

['sample_data/1867/0012_p012.xml',
 'sample_data/1867/0017_p017.xml',
 'sample_data/1867/0030_p031.xml',
 'sample_data/1867/0067_p068.xml',
 'sample_data/1867/0070_p071.xml',
 'sample_data/1867/0025_p026.xml',
 'sample_data/1867/0033_p034.xml',
 'sample_data/1867/0068_p069.xml',
 'sample_data/1867/0037_p038.xml',
 'sample_data/1867/0016_p016.xml',
 'sample_data/1867/0061_p062.xml',
 'sample_data/1867/0032_p033.xml',
 'sample_data/1867/0009_p009.xml',
 'sample_data/1867/0044_p045.xml',
 'sample_data/1867/0034_p035.xml',
 'sample_data/1867/0011_p011.xml',
 'sample_data/1867/0029_p030.xml',
 'sample_data/1867/0082_1867 p.8 Table.xml',
 'sample_data/1867/0024_p025.xml',
 'sample_data/1867/0039_p040.xml',
 'sample_data/1867/0078_p079.xml',
 'sample_data/1867/0003_p003.xml',
 'sample_data/1867/0077_p078.xml',
 'sample_data/1867/0051_p052.xml',
 'sample_data/1867/0083_1867 p.9 Table.xml',
 'sample_data/1867/0050_p051.xml',
 'sample_data/1867/0079_p080.xml',
 'sample_data/1867/0007_p007.xml',


### Order pages correctly

In [None]:
# Sort the pages by the numeric prefix of each file name (e.g. 12 for "0012_p012.xml").
# os.path.basename is used instead of splitting on "/" so the key still works when
# glob returns paths that use the platform's own separator (e.g. "\" on Windows).
ordered_pages = sorted(pages, key=lambda path: int(os.path.basename(path).split("_")[0]))
ordered_pages

['sample_data/1867/0001_p001.xml',
 'sample_data/1867/0002_p002.xml',
 'sample_data/1867/0003_p003.xml',
 'sample_data/1867/0004_p004.xml',
 'sample_data/1867/0005_p005.xml',
 'sample_data/1867/0006_p006.xml',
 'sample_data/1867/0007_p007.xml',
 'sample_data/1867/0008_p008.xml',
 'sample_data/1867/0009_p009.xml',
 'sample_data/1867/0010_p010.xml',
 'sample_data/1867/0011_p011.xml',
 'sample_data/1867/0012_p012.xml',
 'sample_data/1867/0013_p013.xml',
 'sample_data/1867/0014_p014.xml',
 'sample_data/1867/0015_p015.xml',
 'sample_data/1867/0016_p016.xml',
 'sample_data/1867/0017_p017.xml',
 'sample_data/1867/0018_p018.xml',
 'sample_data/1867/0019_p019.xml',
 'sample_data/1867/0020_p020.xml',
 'sample_data/1867/0021_p021.xml',
 'sample_data/1867/0022_p022.xml',
 'sample_data/1867/0023_p023.xml',
 'sample_data/1867/0024_p025.xml',
 'sample_data/1867/0025_p026.xml',
 'sample_data/1867/0026_p027.xml',
 'sample_data/1867/0027_p028.xml',
 'sample_data/1867/0028_p029.xml',
 'sample_data/1867/0

### Define tree and root

In [None]:
# Parse every page file in page order, keeping each ElementTree and its root
# element in two parallel lists. Paths containing "Table" are skipped.
trees, roots = [], []
for p in ordered_pages:
    if 'Table' in p:
        continue
    tree = ET.parse(p)
    root = tree.getroot()
    trees.append(tree)
    roots.append(root)

### Specify combined trees and roots

In [None]:
# Use the first page's root and tree as the container for the combined document.
# These are references, not copies: anything appended to combined_root is also
# appended to roots[0].
combined_root = roots[0]
combined_tree = trees[0]

### Combine children into single list

In [None]:
# Attach the top-level children of every later page to the first page's root,
# one page at a time.
# NOTE: running this cell more than once appends the same children again.
for root in roots[1:]:
    combined_root.extend(list(root))

### Print combined list of children

In [None]:
# List the tag of each top-level element now under the combined root.
# Tags are printed in "{namespace}Name" form.
for child in combined_root:
    print(child.tag)

{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-

### Save combined tree in a new file

In [None]:
# Pretty-print the combined tree in place (four spaces per nesting level), then
# write it to "<report_date>_combined_pages.xml" relative to the working directory.
# NOTE(review): ET.indent requires Python 3.9 or newer.
ET.indent(combined_tree, space="    ")
combined_tree.write(f"{report_date}_combined_pages.xml", encoding="UTF-8")

## Extract Attributes

### Parse header sections

In [None]:
# Collect every TextRegion whose "custom" attribute marks it as a heading, printing
# its attributes and one line of its text along the way.
heading = []
for i, child in enumerate(combined_root.iter("{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion")):
  # Default to "" so the substring test is always str-in-str when "custom" is absent.
  if "structure {type:heading;}" in child.attrib.get("custom", ""):
    print(child.attrib)
    try:
      # Fixed index path to the text node; presumably first TextLine -> TextEquiv ->
      # Unicode in the PAGE layout — TODO confirm. Regions without it are reported below.
      print(child[1][2][0].text)
      heading.append(child)
    except IndexError:
      # i is the region's position among all TextRegions. The f-prefix was missing,
      # so the literal text "{i}" used to be printed.
      print(f"Error in Heading {i}")

{'id': 'r_28', 'custom': 'readingOrder {index:2;} structure {type:heading;}'}
EXECUTIVE SURVEYS.
{'id': 'r_2', 'custom': 'readingOrder {index:3;} structure {type:heading;}'}
No. I PARTY.—GWALIOR AND CENTRAL INDIA SURVEY.
{'id': 'r_6', 'custom': 'readingOrder {index:5;} structure {type:heading;}'}
No. II PARTY.—HYDERABAD SURVEY.
{'id': 'r_38', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. III PARTY.—CENTRAL PROVINCES AND VIZAGAPATAM
{'id': 'r_3', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. IV PARTY —CHOTA NAGPORE SURVEY.
{'id': 'r_2', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. V PARTY.—REWAH AND BUNDELKUND SURVEY.
{'id': 'r_1', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. VI PARTY.—KOSSIA AND GARROW HILLS SURVEY.
{'id': 'r_2', 'custom': 'readingOrder {index:2;} structure {type:heading;}'}
No. VII PARTY.—RAJPOOTANA SURVEY.
{'id': 'r_4', 'custom': 'readingOrder {index:4;} structure {type:heading;}'}


### Find Person

In [None]:
# Collect every TextRegion whose "custom" attribute marks it as a credit, printing
# its attributes and one line of its text along the way.
credits = []
for i, child in enumerate(combined_root.iter("{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion")):
  # Default to "" so the substring test is always str-in-str when "custom" is absent.
  if "structure {type:credit;}" in child.attrib.get("custom", ""):
    print(child.attrib)
    try:
      # Fixed index path to the text node; presumably first TextLine -> TextEquiv ->
      # Unicode in the PAGE layout — TODO confirm. Regions without it are reported below.
      print(child[1][2][0].text)
      credits.append(child)
    except IndexError:
      # i is the region's position among all TextRegions. The f-prefix was missing,
      # so the literal text "{i}" used to be printed.
      print(f"Error in Credit {i}")

{'id': 'r_3', 'custom': 'readingOrder {index:4;} structure {type:credit;}'}
Personnel.
{'id': 'r_2', 'custom': 'readingOrder {index:1;} structure {type:credit;}'}
Personnel.
{'id': 'r_39', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
Personnel.
{'id': 'r_41', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
NAME OF ASSISTANTS.
{'id': 'r_4', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
Personnel.
{'id': 'r_3', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
Personnel.
{'id': 'r_2', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
Personnel.
{'id': 'r', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
Average Number of Plane
{'id': 'r_3', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
Personnel.
{'id': 'r_5', 'custom': 'readingOrder {index:4;} structure {type:credit;}'}
Table:—
{'id': 'r_5', 'custom': 'readingOrder {index:5;} structure {type:credit;}'}
Captain W. H. Edgcome, R. E., in
{'

### Define a parser for `custom` attributes

In [None]:
def parse_attributes(attrib: str):
  """Parse a "custom" attribute string into a dict of tag -> list of (key, value) pairs.

  The input is a sequence of entries of the form ``tag {key:value; key:value;}``,
  for example ``readingOrder {index:1;} person {offset:0; length:5;}``.

  Args:
    attrib: The raw attribute string. An empty or missing value (``""``, ``None``,
      or an empty list used as a "not present" default) yields an empty dict.

  Returns:
    A dict mapping each tag name to a list of ``(key, value)`` string tuples in the
    order they appear. If the same tag occurs more than once in ``attrib``, only
    the last occurrence is kept, because later dict keys replace earlier ones.
  """
  # Guard against "no custom attribute" defaults; re.findall raises TypeError on non-strings.
  if not attrib:
    return {}
  # Outer pattern: a tag name, one space, then its brace-delimited body.
  attrib_pair_re = re.compile(r"(?P<tag>\w+) (?P<text>\{[\.\w\s:;\d]+\})")
  # Inner pattern: key:value pairs inside a body; values may hold dots and spaces.
  attrib_inner_re = re.compile(r"(?P<tag>\w+):(?P<text>[\.\w\s\d]+)")
  all_attribs = attrib_pair_re.findall(attrib)
  # v[1:-1] strips the surrounding braces before the body is split into pairs.
  return {k: attrib_inner_re.findall(v[1:-1]) for k, v in all_attribs}

### Parse person attributes

In [None]:
# Try the parser on a sample "custom" string that holds readingOrder, person, leader and Role tags.
parse_attributes("readingOrder {index:1;} person {offset:0; length:23;firstname:A.\u0020B.; title:Captian; lastname:Melville;} leader {offset:0; length:22;} Role {offset:24; length:9; continued:true;title:Executive\u0020Officer\u0020in\u0020Charge;}")

{'readingOrder': [('index', '1')],
 'person': [('offset', '0'),
  ('length', '23'),
  ('firstname', 'A. B.'),
  ('title', 'Captian'),
  ('lastname', 'Melville')],
 'leader': [('offset', '0'), ('length', '22')],
 'Role': [('offset', '24'),
  ('length', '9'),
  ('continued', 'true'),
  ('title', 'Executive Officer in Charge')]}

### Create entities list

In [None]:
# Accumulator for the person records found in the credit regions.
# Re-run this cell before re-running the extraction cell, otherwise the same
# people are appended a second time.
entities = []

### Extract person attributes

In [None]:
# Walk the text lines of the first credit region and record each tagged person.
# NOTE(review): parse_attributes keeps only the last "person" tag of a line, so a
# line naming two people yields a single record.
for line in credits[0].iter("{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextLine"):
  # Default to "" (not []) so parse_attributes always receives a string.
  line_attributes = parse_attributes(line.attrib.get("custom", ""))
  if "person" in line_attributes:
    person = line_attributes["person"]
    print(person)
    print(line[2][0].text)
    # Drop the offset/length pairs by name rather than by position, so a tag that
    # lists its fields in a different order does not lose a name field.
    entity = {key: value for key, value in person if key not in ("offset", "length")}
    entities.append(entity)

[('offset', '0'), ('length', '26'), ('firstname', 'Charles'), ('title', 'Lieutenant'), ('lastname', 'Strahan')]
Lieutenant Charles Strahan, R. E.
[('offset', '0'), ('length', '12'), ('firstname', 'H.'), ('title', 'Mr.'), ('lastname', 'Horst')]
Mr. H. Horst, Civil Assistant.
[('offset', '0'), ('length', '12'), ('firstname', 'P.'), ('title', 'Mr.'), ('lastname', 'Chill')]
Mr. P. Chill, Sub-Assistant.
[('offset', '0'), ('length', '14'), ('firstname', 'G.'), ('title', 'Mr.'), ('lastname', 'McArthy')]
Mr. G. McArthy, Sub-Assistant.
[('offset', '0'), ('length', '14'), ('firstname', 'G.'), ('title', 'Mr.'), ('lastname', 'Alluntt')]
Mr. G. Alluntt, Sub-Assistant.
[('offset', '0'), ('length', '13'), ('firstname', 'G.'), ('title', 'Mr.'), ('lastname', 'Murphy')]
Mr. G. Murphy, Sub-Assistant.
[('offset', '0'), ('length', '13'), ('firstname', 'G.'), ('title', 'Mr.'), ('lastname', 'Esteve')]
Mr. G. Esteve, Sub-Assistant.
[('offset', '16'), ('length', '9'), ('lastname', 'Chooramun')]
Goolam Mahomed,

### List entities

In [None]:
# Show the collected person records (one dict per person).
entities

[{'firstname': 'Charles', 'title': 'Lieutenant', 'lastname': 'Strahan'},
 {'firstname': 'H.', 'title': 'Mr.', 'lastname': 'Horst'},
 {'firstname': 'P.', 'title': 'Mr.', 'lastname': 'Chill'},
 {'firstname': 'G.', 'title': 'Mr.', 'lastname': 'McArthy'},
 {'firstname': 'G.', 'title': 'Mr.', 'lastname': 'Alluntt'},
 {'firstname': 'G.', 'title': 'Mr.', 'lastname': 'Murphy'},
 {'firstname': 'G.', 'title': 'Mr.', 'lastname': 'Esteve'},
 {'lastname': 'Chooramun'}]

### Extract entities to data frame

In [None]:
# Build a table with one row per person; keys missing from a record become empty cells.
pd.DataFrame(entities)

Unnamed: 0,firstname,title,lastname
0,Charles,Lieutenant,Strahan
1,H.,Mr.,Horst
2,P.,Mr.,Chill
3,G.,Mr.,McArthy
4,G.,Mr.,Alluntt
5,G.,Mr.,Murphy
6,G.,Mr.,Esteve
7,,,Chooramun


# Redundant Cells

### Parse report date

### Parse name

In [None]:
def parse_xml_attrib(s):
    """Split a "custom" attribute string into (name, body) pairs.

    Args:
        s: Text made of entries such as ``readingOrder {index:1;}``.

    Returns:
        A list of ``(attr_name, attr_value)`` tuples, where ``attr_value`` is the
        brace-delimited body including its braces. Entries whose body contains
        characters other than word characters, digits, whitespace, ":" and ";"
        are not matched.
    """
    # Named groups need the "(?P<name>...)" form; the previous pattern used
    # "(?<name>" and a stray "\(", which made every call raise re.error.
    return re.findall(r"(?P<attr_name>\w*)\s(?P<attr_value>\{[\w:;\s\d]*;\})", s)

In [None]:
# Take the raw "custom" attribute of the fourth child of the first credit region
# as a sample string, and display it.
s = credits[0][3].attrib["custom"]
s

In [None]:
# Split the sample string into (tag name, "{...}" body) pairs.
re.findall(r"(?P<attr_name>\w*)\s(?P<attr_value>\{[\w:;\s\d]*;\})", s)

## Define pages to delete with path

In [None]:
# Select the files to delete: XML files directly under sample_data/ whose names start with "00".
# NOTE(review): unlike the loading cell, this pattern has no report-year subfolder —
# confirm that the top-level folder is the intended target.
pages_to_delete = glob.glob("sample_data/00*.xml")

In [None]:
# Delete every selected file from disk. os.remove is permanent, so check
# pages_to_delete before running this cell.
for page in pages_to_delete:
  os.remove(page)

## Split filename string demo

In [None]:
# Demo of the sort key: take the file name of the eighth path in pages, keep the
# text before the first "_", and convert it to an integer.
int(pages[7].split("/")[-1].split("_")[0])

19

## Print integer demo

In [None]:
# Demo: print the integers 0 to 4, one per line.
for i in range(5):
  print(i)

0
1
2
3
4


## Print children demo

In [None]:
# Demo: print the top-level child elements of the first page's root.
for child in roots[0]:
    print(child)

<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x79fb9c52ae80>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x79fb9c5d98a0>


## Print text line demo

In [None]:
# Demo: print the text of each element reached by a fixed index path into the third page.
# The path is hard-coded for one page layout, so a missing level is reported
# instead of stopping the notebook with an IndexError.
try:
    for child in roots[2][2][5][7][2]:
        print(child.text)
except IndexError:
    print("No element at roots[2][2][5][7][2] - adjust the index path for this page")

IndexError: child index out of range