<a href="https://colab.research.google.com/github/harrylloyd-bl/hr-coleridge/blob/hr/Extract_Entities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract Entities

## Import Packages

Import modules from the standard library to work with the file system, XML files and regular expressions, plus pandas for tabulating the results.


*   glob - searches for filenames that match a specific pattern
*   ElementTree - for working with xmls
*   re - for regular expressions (patterns used to find specific parts of strings)
*   os - os.remove for deleting files
*   pandas - for collecting the extracted entities in a DataFrame

[Colab Markdown Cheat Sheet](https://colab.research.google.com/notebooks/markdown_guide.ipynb)


In [1]:
import glob
import xml.etree.ElementTree as ET
import re
import os
import pandas as pd

## Combine pages

### Define report_date

In [2]:
# Year of the report being processed; interpolated into the input glob pattern
# and into the combined-output file name in later cells.
report_date = 1866

### Define data path


In [3]:
# Collect the per-page XML exports for the chosen report year.
# NOTE(review): the "00*" prefix only matches file names that start with "00" —
# confirm that no report has page files numbered 0100 or above.
pages = glob.glob(f"sample_data/{report_date}/00*.xml")

### List matched page files

In [4]:
# Show the matched paths; the recorded output shows they are not in page order.
pages

['sample_data/1866/0009_p008.xml',
 'sample_data/1866/0040_p041.xml',
 'sample_data/1866/0005_p004.xml',
 'sample_data/1866/0050_p051.xml',
 'sample_data/1866/0037_p038.xml',
 'sample_data/1866/0007_p006.xml',
 'sample_data/1866/0033_p034.xml',
 'sample_data/1866/0013_p012.xml',
 'sample_data/1866/0017_p016.xml',
 'sample_data/1866/0016_p015.xml',
 'sample_data/1866/0011_p010.xml',
 'sample_data/1866/0039_p040.xml',
 'sample_data/1866/0029_p030.xml',
 'sample_data/1866/0008_p007.xml',
 'sample_data/1866/0012_p011.xml',
 'sample_data/1866/0043_p044.xml',
 'sample_data/1866/0001_1866 title page.xml',
 'sample_data/1866/0047_p048.xml',
 'sample_data/1866/0014_p013.xml',
 'sample_data/1866/0015_p014.xml',
 'sample_data/1866/0051_p052.xml',
 'sample_data/1866/0021_p020.xml',
 'sample_data/1866/0022_p021.xml',
 'sample_data/1866/0038_p039.xml',
 'sample_data/1866/0004_p003.xml',
 'sample_data/1866/0023_p022.xml',
 'sample_data/1866/0034_p035.xml',
 'sample_data/1866/0028_p029.xml',
 'sample_

### Order pages correctly

In [5]:
# Sort the page files by the integer prefix of their file name
# (e.g. ".../0009_p008.xml" -> 9). os.path.basename is used instead of
# splitting on "/" so the key also works where glob returns paths with the
# platform's own separator.
ordered_pages = sorted(
    pages,
    key=lambda page_path: int(os.path.basename(page_path).split("_")[0]),
)
ordered_pages

['sample_data/1866/0001_1866 title page.xml',
 'sample_data/1866/0002_12.xml',
 'sample_data/1866/0003_p002.xml',
 'sample_data/1866/0004_p003.xml',
 'sample_data/1866/0005_p004.xml',
 'sample_data/1866/0006_p005.xml',
 'sample_data/1866/0007_p006.xml',
 'sample_data/1866/0008_p007.xml',
 'sample_data/1866/0009_p008.xml',
 'sample_data/1866/0010_p009.xml',
 'sample_data/1866/0011_p010.xml',
 'sample_data/1866/0012_p011.xml',
 'sample_data/1866/0013_p012.xml',
 'sample_data/1866/0014_p013.xml',
 'sample_data/1866/0015_p014.xml',
 'sample_data/1866/0016_p015.xml',
 'sample_data/1866/0017_p016.xml',
 'sample_data/1866/0018_p017.xml',
 'sample_data/1866/0019_p018.xml',
 'sample_data/1866/0020_p019.xml',
 'sample_data/1866/0021_p020.xml',
 'sample_data/1866/0022_p021.xml',
 'sample_data/1866/0023_p022.xml',
 'sample_data/1866/0024_p023.xml',
 'sample_data/1866/0025_p025.xml',
 'sample_data/1866/0026_p027.xml',
 'sample_data/1866/0027_p028.xml',
 'sample_data/1866/0028_p029.xml',
 'sample_da

### Define tree and root

In [6]:
# Parse every page file whose path does not contain "Table", keeping each
# ElementTree and its root element in two parallel lists.
trees = [ET.parse(page_path) for page_path in ordered_pages if "Table" not in page_path]
roots = [page_tree.getroot() for page_tree in trees]

### Specify combined trees and roots

In [7]:
# The first page's tree becomes the container that all other pages are merged into.
combined_tree = trees[0]
combined_root = combined_tree.getroot()

### Combine children into single list

In [8]:
# Append the top-level children of every later page to the first page's root.
# The root is modified in place, so re-running this cell appends them again.
for page_root in roots[1:]:
    combined_root.extend(list(page_root))

### Print combined list of children

In [9]:
# List the namespaced tag of each top-level element in the combined root.
for top_level_element in combined_root:
    print(top_level_element.tag)

{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-

### Save combined tree in a new file

In [10]:
# Pretty-print the merged tree with four-space indentation and write it to the
# working directory under a name derived from the report year.
output_path = f"{report_date}_combined_pages.xml"
ET.indent(combined_tree, space="    ")
combined_tree.write(output_path, encoding="UTF-8")

## Extract Attributes

### Parse header sections

In [21]:
# Small scratch list used to demonstrate slicing in the next cell.
my_list = list(range(4, 8))

In [23]:
# Slicing demo: [1:-1] drops the first and last items, as used on TextRegion children below.
my_list[1:-1]

[5, 6]

In [29]:
# Collect (TextRegion element, heading text) pairs for every heading region
# that has a line mentioning a survey party ("PARTY" / "PARTIES").
heading = []
text_region_tag = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion"
for i, text_region in enumerate(combined_root.iter(text_region_tag)):
  # Default to an empty string so the substring test always runs against text.
  if "structure {type:heading;}" not in text_region.attrib.get("custom", ""):
    continue
  print(text_region.attrib)
  try:
    # presumably the first and last children are not TextLines (e.g. Coords /
    # TextEquiv) — TODO confirm against the PAGE export.
    for text_line in text_region[1:-1]:
      # assumes the third child holds the text element whose first child
      # carries the transcription — TODO confirm.
      line_text = text_line[2][0].text
      print(line_text)
      if line_text and ("PARTY" in line_text or "PARTIES" in line_text):
        survey_party = line_text
        heading.append((text_region, survey_party))
  except IndexError:
    # An f-string is required here so the region index is actually reported.
    print(f"Error in Heading {i}")

{'id': 'r_3', 'custom': 'readingOrder {index:2;} structure {type:heading;}'}
EXECUTIVE SURVEYS.
Nos. 1 AND 7 PARTIES.
GWALIOR, CENTRAL INDIA AND RAJPOOTANA SURVEY.
{'id': 'r', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
None
No. 2 PARTY.
HYDERABAD SURVEY.
{'id': 'r_3', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. 3 PARTY.
GANJAM AND ORISSA SURVEY.
{'id': 'r_2', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. 4 PARTY.
CHOTA NAGPORE DIVISION SURVEY.
{'id': 'r_3', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
No. 5 PARTY.
REWAH AND BUNDELKUND SURVEY.
{'id': 'r_3', 'custom': 'readingOrder {index:2;} structure {type:heading;}'}
No. 6 PARTY.
KOSSIA AND GARROW HILLS SURVEY.
{'id': 'r_6', 'custom': 'readingOrder {index:4;} structure {type:heading;}'}
THE PEGU SURVEY.
{'id': 'r', 'custom': 'readingOrder {index:0;} structure {type:heading;}'}
THE OPERATIONS IN THE DRAWING, LITHOGRAPHIC AND PHOTOGRAPHIC BRANCHES OF THE
S

In [30]:
# Inspect the collected (TextRegion element, party heading text) pairs.
heading

[(<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion' at 0x7c90fb4928e0>,
  'Nos. 1 AND 7 PARTIES.'),
 (<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion' at 0x7c90fb300770>,
  'No. 2 PARTY.'),
 (<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion' at 0x7c90fb33e840>,
  'No. 3 PARTY.'),
 (<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion' at 0x7c90fb38d530>,
  'No. 4 PARTY.'),
 (<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion' at 0x7c90fb1dcc70>,
  'No. 5 PARTY.'),
 (<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion' at 0x7c90fb21de90>,
  'No. 6 PARTY.'),
 (<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion' at 0x7c90faf06890>,
  'EXTRACTS FROM NARRATIVE REPORTS OF NOS. 1 AND 7 TOPOGRAPHICAL SURVEY PARTIES,'),
 (<Element '{http://schema.pri

### Find Person

In [12]:
# TODO Some credit sections are tables of square miles covered, these are poorly OCR'd and have been downloaded as separate tables
# Find a way to exclude these sections from network analysis

# Collect every TextRegion tagged as a credit whose first line of text can be read.
# NOTE: the name `credits` shadows the interactive `credits` builtin; it is kept
# because later cells refer to it.
credits = []
text_region_tag = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion"
for i, child in enumerate(combined_root.iter(text_region_tag)):
  # Default to an empty string so the substring test always runs against text.
  if "structure {type:credit;}" not in child.attrib.get("custom", ""):
    continue
  print(child.attrib)
  try:
    # Regions whose expected nesting is missing raise IndexError and are skipped.
    print(child[1][2][0].text)
    credits.append(child)
  except IndexError:
    # An f-string is required here so the region index is actually reported.
    print(f"Error in Credit {i}")

{'id': 'r_2', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_3', 'custom': 'readingOrder {index:1;} structure {type:credit;}'}
No. 1 PARTY.
{'id': 'r_3', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_4', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
Square miles.
{'id': 'r_5', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
Square miles.
{'id': 'r_4', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_9', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_10', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
Mr. McGill,
{'id': 'r_4', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_5', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
Square miles
{'id': 'r_4', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_5', 'custom': 'readin

### Parse person attributes

In [13]:
def parse_attributes(attrib: str) -> dict:
  r"""Parse a ``custom`` attribute string into ``{tag: [(key, value), ...]}``.

  The input is a sequence of ``tag {key:value; key:value;}`` groups, e.g.
  ``readingOrder {index:1;} person {offset:0; length:12;}``.

  Values may contain literal ``\uXXXX`` escape sequences (the XML stores a
  space inside a value as ``\u0020``). Groups containing such escapes are
  matched and the escapes are decoded to the characters they stand for;
  without this, any group holding an escaped value was silently dropped.

  Args:
    attrib: The raw ``custom`` attribute text. An empty or missing value
      yields an empty dict.

  Returns:
    A dict mapping each tag name to the list of ``(key, value)`` string
    pairs found inside its braces, in order of appearance.
  """
  if not attrib:
    return {}

  # A backslash is allowed inside the braces so groups with \uXXXX escapes match.
  attrib_pair_re = re.compile(r"(?P<tag>\w+) (?P<text>\{[\.\w\s:;\d\\]+\})")
  attrib_inner_re = re.compile(r"(?P<tag>\w+):(?P<text>[\.\w\s\d\\]+)")
  unicode_escape_re = re.compile(r"\\u([0-9a-fA-F]{4})")

  def decode_escapes(value: str) -> str:
    # Turn each literal \uXXXX sequence into the character it encodes.
    return unicode_escape_re.sub(lambda m: chr(int(m.group(1), 16)), value)

  return {
    tag: [
      (key, decode_escapes(value))
      # body[1:-1] strips the surrounding braces before splitting into pairs.
      for key, value in attrib_inner_re.findall(body[1:-1])
    ]
    for tag, body in attrib_pair_re.findall(attrib)
  }

In [14]:
# Smoke-test the parser on a hand-written custom string.
# NOTE(review): in this non-raw literal Python turns each \u0020 into a real space,
# whereas the attribute read from the XML (see the repr shown under "Redundant Cells")
# contains a literal backslash — this example may not reflect the real input.
parse_attributes("readingOrder {index:1;} person {offset:0; length:23;firstname:A.\u0020B.; title:Captian; lastname:Melville;} leader {offset:0; length:22;} Role {offset:24; length:9; continued:true;title:Executive\u0020Officer\u0020in\u0020Charge;}")

{'readingOrder': [('index', '1')],
 'person': [('offset', '0'),
  ('length', '23'),
  ('firstname', 'A. B.'),
  ('title', 'Captian'),
  ('lastname', 'Melville')],
 'leader': [('offset', '0'), ('length', '22')],
 'Role': [('offset', '24'),
  ('length', '9'),
  ('continued', 'true'),
  ('title', 'Executive Officer in Charge')]}

### Create entities list

In [15]:
# Accumulator for person dicts; the extraction cell below appends to it, so
# re-run this cell first to avoid collecting duplicates.
entities = []

### Extract person attributes

In [16]:
# Build one entity dict per TextLine of the first credit region that carries a
# "person" tag, keeping the descriptive attributes (firstname, title, lastname, ...).
# NOTE: `entities` is created in a separate cell, so re-running this cell alone
# appends the same people again.
text_line_tag = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextLine"
for line in credits[0].iter(text_line_tag):
  # Default to "" (not a list): the parser runs a regex over this value, and a
  # list would raise TypeError for lines without a custom attribute.
  line_attributes = parse_attributes(line.attrib.get("custom", ""))
  if "person" not in line_attributes:
    continue
  person_attribs = line_attributes["person"]
  print(person_attribs)
  print(line[2][0].text)
  # Drop the positional markers by key rather than by position, so the result
  # does not depend on offset/length being the first two pairs.
  entities.append({
    key: value
    for key, value in person_attribs
    if key not in ("offset", "length")
  })

[('offset', '0'), ('length', '23'), ('firstname', 'G.'), ('title', 'Lieut.'), ('lastname', 'Strahan')]
Lieut. G. Strahan, R.E., Senior Assistant Surveyor.
[('offset', '0'), ('length', '23'), ('firstname', 'C.'), ('title', 'Lieut.'), ('lastname', 'Strahan')]
Lieut. C. Strahan, R.E., Assistant Surveyor
[('offset', '0'), ('length', '12'), ('firstname', 'H.'), ('title', 'Mr.'), ('lastname', 'Horst')]
Mr. H. Horst, Civil Assistant.
[('offset', '0'), ('length', '12'), ('firstname', 'G.'), ('title', 'Mr.'), ('lastname', 'Chill')]
Mr. G. Chill, Sub-Assistant,  2nd Class.
[('offset', '0'), ('length', '14'), ('firstname', 'W.'), ('title', 'Mr.'), ('lastname', 'Chapman')]
Mr. W. Chapman, Sub-Assistant.
[('offset', '0'), ('length', '15'), ('firstname', 'G.'), ('title', 'Mr.'), ('lastname', 'McCarthy')]
Mr. G. McCarthy, Sub-Assistant, 3rd Class.
[('offset', '0'), ('length', '13'), ('firstname', 'J.'), ('title', 'Mr.'), ('lastname', 'Hussey')]
Mr. J. Hussey, Sub-Assistant, 3rd Class.
[('offset', '0'

### List entities

In [17]:
# Inspect the entity dicts built from the first credit region.
entities

[{'firstname': 'G.', 'title': 'Lieut.', 'lastname': 'Strahan'},
 {'firstname': 'C.', 'title': 'Lieut.', 'lastname': 'Strahan'},
 {'firstname': 'H.', 'title': 'Mr.', 'lastname': 'Horst'},
 {'firstname': 'G.', 'title': 'Mr.', 'lastname': 'Chill'},
 {'firstname': 'W.', 'title': 'Mr.', 'lastname': 'Chapman'},
 {'firstname': 'G.', 'title': 'Mr.', 'lastname': 'McCarthy'},
 {'firstname': 'J.', 'title': 'Mr.', 'lastname': 'Hussey'},
 {'firstname': 'R.', 'title': 'Mr.', 'lastname': 'Todd'},
 {'firstname': 'C.', 'title': 'Mr.', 'lastname': 'Tapsell'},
 {'firstname': 'F.', 'title': 'Mr.', 'lastname': 'Kitchen'},
 {'firstname': 'C.', 'title': 'Mr.', 'lastname': 'Kirk'},
 {'firstname': 'W.', 'title': 'Mr.', 'lastname': 'Stotesbury'},
 {'lastname': 'Chooramun'},
 {'firstname': 'Hurlall', 'lastname': 'Singh'}]

### Extract entities to data frame

In [18]:
# Tabulate the entities: one row per dict, one column per attribute key.
pd.DataFrame(entities)

Unnamed: 0,firstname,title,lastname
0,G.,Lieut.,Strahan
1,C.,Lieut.,Strahan
2,H.,Mr.,Horst
3,G.,Mr.,Chill
4,W.,Mr.,Chapman
5,G.,Mr.,McCarthy
6,J.,Mr.,Hussey
7,R.,Mr.,Todd
8,C.,Mr.,Tapsell
9,F.,Mr.,Kitchen


# Redundant Cells

### Parse report date

### Parse name

In [None]:
def parse_xml_attrib(s):
    """Return ``(name, '{...}')`` pairs for each ``name {key:value;}`` group in ``s``.

    Python's ``re`` spells named groups ``(?P<name>...)``. The previous pattern
    used ``(?<name>...)`` and an escaped parenthesis, so it could not be
    compiled and every call raised ``re.error``.

    Args:
        s: A ``custom`` attribute string such as ``"readingOrder {index:2;}"``.

    Returns:
        A list of ``(attr_name, attr_value)`` tuples, where ``attr_value``
        still includes its surrounding braces. Groups whose body contains
        characters other than word characters, digits, whitespace, ``:`` and
        ``;`` are not matched.
    """
    return re.findall(r"(?P<attr_name>\w*)\s(?P<attr_value>\{[\w:;\s\d]*;\})", s)

In [None]:
# Take the raw custom attribute of the fourth child of the first credit region
# as a sample string for the regex experiment below.
s = credits[0][3].attrib["custom"]
s

'readingOrder {index:2;} Role {offset:0; length:18; continued:true;title:Executive\\u0020Officer\\u0020in\\u0020Charge;}'

In [None]:
# Only the readingOrder group matches on the sample string: the Role group
# contains literal backslashes (\u0020), which the character class does not allow.
re.findall(r"(?P<attr_name>\w*)\s(?P<attr_value>\{[\w:;\s\d]*;\})", s)

[('readingOrder', '{index:2;}')]

## Define pages to delete with path

In [None]:
# NOTE(review): this pattern targets files directly under sample_data/, not the
# per-year subfolder used when loading pages — confirm which files are meant.
pages_to_delete = glob.glob("sample_data/00*.xml")

In [None]:
# Permanently remove every matched file from disk.
for stale_page in pages_to_delete:
  os.remove(stale_page)

## Split filename string demo

In [None]:
# Demo of the sort key: take the file name, keep the part before the first "_",
# and convert it to an integer.
int(pages[7].split("/")[-1].split("_")[0])

9

## Print integer demo

In [None]:
# Print the integers 0 through 4, one per line.
for i in range(5):
  print(i)

0
1
2
3
4


## Print children demo

In [None]:
# Show the direct children of the first root element.
for element in roots[0]:
    print(element)

<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80ff3149f0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x7d80ffa48bd0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80fefe0310>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x7d80fefe0540>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80feff5490>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x7d80feff56c0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80ff007880>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x7d80ff007ab0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80ff01c4a0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page'

## Print text line demo

In [None]:
# NOTE(review): hard-coded child indices; the recorded run raised
# "IndexError: child index out of range", so this path is not present in the loaded data.
for child in roots[2][2][5][7][2]:
    print(child.text)

IndexError: child index out of range