<a href="https://colab.research.google.com/github/harrylloyd-bl/hr-coleridge/blob/hr/Extract_Entities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extract Entities

## Import Packages

Import modules from the standard library to work with the file system xmls and regular expressions.


*   glob - searches for filenames that match a specific pattern
*   ElementTree - for working with xmls
*   re - for regular expressions (patterns used to find specific parts of strings)
*   os - os.remove for deleting files

[Colab Markdown Cheat Sheet](https://colab.research.google.com/notebooks/markdown_guide.ipynb)


In [1]:
import glob
import xml.etree.ElementTree as ET
import re
import os
import pandas as pd

## Combine pages

### Define report_date

In [2]:
report_date = 1865

### Define data path


In [3]:
pages = glob.glob(f"data/raw/{report_date}/00*.xml")

### Import data from path

In [4]:
pages

['data/raw/1865\\0001_1865_cover.xml',
 'data/raw/1865\\0002_1865_letter.xml',
 'data/raw/1865\\0003_1865_page_1.xml',
 'data/raw/1865\\0004_p011.xml',
 'data/raw/1865\\0005_p013.xml',
 'data/raw/1865\\0006_p014.xml',
 'data/raw/1865\\0007_p015.xml',
 'data/raw/1865\\0008_p016.xml',
 'data/raw/1865\\0009_p017.xml',
 'data/raw/1865\\0010_p018.xml',
 'data/raw/1865\\0011_p019.xml',
 'data/raw/1865\\0012_p020.xml',
 'data/raw/1865\\0013_p021.xml',
 'data/raw/1865\\0014_p022.xml',
 'data/raw/1865\\0015_p023.xml',
 'data/raw/1865\\0016_p024.xml',
 'data/raw/1865\\0017_p025.xml',
 'data/raw/1865\\0018_p026.xml',
 'data/raw/1865\\0019_p027.xml',
 'data/raw/1865\\0020_p028.xml',
 'data/raw/1865\\0021_p029.xml',
 'data/raw/1865\\0022_p030.xml',
 'data/raw/1865\\0023_p031.xml',
 'data/raw/1865\\0024_p032.xml',
 'data/raw/1865\\0025_p033.xml',
 'data/raw/1865\\0026_p034.xml']

### Order pages correctly

In [5]:
ordered_pages = sorted(pages, key=lambda x: int(x.split("\\")[-1].split("_")[0]))
ordered_pages

['data/raw/1865\\0001_1865_cover.xml',
 'data/raw/1865\\0002_1865_letter.xml',
 'data/raw/1865\\0003_1865_page_1.xml',
 'data/raw/1865\\0004_p011.xml',
 'data/raw/1865\\0005_p013.xml',
 'data/raw/1865\\0006_p014.xml',
 'data/raw/1865\\0007_p015.xml',
 'data/raw/1865\\0008_p016.xml',
 'data/raw/1865\\0009_p017.xml',
 'data/raw/1865\\0010_p018.xml',
 'data/raw/1865\\0011_p019.xml',
 'data/raw/1865\\0012_p020.xml',
 'data/raw/1865\\0013_p021.xml',
 'data/raw/1865\\0014_p022.xml',
 'data/raw/1865\\0015_p023.xml',
 'data/raw/1865\\0016_p024.xml',
 'data/raw/1865\\0017_p025.xml',
 'data/raw/1865\\0018_p026.xml',
 'data/raw/1865\\0019_p027.xml',
 'data/raw/1865\\0020_p028.xml',
 'data/raw/1865\\0021_p029.xml',
 'data/raw/1865\\0022_p030.xml',
 'data/raw/1865\\0023_p031.xml',
 'data/raw/1865\\0024_p032.xml',
 'data/raw/1865\\0025_p033.xml',
 'data/raw/1865\\0026_p034.xml']

### Define tree and root

In [6]:
trees, roots = [], []
for p in ordered_pages:
    if 'Table' not in p:
      tree = ET.parse(p)
      root = tree.getroot()

      trees.append(tree)
      roots.append(root)

### Specify combined trees and roots

In [7]:
combined_root = roots[0]
combined_tree = trees[0]  # This references its root, so when the root is updated with more children, the tree also updates

### Combine children into single list

In [9]:
for root in roots[1:]:
    for child in root:
        combined_root.append(child)  # This also updates the combined_tree

### Print combined list of children

In [9]:
for child in combined_root:
    print(child.tag)

{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata
{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-

### Save combined tree in a new file

In [10]:
ET.indent(combined_tree, space="    ")
# combined_tree.write(f"{report_date}_combined_pages.xml", encoding="UTF-8")

## Extract Attributes

In [11]:
ns = {
    "page": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance"
}

### Parse person attributes

In [19]:
def parse_attributes(attrib: str):
    attrib_pair_re = re.compile(r"(?P<tag>\w+) (?P<text>\{[\.\w\s:;\d]+\})")
    attrib_inner_re = re.compile(r"(?P<tag>\w+):(?P<text>[\.\w\s\d]+)")
    all_attribs = attrib_pair_re.findall(attrib)
    inner_found = {k:attrib_inner_re.findall(v[1:-1]) for k,v in all_attribs}
    return {k0: {k1:v1 for k1, v1 in v0} for k0, v0 in inner_found.items()}

In [20]:
parse_attributes("readingOrder {index:1;} person {offset:0; length:23;firstname:A.\u0020B.; title:Captian; lastname:Melville;} leader {offset:0; length:22;} Role {offset:24; length:9; continued:true;title:Executive\u0020Officer\u0020in\u0020Charge;}")

{'readingOrder': {'index': '1'},
 'person': {'offset': '0',
  'length': '23',
  'firstname': 'A. B.',
  'title': 'Captian',
  'lastname': 'Melville'},
 'leader': {'offset': '0', 'length': '22'},
 'Role': {'offset': '24',
  'length': '9',
  'continued': 'true',
  'title': 'Executive Officer in Charge'}}

### Parse header sections

In [28]:
type(line)

xml.etree.ElementTree.Element

In [26]:
def extract_attribute_text(line, attrib_name, attribs):
    offset, length = attribs[attrib_name]["offset"], attribs[attrib_name]["length"]
    return line[2][0].text[offset:offset + length]

In [27]:
entities = []
for i, region in enumerate(combined_root.iter(f"{{{ns['page']}}}TextRegion")):
    if "{type:heading;}" in region.attrib.get("custom", []):
        print({"child_idx": i} | region.attrib)
        survey_party_lines = []
        for line in region[1:-1]:
            line_attributes = parse_attributes(line.attrib.get("custom", []))
            print(line_attributes)
            print(line[2][0].text)
            if line[2][0].text and "TOPOGRAPHICAL" in line[2][0].text:
                survey_party_lines.append(line[2][0].text.rstrip("."))
            elif line[2][0].text and "SURVEY" in line[2][0].text:
                survey = line[2][0].text
            elif "Season" in line[2][0].text:
                season = line[2][0].text
        survey_party = ", ".join(survey_party_lines)
        # heading.append((region, survey_party))
    elif "{type:credit;}" in region.attrib.get("custom", []):
        print({"child_idx": i} | region.attrib)
        for line in region[1:-1]:  # TODO add 'continued logic'
            line_attributes = parse_attributes(line.attrib.get("custom", []))
            if "survey_party" in line_attributes:
                survey_party = extract_attribute_text(line, "survey_party", line_attributes)
            if "survey_area" in line_attributes:
                survey_area = extract_attribute_text(line, "survey_area", line_attributes)
            if "place" in line_attributes:
                place = extract_attribute_text(line, "place", line_attributes)
            print(line_attributes)
            if "person" in line_attributes:
                # print(line_attributes.get("person"))
                # print(line[2][0].text)
                entity = {}
                for attrib in line_attributes.get("person")[2:]:
                    entity[attrib[0]] = attrib[1]
                entity["survey_party"] = survey_party
                entity["survey"] = survey
                entity["season"] = season
                entities.append(entity)

{'child_idx': 13, 'id': 'r_2', 'custom': 'readingOrder {index:1;} structure {type:heading;}'}
{'readingOrder': {'index': '0'}}
EXECUTIVE SURVEYS.
{'readingOrder': {'index': '1'}, 'survey_party': {'offset': '0', 'length': '25'}}
No. 1 TOPOGRAPHICAL PARTY.
{'readingOrder': {'index': '2'}, 'survey_area': {'offset': '0', 'length': '32'}}
GWALIOR AND CENTRAL INDIA SURVEY,
{'readingOrder': {'index': '3'}}
AND
{'readingOrder': {'index': '4'}, 'survey_party': {'offset': '0', 'length': '25'}, 'survey_area': {'offset': '27', 'length': '17'}, 'place': {'offset': '27', 'length': '10', 'wikiData': 'Q3929733', 'placeName': 'RAJPOOTANA'}}
No. 7 TOPOGRAPHICAL PARTY, RAJPOOTANA SURVEY.
{'readingOrder': {'index': '5'}}
Season 1863-64.
{'child_idx': 14, 'id': 'r', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}


TypeError: slice indices must be integers or None or have an __index__ method

In [82]:
entity_df = pd.DataFrame(entities)
title = entity_df["title"]
entity_df = entity_df.drop(columns="title")
entity_df.insert(0, "title", title)
entity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         63 non-null     object
 1   firstname     42 non-null     object
 2   lastname      83 non-null     object
 3   survey_party  83 non-null     object
 4   survey        83 non-null     object
 5   season        83 non-null     object
 6   continued     1 non-null      object
dtypes: object(7)
memory usage: 4.7+ KB


In [83]:
entity_df.query("survey_party == 'No. 2 TOPOGRAPHICAL PARTY'")

Unnamed: 0,title,firstname,lastname,survey_party,survey,season,continued
22,Esq,James,Mulheran,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,
23,Mr,A.,Chamarett,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,
24,Mr,A.,Chenell,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,
25,Mr,B.,Maine,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,
26,,,Baparno,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,
27,,,Mypatrao,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,
28,Mr,,Smith,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,
29,,,Farrel,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,
30,Mr,,Chennell,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,
31,Mr,,Maine,No. 2 TOPOGRAPHICAL PARTY,HYDERABAD SURVEY.,Season 1863-64.,


In [78]:
entity_df["survey_party"].unique()

array(['No. 1 TOPOGRAPHICAL PARTY, No. 7 TOPOGRAPHICAL PARTY, RAJPOOTANA SURVEY',
       'No. 2 TOPOGRAPHICAL PARTY', 'No. 3 TOPOGRAPHICAL PARTY',
       'No. 4 TOPOGRAPHICAL PARTY', 'No. 5 TOPOGRAPHICAL PARTY',
       'No. 6 TOPOGRAPHICAL SURVEY', 'No. 7 TOPOGRAPHICAL PARTY',
       'No. 8 TOPOGRAPHICAL PARTY'], dtype=object)

### Find Person

In [12]:
# TODO Some credit sections are tables of square miles covered, these are poorly OCR'd and have been downloaded as separate tables
# Find a way to exclude these sections from network analysis

credits = []
for i, child in enumerate(combined_root.iter("{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextRegion")):
  if "structure {type:credit;}" in child.attrib.get("custom", []):
    print(child.attrib)
    try:
      print(child[1][2][0].text)
      credits.append(child)
    except IndexError:
      print("Error in Credit {i}")
    # if child.tag.split("}")[1] == "Page":
    #     for region in child:
    #         if "structure {type:credit;}" in region.attrib.get("custom", []):
    #             credits.append(region)

{'id': 'r_2', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_3', 'custom': 'readingOrder {index:1;} structure {type:credit;}'}
No. 1 PARTY.
{'id': 'r_3', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_4', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
Square miles.
{'id': 'r_5', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
Square miles.
{'id': 'r_4', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_9', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_10', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
Mr. McGill,
{'id': 'r_4', 'custom': 'readingOrder {index:2;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_5', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
Square miles
{'id': 'r_4', 'custom': 'readingOrder {index:3;} structure {type:credit;}'}
PERSONNEL.
{'id': 'r_5', 'custom': 'readin

### Extract person attributes

In [None]:
credits = []

In [14]:
for line in credits[0].iter("{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}TextLine"):
    line_attributes = parse_attributes(line.attrib.get("custom", []))
    if "person" in line_attributes:
        print(line_attributes.get("person"))
        print(line[2][0].text)
        entity = {}
        for attrib in line_attributes.get("person")[2:]:
          entity[attrib[0]] = attrib[1]
        entities.append(entity)

TypeError: '_Printer' object is not subscriptable

### Create entities list

In [15]:
entities = []

### List entities

In [17]:
entities

[{'firstname': 'G.', 'title': 'Lieut.', 'lastname': 'Strahan'},
 {'firstname': 'C.', 'title': 'Lieut.', 'lastname': 'Strahan'},
 {'firstname': 'H.', 'title': 'Mr.', 'lastname': 'Horst'},
 {'firstname': 'G.', 'title': 'Mr.', 'lastname': 'Chill'},
 {'firstname': 'W.', 'title': 'Mr.', 'lastname': 'Chapman'},
 {'firstname': 'G.', 'title': 'Mr.', 'lastname': 'McCarthy'},
 {'firstname': 'J.', 'title': 'Mr.', 'lastname': 'Hussey'},
 {'firstname': 'R.', 'title': 'Mr.', 'lastname': 'Todd'},
 {'firstname': 'C.', 'title': 'Mr.', 'lastname': 'Tapsell'},
 {'firstname': 'F.', 'title': 'Mr.', 'lastname': 'Kitchen'},
 {'firstname': 'C.', 'title': 'Mr.', 'lastname': 'Kirk'},
 {'firstname': 'W.', 'title': 'Mr.', 'lastname': 'Stotesbury'},
 {'lastname': 'Chooramun'},
 {'firstname': 'Hurlall', 'lastname': 'Singh'}]

### Extract entities to data frame

In [18]:
pd.DataFrame(entities)

Unnamed: 0,firstname,title,lastname
0,G.,Lieut.,Strahan
1,C.,Lieut.,Strahan
2,H.,Mr.,Horst
3,G.,Mr.,Chill
4,W.,Mr.,Chapman
5,G.,Mr.,McCarthy
6,J.,Mr.,Hussey
7,R.,Mr.,Todd
8,C.,Mr.,Tapsell
9,F.,Mr.,Kitchen


# Redundant Cells

### Parse report date

### Parse name

In [None]:
def parse_xml_attrib(s):
    return re.findall(r"(?<attr_name>\w*)\s\(?<attr_value>{[\w:\s\d]*;\})", s)

In [None]:
s = credits[0][3].attrib["custom"]
s

'readingOrder {index:2;} Role {offset:0; length:18; continued:true;title:Executive\\u0020Officer\\u0020in\\u0020Charge;}'

In [None]:
re.findall(r"(?P<attr_name>\w*)\s(?P<attr_value>\{[\w:;\s\d]*;\})", s)

[('readingOrder', '{index:2;}')]

## Define pages to delete with path

In [None]:
pages_to_delete = glob.glob("sample_data/00*.xml")

In [None]:
for page in pages_to_delete:
  os.remove(page)

## Split filename string demo

In [None]:
int(pages[7].split("/")[-1].split("_")[0])

9

## Print integer demo

In [None]:
for i in [0,1,2,3,4]:
  print(i)

0
1
2
3
4


## Print children demo

In [None]:
for child in roots[0]:
    print(child)

<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80ff3149f0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x7d80ffa48bd0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80fefe0310>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x7d80fefe0540>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80feff5490>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x7d80feff56c0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80ff007880>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page' at 0x7d80ff007ab0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Metadata' at 0x7d80ff01c4a0>
<Element '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}Page'

## Print text line demo

In [None]:
for child in roots[2][2][5][7][2]:
    print(child.text)

IndexError: child index out of range