In [10]:
from arpeggio import RegExMatch as _
from arpeggio import ZeroOrMore, OneOrMore, And, Optional, Not
from arpeggio import ParserPython, NoMatch, EOF
import yaml
import re

In [3]:
# Load the list of divisions from the category definition file.
# safe_load is used instead of yaml.load: calling yaml.load without an explicit
# Loader is deprecated (and rejected by PyYAML >= 6), and the default loader
# can construct arbitrary Python objects from the file.
with open("Categories.yml", "r") as f:
    divisions = yaml.safe_load(f)

In [4]:
# Map each category abbreviation to the name of the division that lists it.
# An entry of a division's 'categories' list is either a plain string or a
# [category, [subcategory, ...]] pair (the shape the category_lookup cell
# below also handles).  A pair is a list and cannot be used as a dict key, so
# it is registered under its category and under each "category subcategory".
categories = {}
for d in divisions:
    for cat in d['categories']:
        if isinstance(cat, list):
            parent, subcats = cat[0], cat[1]
            categories[parent] = d['name']
            for subcat in subcats:
                categories[parent + ' ' + subcat] = d['name']
        else:
            categories[cat] = d['name']
        

- born
- died
- flourished
- lived



In [5]:
from arpeggio import PTNodeVisitor, visit_parse_tree

class BioVisitor(PTNodeVisitor):
    """Convert a parse tree of the ``bio`` grammar into a plain dictionary.

    Every ``visit_<rule>`` method receives the parse-tree ``node`` for that
    rule and ``children``, the already-visited results of its sub-rules, and
    returns the value handed to the enclosing rule.  ``visit_bio`` returns the
    final record.
    """

    @staticmethod
    def _label(event, modifiers):
        """Return ``event``, or ``"<event> <modifier>"`` using the first modifier."""
        if len(modifiers):
            return event + ' ' + str(modifiers[0])
        return event

    @staticmethod
    def _merge(children):
        """Return the first child dict, updated in place with the second if present."""
        out = children[0]
        if len(children) > 1:
            out.update(children[1])
        return out

    def visit_after(self, node, children):
        """Spell out the ``after`` modifier."""
        return "after"

    def visit_about(self, node, children):
        """Spell out the ``about`` modifier."""
        return "about"

    def visit_integer(self, node, children):
        """Return the matched text as an ``int``."""
        return int(str(node))

    def visit_century(self, node, children):
        """Convert a century expression into a mid-century year.

        Returns:
            ``{'value': <year>, 'century': True}``.  One century ``n`` gives
            ``n * 100 - 50``; two centuries give the mean of their mid-points.
            The value is negated when a ``bc`` child is present.

        Raises:
            ValueError: if the number of ``integer`` children is not 1 or 2.
        """
        bc = len(children.bc) > 0
        if self.debug:
            print(bc)
        # NOTE(review): this by-name lookup may only see integers that are
        # direct children of the rule; confirm that the second number of an
        # "N or M Cent." entry is included.
        values = children.integer
        if len(values) == 2:
            value = sum([x - 0.5 for x in values]) / len(values) * 100
        elif len(values) == 1:
            value = values[0] * 100 - 50
        else:
            # Fail with a clear message instead of an unbound local below.
            raise ValueError(
                f"century expression needs one or two integers, got {len(values)}"
            )
        if bc:
            value *= -1
        return {'value': value, 'century': True}

    def visit_year(self, node, children):
        """Return ``{'value': <year>, 'century': False}`` for a year expression.

        The first child is the year number; the presence of any further child
        negates it.
        """
        if self.debug:
            print(children)
            print(node)
        value = children[0]
        if len(children) > 1:
            value *= -1
        return {'value': value, 'century': False}

    def visit_name(self, node, children):
        """Join the pieces of the name with single spaces."""
        return ' '.join(str(x) for x in children)

    def visit_category(self, node, children):
        """Join the category pieces with single spaces."""
        category = ' '.join(children)
        return category

    def visit_died_prefix(self, node, children):
        """Return ``'died'``, followed by the modifier when one was parsed."""
        return self._label('died', children)

    def visit_died(self, node, children):
        """Return ``{<died label>: <date dict>}``."""
        return {children[0]: children[1]}

    def visit_born_prefix(self, node, children):
        """Return ``'born'``, followed by the modifier when one was parsed."""
        if self.debug:
            print(node)
            print(children)
        return self._label('born', children)

    def visit_born(self, node, children):
        """Return ``{<born label>: <date dict>}``."""
        if self.debug:
            print(node)
            print(children)
        return {children[0]: children[1]}

    def visit_lived_prefix(self, node, children):
        """Return ``'lived'`` or ``'lived after'``.

        The modifier is found by value rather than by a fixed position:
        reading a fixed slot raises IndexError whenever the number of children
        differs from what that slot assumes.  ``visit_after`` always yields
        the string ``"after"``, so matching on it is unambiguous.
        """
        modifiers = [child for child in children if child == "after"]
        return self._label('lived', modifiers)

    def visit_lived(self, node, children):
        """Return ``{<lived label>: <date dict>}``."""
        return {children[0]: children[1]}

    def visit_flourished_prefix(self, node, children):
        """Return ``'flourished'``, followed by the modifier when one was parsed."""
        return self._label('flourished', children)

    def visit_flourished(self, node, children):
        """Return ``{<flourished label>: <date dict>}``."""
        return {children[0]: children[1]}

    def visit_age(self, node, children):
        """Return ``{'age': n}`` or ``{'age <modifier>': n}``."""
        if self.debug:
            print(node)
            print(children)
        if len(children) > 1:
            out = {'age ' + children[0]: children[1]}
        else:
            out = {'age': children[0]}
        if self.debug:
            print(out)
        return out

    def visit_period(self, node, children):
        """Return ``None`` for a full stop, so it carries no value."""
        return None

    def visit_born_expr(self, node, children):
        """Merge the birth entry with the optional trailing entry."""
        return self._merge(children)

    def visit_died_expr(self, node, children):
        """Merge the death entry with the optional trailing entry."""
        if self.debug:
            print(node)
            print(children)
        return self._merge(children)

    def visit_flourished_expr(self, node, children):
        """Merge the flourished entry with the optional trailing entry."""
        if self.debug:
            print(node)
            print(children)
        return self._merge(children)

    def visit_bio(self, node, children):
        """Assemble the final record for one biography line.

        Starts from the life-event dict (second child) and adds ``name`` plus,
        when a category was parsed, ``category``, optional ``subcategory`` and
        the ``division`` looked up in the module-level ``categories`` dict.
        Entries without a category get the division 'Statesman and Warriors'.

        Raises:
            KeyError: if the category is not a key of ``categories``.
        """
        out = children[1]
        out['name'] = children.name[0]
        if len(children.category):
            # NOTE(review): the category text is split on spaces, so a
            # multi-word category such as "H P. Ion" yields category "H" and
            # subcategory "P." -- confirm this is intended.
            category = children.category[0].split(' ')
            out['category'] = category[0]
            if len(category) > 1:
                out['subcategory'] = category[1]
            out['division'] = categories[out['category']]
        else:
            out['division'] = 'Statesman and Warriors'
        return out
    

def word():
    """Grammar rule: a run of non-whitespace characters, rejected if the input here starts with ``d.``, ``fl.`` or ``b.``."""
    return Not(["d.", "fl.", "b."]), _(r"\S+")

def punct():
    """Grammar rule: one punctuation mark out of comma, full stop and the two parentheses."""
    marks = ",.()"
    # One single-character alternative per mark, in the order written above.
    return list(marks)

def integer():
    """Grammar rule: one or more decimal digits."""
    return _("[0-9]+")

def name():
    """Grammar rule: one or more ``word`` or ``punct`` items."""
    return OneOrMore([word, punct])

def after():
    """Grammar rule: the literal ``af.``."""
    return "af."
    
def about(): 
    """Grammar rule: the literal ``ab.``."""
    return "ab."

def above():
    """Grammar rule: the literal ``above``."""
    return "above"

def before():
    """Grammar rule: the literal ``before``."""
    return "before"
    
def bc():
    """Grammar rule: the literal ``BC.``."""
    return "BC."

def period():
    """Grammar rule: a single full stop."""
    return "."

def year():
    """Grammar rule: an ``integer`` optionally followed by ``bc``."""
    return integer, Optional(bc)

def ordinal():
    """Grammar rule: one or more decimal digits.

    The pattern is wrapped in a regex match and written as a raw string: a
    bare string in a grammar function is matched literally, character by
    character, and a backslash-d in a normal string is an invalid escape.
    """
    return _(r"\d+")

def century():
    """Grammar rule: optional ``in``, optional ``about``, an ``integer``, optional ``or`` + ``integer``, the literal ``Cent.``, optional ``bc``."""
    return Optional("in"), Optional(about), integer, Optional("or", integer), "Cent.", Optional(bc)

def born_prefix():
    """Grammar rule: the literal ``b.`` optionally followed by ``about``."""
    return "b.", Optional(about)

def born():
    """Grammar rule: ``born_prefix`` followed by a ``year``."""
    return born_prefix, year

def lived_prefix():
    """Grammar rule: the literal ``l.`` or ``liv.`` optionally followed by ``after``."""
    return ["l.", "liv."], Optional(after)

def lived():
    """Grammar rule: ``lived_prefix`` followed by a ``year``."""
    return lived_prefix, year

def died_prefix():
    """Grammar rule: the literal ``d.`` optionally followed by ``after`` or ``about``."""
    return "d.", Optional([after, about])

def died():
    """Grammar rule: ``died_prefix`` followed by a ``year``."""
    return died_prefix, year

def flourished_prefix():
    """Grammar rule: the literal ``fl.`` optionally followed by ``after``, ``about`` or ``before``."""
    return "fl.", Optional([after, about, before]),

def flourished():
    """Grammar rule: ``flourished_prefix`` followed by a ``century`` or, failing that, a ``year``."""
    return flourished_prefix, [century, year]
        
def age():
    """Grammar rule: an ``integer`` optionally preceded by ``about`` or ``above``."""
    return Optional([about, above]), integer

def born_expr():
    """Grammar rule: ``born``, an optional full stop, then optionally one of ``age``, ``lived`` or ``died``."""
    return born, Optional(period), Optional([age, lived, died])
                          
def died_expr():
    """Grammar rule: ``died``, an optional full stop, then optionally one of ``age``, ``lived`` or ``born``."""
    return died, Optional(period), Optional([age, lived, born])
                          
def flourished_expr():
    """Grammar rule: ``flourished``, an optional full stop, then optionally one of ``age``, ``lived`` or ``died``."""
    return flourished, Optional(period), Optional([age, lived, died])

def category():
    """Grammar rule: one category abbreviation.

    The alternatives are returned as a list, in the order below.  Longer
    abbreviations are listed ahead of the shorter ones they begin with
    (for example ``Bell`` before ``Bel``, ``H P. Ion`` before ``H P.``
    before ``H``).
    """
    sects = [
        "Ion", "Soc", "Cyr", "Meg", "Eleat",
        "Ac", "Per", "Sto", "Cyn", "Ital",
        "Scept", "Ep", "Eleack",
    ]
    plain = [
        "Bell", "Trav", "Moh", "Met", "Mor", "Pol", "Chy", "Act", "Eng",
        "Geo", "Ant", "Bel",
        "Ph", "Po", "Pa", "St", "Mu", "Pr", "Ar", "Cr", "Or", "Ch",
        "P", "H", "L", "J", "F", "D", "M", "R",
    ]

    choices = ["Engineer"]
    # "H P." combined with each sect comes before the bare "H P.".
    for sect in sects:
        choices.append(f"H P. {sect}")
    choices.append("H P.")
    choices.extend(plain)
    return choices

def bio():
    """Root grammar rule for one biography line.

    A ``name``, then one of ``died_expr``, ``flourished_expr`` or
    ``born_expr`` (tried in that order), an optional full stop, an optional
    ``category``, another optional full stop, and the end of the input.
    """
    return (name, [died_expr, flourished_expr, born_expr],
            Optional(period), 
            Optional(category), 
            Optional(period), 
            EOF)

# Build the parser with ``bio`` as its root rule.
parser = ParserPython(bio)

In [6]:
import yaml
# Build a lookup from category (and from "category subcategory") to the name
# of the division that lists it.
category_lookup = {}
with open("Categories.yml", "r") as fp:
    # safe_load instead of yaml.load: a Loader-less yaml.load is deprecated
    # (and rejected by PyYAML >= 6) and can construct arbitrary objects.
    data = yaml.safe_load(fp)

# The file is fully read above, so it does not need to stay open for the loop.
# NOTE: `data` is rebound to the list of parsed records by a later cell.
for x in data:
    for cat in x['categories']:
        if isinstance(cat, list):
            # [category, [subcategory, ...]]: register each combination and
            # the bare category.
            for subcat in cat[1]:
                category_lookup[cat[0] + ' ' + subcat] = x['name']
            category_lookup[cat[0]] = x['name']
        else:
            category_lookup[cat] = x['name']

In [56]:
# Parse every line of the source text into a record; lines the grammar cannot
# match are reported and skipped.
data = []
with open("priestly1778.txt", "r") as f:
    # Iterate the file lazily rather than materialising it with readlines().
    for line in f:
        entry = line.strip()
        try:
            parse_tree = parser.parse(entry)
        except NoMatch:
            print("ERROR:", line)
            continue
        # Errors raised while visiting the tree (e.g. IndexError) propagate
        # unchanged; catching one only to re-raise it added nothing.
        parsed = visit_parse_tree(parse_tree, BioVisitor())
        parsed['text'] = entry
        data.append(parsed)

# Base unit, in years, for the uncertainty margins applied below.
DOTS_YEARS = 10

# A life-event key is an event name optionally followed by one modifier word
# ("born", "died after", "age about", ...).  Matching the whole key keeps the
# derived "born_1"/"died_2"-style keys out of life_type, so re-running this
# cell on already-processed records classifies them the same way.
LIFE_EVENT_KEY = r"(died|born|lived|flourished|age)( \w+)?"

# Derive born_1/born_2/died_1/died_2 for every record from the combination of
# life events that was parsed for it.
# NOTE(review): presumably *_1 is the primary estimate and *_2 the far edge of
# the uncertain range -- confirm against whatever consumes the JSON output.
for person in data:
    life_type = tuple(sorted(k for k in person if re.fullmatch(LIFE_EVENT_KEY, k)))

    if life_type in (("age", "born"), ("age about", "born")):
        # "about" on the age is not treated as extra uncertainty.
        person['born_1'] = person['born']['value']
        person['died_1'] = person['born_1'] + person[life_type[0]]
    elif life_type in (("age", "died"), ("age", "died about"),
                       ("age about", "died"), ("age about", "died about")):
        # "about" is not treated as extra uncertainty on either value.
        person['died_1'] = person[life_type[1]]['value']
        person['born_1'] = person['died_1'] - person[life_type[0]]
    elif life_type in (("age", "died after"), ("age about", "died after")):
        # The open-ended death date propagates to the birth date via the age.
        person['died_1'] = person['died after']['value']
        person['died_2'] = person['died_1'] + DOTS_YEARS
        person['born_1'] = person['died_2'] - person[life_type[0]]
        person['born_2'] = person['died_1'] - person[life_type[0]]
    elif life_type == ("age", "flourished"):
        # Exact age, with the flourishing date placed two thirds of the way
        # through the life: birth lies before it, death after it.
        fl = person['flourished']['value']
        age = person['age']
        person["born_1"] = fl - (2 / 3) * age
        person["died_1"] = fl + (1 / 3) * age
    elif life_type == ("age above", "born"):
        person['born_1'] = person['born']['value']
        person['died_1'] = person['born_1'] + person['age above']
        person['died_2'] = person['died_1'] + DOTS_YEARS
    elif life_type == ("age above", "died"):
        person['died_1'] = person['died']['value']
        person['born_1'] = person['died_1'] - person['age above']
        person['born_2'] = person['born_1'] - DOTS_YEARS
    elif life_type == ("age above", "died after"):
        person['died_1'] = person['died after']['value']
        person['died_2'] = person['died_1'] + DOTS_YEARS
        person['born_1'] = person['died_2'] - person['age above']
        person['born_2'] = person['born_1'] - 2 * DOTS_YEARS
    elif life_type in (("born",), ("born about",)):
        person["born_1"] = person[life_type[0]]["value"]
        person["died_1"] = person["born_1"] + 3 * DOTS_YEARS
        person["died_2"] = person["died_1"] + 3 * DOTS_YEARS
    elif life_type == ("born", "died"):
        person['died_1'] = person['died']['value']
        person['born_1'] = person['born']['value']
    elif life_type == ("born", "died after"):
        person['born_1'] = person['born']['value']
        person['died_1'] = person['died after']['value']
        person['died_2'] = person['died_1'] + DOTS_YEARS
    elif life_type == ("born", "lived after"):
        person["born_1"] = person["born"]["value"]
        person["died_1"] = person["lived after"]['value']
        person["died_2"] = person["died_1"] + DOTS_YEARS
    elif life_type == ("born about", "died"):
        person['died_1'] = person['died']['value']
        # The birth year is the parsed value itself; the entry is a dict, so
        # it cannot be subtracted from the death year.
        person['born_1'] = person['born about']['value']
    elif life_type == ("born before",):
        # Read the key that identifies this case; a plain "born" key is not
        # present here.
        person["born_1"] = person["born before"]["value"]
        person["born_2"] = person["born_1"] - 1 * DOTS_YEARS
        person["died_1"] = person["born_1"] + 3 * DOTS_YEARS
        person["died_2"] = person["died_1"] + 3 * DOTS_YEARS
    elif life_type in (("died",), ("died about",)):
        person['died_1'] = person[life_type[0]]['value']
        person['born_1'] = person['died_1'] - 3 * DOTS_YEARS
        # The earlier bound belongs to the birth range: stored as died_2 it
        # would place a death bound before born_1.
        person['born_2'] = person['died_1'] - 4 * DOTS_YEARS
    elif life_type == ("died after",):
        person['died_1'] = person['died after']['value']
        person['died_2'] = person['died_1'] + DOTS_YEARS
        person['born_1'] = person['died_1'] - 2 * DOTS_YEARS
        person['born_2'] = person['born_1'] - 4 * DOTS_YEARS
    elif life_type in (("flourished",), ("flourished after",),
                       ("flourished before",)):
        # "after" and "before" are treated the same as a plain flourished date.
        fl = person[life_type[0]]['value']
        person["born_1"] = fl - 2 * DOTS_YEARS
        person["born_2"] = fl - 3 * DOTS_YEARS
        person["died_1"] = fl + DOTS_YEARS
        person["died_2"] = person["died_1"] + 2 * DOTS_YEARS
    elif life_type == ("flourished about",):
        # Only the outer bounds are set for this case.
        person["born_2"] = person["flourished about"]['value'] - 5 * DOTS_YEARS
        person["died_2"] = person["flourished about"]['value'] + 3 * DOTS_YEARS
    else:
        print("unknown type: ", life_type)

In [55]:
import json

# Write the enriched records to disk as JSON, indented one space per level.
outfile = "priestly_1778.json"
with open(outfile, mode="w") as f:
    json.dump(obj=data, fp=f, indent=1)