In [157]:
import yaml

In [158]:
import csv

# Helper functions

In [164]:
from arpeggio import Optional, ZeroOrMore, OneOrMore, EOF, UnorderedGroup, OrderedChoice, Sequence
from arpeggio import RegExMatch as _
from arpeggio import ParserPython

def ordinal():
    return _('\d+(st|nd|rd|th)')

def punct():
    return _("[.,;:]")

def integer():
    return _("\d+")

def list_sep():
    return [(",", "and"), ",", "and"]

def quoted(x):
    return ('"', x, '"')

def parens(x):
    return ("(", x, ")")

def split_words(x):
    return tuple(x.split(' '))

def quoted_letter():
    """Quoted letter, e.g. "A", often used for companies."""
    return "\"", _("[A-Z]"), Optional(punct), "\""

Test unit type parser

# Navy Ships

In [169]:
NAVY_ORG = ["UNITED STATES NAVY", ]

SHIP_NAMES = [
    "Conestoga",
    "Lexington",
    "Tyler",
    "Brooklyn",
    "Octorora",
    "Hartford",
    "Ossippee",
    "Itasca",
    "Oneida",
    "Galena",
    "Metacomet",
    "Richmond",
    "Port Royal",
    "Lackawanna",
    "Seminole",
    "Monongahela",
    "Tecumseh"
]

def ship():
    return quoted(SHIP_NAMES)

def ship_list():
    """Lists of Ships

    - U. S. Gunboats "Conestoga", "Lexington" and "Tyler".

    """
    return ship(), ZeroOrMore(Optional(list_sep), ship())

def navy_units():
    return NAVY_ORG, _('-+'), ship_list()

# Army Units

In [170]:
def detachments():
    """Detachments pattern
    
    - (Detachments)
    
    """
    return parens(_("Detachments?"))


def section():
    return parens("Section")


def battalion():
    return parens((Optional(ordinal), "Battalion"))


def mounted():
    return parens("Mounted")


def named_companies():
    """List of Company Names.
    
    - (Co. "A")
    - (Cos. "A" and "B")
    - (Cos. "A", "B", and "C")
    
    """
    return "(", _("Cos?\.?"), quoted_letter, ZeroOrMore(list_sep, quoted_letter), ")"


def num_companies():
    """Number of companies
    
    - (4 Cos.)
    - (1 Co.)
    
    """
    return "(", integer, _("Cos?\.?"), ")"


def companies():
    return [named_companies, num_companies]


print(ParserPython(named_companies).parse('(Cos. "A")'))
print(ParserPython(named_companies).parse('(Cos. "A", "B", and "C")'))
print(ParserPython(num_companies).parse('(4 Cos.)'))


( | Cos. | " | A | " | )
( | Cos. | " | A | " | , | " | B | " | , | and | " | C | " | )
( | 4 | Cos. | )


In [171]:
def one_battery():
    return "Battery", quoted_letter

def multiple_batteries():
    return "Batteries", OneOrMore(Optional(list_sep), quoted_letter)

def artillery_battery():
    """Artillery Batteries.
        
    - "Battery \"B\", 1st Light Artillery (Section)"
    
    """
    return (Optional("Independent"),
            [one_battery, multiple_batteries], 
            Optional(punct), 
            Optional(ordinal), 
            Optional("Independent"),
            Optional(["Light", "Heavy"]), "Artillery",
            Optional([section, detachments])
           )

ex = "Battery \"B\", 1st Light Artillery"
print(ParserPython(artillery).parse(ex))
ex = "Battery \"B\", 1st Light Artillery (Section)"
print(ParserPython(artillery).parse(ex))

Battery | " | B | " | , | 1st | Light | Artillery
Battery | " | B | " | , | 1st | Light | Artillery | ( | Section | )


In [172]:
UNIT_TYPE = [
    (Optional(["Middle", "East", "Vidette"]), "Cavalry"),
    (Optional(["Mounted", "Colored"]), "Infantry"),
    (Optional("Independent"),
     Optional(_("Batter(y|ies)")),
     Optional(["Heavy", "Light"]), "Artillery")
]

def unit_note():
    """Unit notes.
    
    Notes that appear in parentheses after a unit name.
    """
    return parens([detachments, battalion, companies, mounted])

def ordinal_unit():
    return ordinal, Optional(unit_note)

def ordinal_unit_list():
    return (ordinal_unit(),
            ZeroOrMore(list_sep(), ordinal_unit()),
            UNIT_TYPE, 
            Optional(unit_note))

for i in ("1st, 2nd, and 4th Infantry", 
          "22nd and 44th Cavalry",
          "6th and 12th Independent Batteries Light Artillery",
          "3rd Cavalry"
         ):
    print(ParserPython(ordinal_unit_list).parse(i))
    


1st | , | 2nd | , | and | 4th | Infantry
22nd | and | 44th | Cavalry
6th | and | 12th | Independent | Batteries | Light | Artillery
3rd | Cavalry


Miscellaneous stuff after the troops engaged:

In [173]:
def other_text():
    return [
    "By State Troops",
    "Gen. R. L. McCook and Escort. Gen. McCook killed.",
    "(No Reports.)",
    "(Confederate Reports.)",
    "Train Guard",
    "Naval Attack on Blockade Runner",
    "(No Details.)",
    "Scouting party",
    "Foraging party",
    "By Confederate Forces",
    "To U. S. Forces",
    "Surrender of Confederate Forces in Departments of Alabama, Mississippi and Eastern Louisiana.",
    "Explosion of Ordnance Depot.",
    "United States forces",
    _("Attack on U. S. Steamer .*"),
    "(See Force in Campaign against Mobile.)",
    "13th Army Corps.",
    "Picket Attack.",
    "Pickets",
    "Pickets, 2nd Brigade, 1st Division, 16th Corps",
    "picket attack",
    "(Destruction of Salt Works.)",
    "Union Indians, under Opothleyholo."
    ]

In [174]:
def union_losses():
    return _("(Union loss|Loss).*")

def troops_engaged():
    return (Optional(unit_list()),
            Optional(punct), 
            Optional(other_text), 
            Optional(punct), 
            Optional(union_losses), 
            EOF)

In [175]:
import datetime
import re
import os
from ruamel.yaml import YAML
yaml = YAML(typ='safe')
yaml.default_flow_style = False
yaml.representer.ignore_aliases = lambda x: True

def iter_files():
    outdir = 'engagements'
    for filename in os.listdir(outdir):
        fn = os.path.join(outdir, filename)
        state, ext = os.path.splitext(fn)
        state = state.split("/")[1]
        if ext == '.yml':
            with open(fn, 'r') as fp:
                data = yaml.load(fp)
            yield (state, data)


In [176]:
from arpeggio import NoMatch
parser = ParserPython(troops_engaged)

dyer = dict(iter_files())
for engagement in dyer["Arkansas"]:
    try:
        text = engagement['troops_engaged']
    except KeyError:
        continue
    try:
        parser.parse(text)
    except NoMatch as e:
        print(engagement['slug'], text)
        print(e)
        stop

    

Arkansas/1862-03-18-skirmish-salem-spring-river IOWA--3rd Cavalry (Detachment). MISSOURI--6th Cavalry (Detachment). Union loss, 5 killed, 10 wounded. Total, 15.
Expected '(' or '(' or '(' or '(' or '(' at position (1, 20) => ' Cavalry (*Detachment'.


NameError: name 'stop' is not defined