In [1]:
import yaml

In [2]:
import csv

In [5]:
import datetime
import re
import os
from ruamel.yaml import YAML
yaml = YAML(typ='safe')
yaml.default_flow_style = False
yaml.representer.ignore_aliases = lambda x: True

def iter_files():
    outdir = 'engagements'
    for filename in os.listdir(outdir):
        fn = os.path.join(outdir, filename)
        state, ext = os.path.splitext(fn)
        state = state.split("/")[1]
        if ext == '.yml':
            with open(fn, 'r') as fp:
                data = yaml.load(fp)
            yield (state, data)

In [6]:
dyer = dict(iter_files())

- force name
- detachment
- type

In [62]:
from arpeggio import Optional, ZeroOrMore, OneOrMore, EOF, UnorderedGroup, OrderedChoice, Sequence
from arpeggio import RegExMatch as _
from arpeggio import ParserPython

def ordinal():
    return _('\d+(st|nd|rd|th)')

def punct():
    return _("[.,;:]")

def integer():
    return _("\d+")

def list_sep():
    return [(",", "and"), ",", "and"]

def quoted(x):
    return ('"', x, '"')

def parens(x):
    return ("(", x, ")")


Test unit type parser

In [99]:
NAVY_ORG = ["UNITED STATES NAVY", ]

SHIP_NAMES = [
    "Conestoga", 
    "Lexington",
    "Tyler",
    "Brooklyn", 
    "Octorora", 
    "Hartford", 
    "Ossippee",
    "Itasca", 
    "Oneida", 
    "Galena", 
    "Metacomet", 
    "Richmond", 
    "Port Royal", 
    "Lackawanna",
    "Seminole", 
    "Monongahela",
    "Tecumseh"
]

def ship():
    return quoted(SHIP_NAMES)

def ship_list():
    """Lists of Ships
    
    - U. S. Gunboats "Conestoga", "Lexington" and "Tyler".
    
    """
    return ship(), ZeroOrMore(Optional(list_sep), ship())

def navy_units():
    return NAVY_ORG, _('-+'), ship_list()

In [100]:
ParserPython(ship_list).parse('"Conestoga", "Seminole"')

[  '"' [0],  'Conestoga' [1],  '"' [10], [  ',' [11] ],  '"' [13],  'Seminole' [14],  '"' [22] ]

In [101]:
ARMY_ORG = [
  ("ALABAMA", Optional(("and", "TENNESSEE"))),
 'ARKANSAS',
 'CALIFORNIA',
 'COLORADO',
 'CONNECTICUT',
 'DAKOTA',
 'DELAWARE',
 'DISTRICT OF COLUMBIA',
 'FLORIDA',
 'IDAHO TERRITORY',
 'ILLINOIS',
 ("INDIANA", Optional("LEGION")),
 'IOWA',
 'KANSAS',
 'KENTUCKY',
 'LOUISIANA',
 'MAINE',
 'MARYLAND',
 'MASSACHUSETTS',
 'MICHIGAN',
 'MINNESOTA',
 ('MISSISSIPPI', Optional(("MARINE", "BRIGADE"))),
 'MISSOURI',
 'NEBRASKA',
 'NEVADA',
 'NEW HAMPSHIRE',
 'NEW JERSEY',
 'NEW MEXICO',
 'NEW YORK',
 'NORTH CAROLINA',
 'OHIO',
 'OREGON',
 'PENNSYLVANIA',
 'RHODE ISLAND',
 'SOUTH CAROLINA',
 'TENNESSEE',
 'TEXAS',
 ('UNITED', 'STATES', Optional([("COLORED", "TROOPS")])),
 'VERMONT',
 'VIRGINIA',
 'WASHINGTON',
 'WEST VIRGINIA',
 'WISCONSIN']

In [127]:
def other_text():
    return [
    "By State Troops",
    "Gen. R. L. McCook and Escort. Gen. McCook killed.",
    "(No Reports.)",
    "(Confederate Reports.)",
    "Train Guard",
    "Naval Attack on Blockade Runner",
    "(No Details.)",
    "Scouting party",
    "Foraging party",
    "By Confederate Forces",
    "To U. S. Forces",
    "Surrender of Confederate Forces in Departments of Alabama, Mississippi and Eastern Louisiana.",
    "Explosion of Ordnance Depot.",
    "United States forces",
    _("Attack on U. S. Steamer .*"),
    "(See Force in Campaign against Mobile.)",
    "13th Army Corps.",
    "Picket Attack.",
    "Pickets",
    "Pickets, 2nd Brigade, 1st Division, 16th Corps",
    "picket attack",
    "(Destruction of Salt Works.)",
    "Union Indians, under Opothleyholo."
    ]


def quoted_letter():
    return "\"", _("[A-Z]"), Optional(punct), "\""


def detachments():
    """Detachments pattern
    
    - (Detachments)
    
    """
    return parens(_("Detachments?"))


def section():
    return parens("Section")


def battalion():
    return parens((Optional(ordinal), "Battalion"))


def mounted():
    return parens("Mounted")


def named_companies():
    """List of Company Names.
    
    - (Co. "A")
    - (Cos. "A" and "B")
    - (Cos. "A", "B", and "C")
    
    """
    return "(", _("Cos?\.?"), quoted_letter, ZeroOrMore(list_sep, quoted_letter), ")"


def num_companies():
    """Number of companies
    
    - (4 Cos.)
    - (1 Co.)
    
    """
    return "(", integer, _("Cos?\.?"), ")"


def companies():
    return [named_companies, num_companies]


print(ParserPython(named_companies).parse('(Cos. "A")'))
print(ParserPython(named_companies).parse('(Cos. "A", "B", and "C")'))
print(ParserPython(num_companies).parse('(4 Cos.)'))


( | Cos. | " | A | " | )
( | Cos. | " | A | " | , | " | B | " | , | and | " | C | " | )
( | 4 | Cos. | )


In [128]:
def one_battery():
    return "Battery", quoted_letter

def multiple_batteries():
    return "Batteries", OneOrMore(Optional(list_sep), quoted_letter)

def artillery():
    """Artillery Batteries.
        
    - "Battery \"B\", 1st Light Artillery (Section)"
    
    """
    return (Optional("Independent"),
            [one_battery, multiple_batteries], 
            Optional(punct), 
            Optional(ordinal), 
            Optional("Light"), "Artillery",
            Optional([section, detachment])
           )

ex = "Battery \"B\", 1st Light Artillery"
print(ParserPython(artillery).parse(ex))
ex = "Battery \"B\", 1st Light Artillery (Section)"
print(ParserPython(artillery).parse(ex))

Battery | " | B | " | , | 1st | Light | Artillery
Battery | " | B | " | , | 1st | Light | Artillery | ( | Section | )


In [129]:
UNIT_TYPE = [
    (Optional(["Middle", "East", "Vidette"]), "Cavalry"),
    (Optional(["Mounted", "Colored"]), "Infantry"),
    (Optional("Independent"),
     Optional(_("Batter(y|ies)")),
     Optional(["Heavy", "Light"]), "Artillery")
]

def unit_note():
    """Unit notes.
    
    Notes that appear in parentheses after a unit name.
    """
    return parens([detachment, battalion, companies, mounted])

def ordinal_unit():
    return ordinal, Optional(unit_note)

def ordinal_unit_list():
    return (ordinal_unit_in_list(),
            ZeroOrMore(list_sep(), ordinal_unit_in_list()),
            UNIT_TYPE, Optional(unit_note))

for i in ("1st, 2nd, and 4th Infantry", 
          "22nd and 44th Cavalry",
          "6th and 12th Independent Batteries Light Artillery"
         ):
    print(ParserPython(ordinal_unit_list).parse(i))

1st | , | 2nd | , | and | 4th | Infantry
22nd | and | 44th | Cavalry
6th | and | 12th | Independent | Batteries | Light | Artillery


In [157]:
NAMED_UNIT = [
    ("Landgraeber's", "Battery", '"F"', Optional(punct), "2nd", "Light", "Artillery"),
    "Landgraeber's Battery Flying Artillery",
    "Landgraeber's Battery Flying Artillery ("F" 2nd Artillery)",    
    ("Cogswell's", "Independent", "Battery", "Light", "Artillery"),
    ("Latham's", "Co.", "Cavalry"),
    "Chicago Board of Trade Battery Light Artillery",
    "Fremont's Hussars",
    "Benton Hussars",
    "Bowen's Battalion Cavalry",
    "Jenks' and Smith's Cavalry Cos.",
    "Phelps' Regt. Infantry",
    "5th Co. Sharpshooters.", # Ohio
    "Wright's Battalion Cavalry"
]

def unit_name():
    """Single unit name
    
    - 1st Cavalry (Detachment)
    - 5th Cavalry
    - Battery "B", 1st Light Artillery
    
    """
    return [NAMED_UNIT,
            ordinal_unit,
            list_of_ordinal_units,
           ]


def army_units():
    """Lists of Units.
    
    - KENTUCKY--5th Cavalry
    - PENNSYLVANIA--7th Cavalry (Detachment); 78th and 79th Infantry
    
    """
    return ARMY_ORG, _('-+'), (OneOrMore(unit_name, Optional(punct)))


def units():
    return [navy_units(), army_units()]

def unit_list():
    return units(), ZeroOrMore(ptional(punct), units())
    


Miscellaneous stuff after the troops engaged:

In [158]:
def union_losses():
    return _("(Union loss|Loss).*")

def troops_engaged():
    return (Optional([army_units, navy_units]),
            Optional(punct), 
            Optional(other_text), 
            Optional(punct), 
            Optional(union_losses), 
            EOF)

In [None]:
from arpeggio import NoMatch
parser = ParserPython(troops_engaged)

for engagement in dyer["Arkansas"]:
    try:
        text = engagement['troops_engaged']
    except KeyError:
        continue
    try:
        parser.parse(text)
    except NoMatch as e:
        print(engagement['slug'], text)
        print(e)
        stop

    

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Arkansas/1862-02-16-action-pott-s-hill-sugar-creekERROR! Session/line number was not unique in  database. History logging moved to new sessionILLINOIS--3rd Cavalry. MISSOURI--Wright's Battalion Cavalry. Union loss, 1 killed, 3 wounded. Total, 4. 
470Expected '(' or punct or 'Landgraeber's' or 'Landgraeber's Battery Flying Artillery' or 'Landgraeber's Battery Flying Artillery ( 2nd Artillery)' or 'Cogswell's' or 'Latham's' or 'Chicago Board of Trade Battery Light Artillery' or 'Fremont's Hussars' or 'Benton Hussars' or 'Bowen's Battalion Cavalry' or 'Jenks' and Smith's Cavalry Cos.' or 'Phelps' Regt. Infantry' or '5th Co. Sharpshooters.' or 'Wright's Battalion Cavalry' or ordinal or ordinal or punct or 'By State Troops' or 'Gen. R. L. McCook and Escort. Gen. McCook killed.' or '(No Reports.)' or '(Confederate Reports.)' or 'Train Guard' or 'Naval Attack on Blockade Runner' or '(No Details.)' or 'Scouting party' or 'Foraging party' or 'By Confederate Forces' or 'To U. S. Forces' or 'Surr