In [1]:
FILENAME = "Perseus_text_2001.05.0068.xml"

In [3]:
import lxml.etree as ET
tree = ET.parse(FILENAME)

In [4]:
root = tree.getroot()

In [16]:
# List of regiments in the Union Armies, with total number of deaths in each.
table = root.xpath("//*[@id='c.13']/p[1]/table")[0]

In [240]:
import itertools


import re

def get_text(x):
    y = ''.join(x.xpath("./*[not(self::note)]//text()"))
    y = clean_ws(y)
    y = re.sub("）", ")", y)
    return y

UNIT_TYPES = {
 'Artillery.',
 'Cavalry.',
 'Engineers.',
 'Heavy Artillery.',
 'Independent Batteries.',
 'Infantry.',
 'Light Artillery.',
 'Light Batteries.',
 'Marine Artillery.',
 'Mounted Infantry.',
 'Mounted Rifles.',
 'National Guard.',
 'Sharpshooters.',
 'Three-Months Service（‘61).',
 "Hundred-Days Men (1864)",
 "Three-Months' Service (‘61)",
 "--Penn. S. M.",
 "1st Rhode Island--",
 "Penn. Light Artillery--",
 "1st Illinois Light Artillery",
 "2d Illinois Light Artillery",
 "1st Michigan--"
}

def get_num(x, i):
    num = x.xpath(f"cell[{i}]/num/@value")
    if len(num):
        return int(num[0])
    else:
        return None

fieldnames = ("killed_died_wounds_enlisted", 
             "killed_died_wounds_officers", 
             "killed_died_wounds_total", 
             "died_other_enlisted",             
             "died_other_officers",             
             "died_other_total",            
             "total_deaths")

def get_organized(row):
    ret = get_text(row.xpath("cell[1]")[0])
    ret = re.sub("Ap'l", "April", ret)
    months = ("-+", "Jan", "Feb", "Mar", "April", "May", "June", 
              "July", "Aug", "Sept", "Oct", "Nov", "Dec")
    pattern = "({})\.?,?\s*['‘]?(6[1-5])".format('|'.join(months))
    m = re.match(pattern, ret)
    if m:
        mon = m.group(1)
        year = int(m.group(2))
        if re.match("-+", mon):
            mon = None
        else:
            mon = months.index(mon)
        return (mon, year)
    else:
        # print(ret)
        return (None, None)
                   
def get_unit_name(row):
    return get_text(row.xpath("cell[2]")[0])
        
alldata = []
for row in table.xpath("row[@role='data']")[2:]:
    unit_types = set()
    if (get_text(row.xpath("cell[1]")[0]) == ""
            and get_text(row.xpath("cell[3]")[0]) == ""
            and get_text(row.xpath("cell[7]")[0]) == ""
            and get_text(row.xpath("cell[8]")[0]) == ""):
        col2 = get_text(row.xpath('cell[2]')[0])
        if (col2 in UNIT_TYPES):
            unit_type = re.sub(r"\.$", "", col2)
        else:
            # print(ET.tostring(row))
            print(clean_ws(get_text(row.xpath("cell[2]")[0])))
            #print([get_text(x) for x in row])
    else:
        try:
            id_ = row.xpath("cell[2]/orgName/@key")[0]
        except IndexError:
            id_ = None
        unit_name = get_unit_name(row)
        organized = get_organized(row)
        data = {'key': id_, 
                'unit_name': unit_name,
                'organized_month': organized[0],
                'organized_year': organized[1],
                'division': re.sub(r"\.$", "", get_text(row.xpath("cell[10]")[0])),
                'corps': re.sub(r"\.$", "", get_text(row.xpath("cell[11]")[0])),
                'unit_type': unit_type
               }
        for key, i in zip(fieldnames, range(3, 10)):
            data[key] = get_num(row, i)
        notes = [get_text(x) for x in row.xpath(".//note")]
        data['notes'] = ';'.join(notes)
        alldata.append(data)


Cavalry
Heavy Artillery
Light Batteries
Infantry
Cavalry
Heavy Artillery
Light Batteries
Infantry
Cavalry
Heavy Artillery
Light Batteries
Infantry
Cavalry
Heavy Artillery
Light Batteries
Sharpshooters
Infantry
Three-Months Service61
Hundred-Days Men1864
Cavalry
Heavy Artillery
Light Artillery
1st Rhode Island
Infantry
Cavalry
Heavy Artillery
Light Batteries
Infantry
Cavalry
Mounted Rifles
Heavy Artillery
Marine Artillery
Light Artillery
Independent Batteries

Sharpshooters
Infantry
National Guard
Cavalry
Light Batteries
Infantry
Three-Months' Service‘61
Cavalry
Heavy Artillery
Light Artillery
Independent Batteries
Penn. Light Artillery
Infantry
Cavalry
Heavy Artillery
Light Batteries
Infantry
Cavalry
Light Batteries
Infantry
Cavalry
Light Artillery
Infantry
Cavalry
Heavy Artillery
Light Artillery
Light Batteries
Sharpshooters
Infantry
Cavalry
Light Batteries
Infantry
Cavalry
Heavy Artillery
Light Batteries
Infantry
Cavalry
Light Artillery
Independent Batteries
Infantry
Cavalry
Light Ar

In [241]:
import csv
fieldnames = (
    'key', 'unit_type', 'unit_name', 
    'organized_year',
    'organized_month',
    'killed_died_wounds_enlisted',
    'killed_died_wounds_officers',
    'killed_died_wounds_total',
    'died_other_enlisted',
    'died_other_officers',
    'died_other_total',
    'total_deaths',
    'division',
    'corps',
    'notes'
)
with open("list_union_regiments_total_deaths.csv", "w") as fp:
    writer = csv.DictWriter(fp, fieldnames)
    writer.writeheader()
    writer.writerows(alldata)