In [2]:
from lxml import etree
# Read the TEI file as raw bytes. The context manager guarantees the file
# handle is closed even if read() raises.
with open('source_data_hamlet-tei.xml', 'rb') as infile:
    xml = infile.read()

In [3]:
# Prefix-to-URI map handed to every XPath query below.
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Parse the raw bytes read from the file into an element tree.
tree = etree.fromstring(xml)

# Text of every tei:title element anywhere in the document.
titles = tree.xpath('//tei:title/text()', namespaces=ns)
print("len of titles:", len(titles))


len of titles: 7


In [4]:
# Text of the title elements that are direct children of a titleStmt
# element (3 results).
results2 = tree.xpath(
    '//tei:titleStmt/tei:title/text()',
    namespaces=ns,
)
print("Here is the text of results2:", results2, sep="\n")
print("len of results2:", len(results2))


Here is the text of results2:
['The Tragedie of Hamlet, Prince of Denmark from Mr. William\n                    Shakespeares comedies, histories, & tragedies. Published according to the\n                    true originall copies.', 'Mr. VVilliam Shakespeares comedies, histories, &\n                    tragedies', 'Bodleian First Folio, Arch. G c.7']
len of results2: 3


In [5]:
# Select the type attribute of every div element (26 results), then pass
# the resulting list to a Counter to tally how often each value occurs.
from collections import Counter

results3 = tree.xpath('//tei:div/@type', namespaces=ns)
print("len of results3:", len(results3))

type_counts = Counter(results3)
print(type_counts)



len of results3: 26
Counter({'scene': 20, 'act': 5, 'play': 1})


In [6]:
# Select every div element whose type attribute equals "scene".
scene_query = '//tei:div[@type="scene"]'
allscenedivs = tree.xpath(scene_query, namespaces=ns)

print("len of allscenedivs:", len(allscenedivs))
print(allscenedivs)


len of allscenedivs: 20
[<Element {http://www.tei-c.org/ns/1.0}div at 0x11285a0f0>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a140>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a190>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a1e0>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a230>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a2d0>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a320>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a3c0>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a410>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a460>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a4b0>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a550>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a5a0>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a5f0>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a640>, <Element {http://www.tei-c.org/ns/1.0}div at 0x11285a690>, <Element {http://www.tei-c.org/

In [7]:
# For each scene div, run two relative XPath queries:
#   - one that extracts the type attribute
#   - one that extracts the n attribute
# xpath() hands back a list, so the [0] item of each is taken and the pair
# is printed as "<type> : <n>".
for scene_div in allscenedivs:
    type_hits = scene_div.xpath('@type', namespaces=ns)
    n_hits = scene_div.xpath('@n', namespaces=ns)
    print(type_hits[0], n_hits[0], sep=" : ")


scene : 1
scene : 2
scene : 3
scene : 4
scene : 5
scene : 1
scene : 2
scene : 1
scene : 2
scene : 3
scene : 4
scene : 1
scene : 2
scene : 3
scene : 4
scene : 5
scene : 6
scene : 7
scene : 1
scene : 2


In [8]:
# Work only with the scene divs selected earlier.
# Note: exploration of the file showed that scene divs sit inside act divs.
#
# Goal: print each scene's info together with the act number on one line.
#
# Inside the loop:
#   - '../@type' reads the parent element's type attribute
#   - '../@n' reads the parent element's n attribute
#   - both are placed in front of the scene's own type and n values
for scene_div in allscenedivs:
    scene_type = scene_div.xpath('@type', namespaces=ns)
    act_type = scene_div.xpath('../@type', namespaces=ns)
    scene_n = scene_div.xpath('@n', namespaces=ns)
    act_n = scene_div.xpath('../@n', namespaces=ns)
    print(f"{act_type[0]} : {act_n[0]} , {scene_type[0]} : {scene_n[0]}")


act : 1 , scene : 1
act : 1 , scene : 2
act : 1 , scene : 3
act : 1 , scene : 4
act : 1 , scene : 5
act : 2 , scene : 1
act : 2 , scene : 2
act : 3 , scene : 1
act : 3 , scene : 2
act : 3 , scene : 3
act : 3 , scene : 4
act : 4 , scene : 1
act : 4 , scene : 2
act : 4 , scene : 3
act : 4 , scene : 4
act : 4 , scene : 5
act : 4 , scene : 6
act : 4 , scene : 7
act : 5 , scene : 1
act : 5 , scene : 2


In [9]:
# The person elements sit inside the listPerson element, a little way down
# from the top of the file. Each one holds several personal names (persName
# elements carrying a type attribute), several other kinds of personal
# information, and the person's social status.
#
# This query selects the person elements themselves; nothing is extracted
# from them yet.
person_query = '//tei:person'
pers_elements = tree.xpath(person_query, namespaces=ns)


In [10]:
# Helper for looping over element results: it returns the single value from an XPath result list, or a placeholder when the value is missing.

def checkFor1Result(xpathresult, missing_value):
    """Return the single item of a result list, or a fallback when it is empty.

    Args:
        xpathresult: Sized, indexable collection of query results.
        missing_value: Value returned when ``xpathresult`` has no items.

    Returns:
        ``xpathresult[0]`` when there is exactly one item, otherwise
        ``missing_value``.

    Raises:
        ValueError: If ``xpathresult`` contains more than one item.
    """
    count = len(xpathresult)
    if count == 0:
        return missing_value
    if count == 1:
        # Exactly one hit: unwrap it from the list.
        return xpathresult[0]
    raise ValueError("Your list had " + str(count) + " items instead of 1. Shutting down the program.." + "but first here's your failed result" + str(xpathresult))

In [11]:
# Loop over the person elements, and for each element:
#   - query the xml:id attribute of the person, e.g. "F-ham-mar"
#   - query the text of the socecStatus child element
#   - query the text of the occupation child element
#   - run each query result through checkFor1Result, supplying a placeholder
#     for missing values
#   - print the results on one line, in this order: id, status, occupation
for person in pers_elements:
    # Named person_id rather than id so the built-in id() is not shadowed
    # for the rest of the kernel session.
    person_id = person.xpath("@xml:id", namespaces=ns)
    soc = person.xpath("tei:socecStatus/text()", namespaces=ns)
    occ = person.xpath("tei:occupation/text()", namespaces=ns)
    results = [
        checkFor1Result(person_id, "Missing id"),
        checkFor1Result(soc, "Missing status"),
        checkFor1Result(occ, "Missing occupation"),
    ]
    print(results)


['F-ham-pla.1', 'worker', 'Missing occupation']
['F-ham-all', 'Missing status', 'Missing occupation']
['F-ham-amb', 'noble', 'Missing occupation']
['F-ham-plp', 'worker', 'Missing occupation']
['F-ham-plq', 'worker', 'Missing occupation']
['F-ham-ber', 'worker', 'military']
['F-ham-cap', 'worker', 'military']
['F-ham-clo.1', 'worker', 'Missing occupation']
['F-ham-for', 'noble', 'military']
['F-ham-fra', 'worker', 'military']
['F-ham-gen', 'noble', 'Missing occupation']
['F-ham-gmn', 'noble', 'Missing occupation']
['F-ham-gho', 'noble', 'military']
['F-ham-gui', 'noble', 'Missing occupation']
['F-ham-ham', 'noble', 'Missing occupation']
['F-ham-hor', 'noble', 'Missing occupation']
['F-ham-cla', 'noble', 'Missing occupation']
['F-ham-lae', 'Missing status', 'Missing occupation']
['F-ham-luc', 'noble', 'Missing occupation']
['F-ham-mar', 'worker', 'military']
['F-ham-mes', 'worker', 'Missing occupation']
['F-ham-oph', 'noble', 'Missing occupation']
['F-ham-osr', 'noble', 'Missing occupat

In [12]:
# Same loop as the previous cell, modified to:
#   - collect each person's values into a list inside the loop
#   - append every such list to an accumulator (all_results)
all_results = []

for person in pers_elements:
    # Named person_id rather than id so the built-in id() is not shadowed
    # for the rest of the kernel session.
    person_id = person.xpath("@xml:id", namespaces=ns)
    soc = person.xpath("tei:socecStatus/text()", namespaces=ns)
    occ = person.xpath("tei:occupation/text()", namespaces=ns)
    results = [
        checkFor1Result(person_id, "Missing id"),
        checkFor1Result(soc, "Missing status"),
        checkFor1Result(occ, "Missing occupation"),
    ]
    all_results.append(results)

print(all_results)
print(len(all_results))

[['F-ham-pla.1', 'worker', 'Missing occupation'], ['F-ham-all', 'Missing status', 'Missing occupation'], ['F-ham-amb', 'noble', 'Missing occupation'], ['F-ham-plp', 'worker', 'Missing occupation'], ['F-ham-plq', 'worker', 'Missing occupation'], ['F-ham-ber', 'worker', 'military'], ['F-ham-cap', 'worker', 'military'], ['F-ham-clo.1', 'worker', 'Missing occupation'], ['F-ham-for', 'noble', 'military'], ['F-ham-fra', 'worker', 'military'], ['F-ham-gen', 'noble', 'Missing occupation'], ['F-ham-gmn', 'noble', 'Missing occupation'], ['F-ham-gho', 'noble', 'military'], ['F-ham-gui', 'noble', 'Missing occupation'], ['F-ham-ham', 'noble', 'Missing occupation'], ['F-ham-hor', 'noble', 'Missing occupation'], ['F-ham-cla', 'noble', 'Missing occupation'], ['F-ham-lae', 'Missing status', 'Missing occupation'], ['F-ham-luc', 'noble', 'Missing occupation'], ['F-ham-mar', 'worker', 'military'], ['F-ham-mes', 'worker', 'Missing occupation'], ['F-ham-oph', 'noble', 'Missing occupation'], ['F-ham-osr', 'n