# Scrape the XMLs

In [1]:
# Load modules
import itertools
import os
import time #pause loop

import pandas as pd
import requests #make request to server
from lxml import etree #ElementTree and lxml allow us to parse the XML file.

try:
    import xml.etree.cElementTree as ET #XML Parser
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree exposes the same API.
    import xml.etree.ElementTree as ET

In [2]:
# Request the ids of all trials between 1754 and 1756
search = requests.get('https://www.oldbaileyonline.org/obapi/ob?term0=fromdate_17540116&term1=todate_17561208&&start=0', timeout=30)
# Stop here on an HTTP error instead of trying to decode an error page as JSON
search.raise_for_status()
trials = search.json()

# Make sure the output folder exists so the cell also runs on a fresh checkout
os.makedirs('data', exist_ok=True)

# Download the XML of the first 10 trials
for trial in trials['hits'][:10]:

    # build url (https, like the search request above)
    url = 'https://www.oldbaileyonline.org/obapi/text?div={}'.format(trial)

    # get the response; the timeout keeps a stalled connection from hanging the
    # cell, and the status check keeps error pages from being saved as XML
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    res = response.text

    # create a file name (the double dash is what the loading cell looks for)
    fname = 'data/old-bailey-' + '-' + trial + '.xml'

    # save the file with an explicit encoding rather than the platform default
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(res)

    # pause for a second so we don't overload their servers
    time.sleep(1)

# Function for Building Dataframe

In [3]:
def table_of_cases(xml_file_name):
    """Build a one-row DataFrame describing the trial stored in an XML file.

    Every ``interp`` element contributes one column named after its ``type``
    attribute and holding its ``value`` attribute. When a type occurs more
    than once, the repeats go to ``<type>1``, ``<type>2``, ``<type>3`` ... in
    document order, so no occurrence overwrites an earlier one. The cleaned
    text of all ``p`` elements is stored in the ``text`` column.

    Parameters
    ----------
    xml_file_name : str
        Path of the XML file to parse.

    Returns
    -------
    pandas.DataFrame
        Exactly one row. Columns appear in order of first occurrence in the
        document; ``text`` is added after the interp columns.

    Raises
    ------
    KeyError
        If an ``interp`` element has no ``type`` or ``value`` attribute.
    """
    tree = ET.parse(xml_file_name)
    record = {}
    paragraphs = []
    # iter() replaces getiterator(), which was removed in Python 3.9
    for element in tree.iter():
        if element.tag == "interp":
            kind = element.attrib['type']
            # Find the first free column name: kind, kind1, kind2, ...
            column = kind
            suffix = 0
            while column in record:
                suffix += 1
                column = kind + str(suffix)
            record[column] = element.attrib['value']
        elif element.tag == 'p':
            # itertext() also collects the text of nested elements
            raw = ''.join(element.itertext())
            # Turn line breaks into spaces (so words on adjacent lines are not
            # fused) and strip out leading numbers
            raw = raw.replace('\n', ' ').lstrip('0123456789.- ')
            # Collapse runs of whitespace between words
            cleaned = ' '.join(raw.split())
            if cleaned:
                paragraphs.append(cleaned)
    # Separate paragraphs with one space so their boundary words stay apart
    record['text'] = ' '.join(paragraphs)
    # Building from a single record always yields one row, so the text is kept
    # even when the file contains no interp elements
    return pd.DataFrame([record])

# Loop Through XMLs and Build Dataframe

In [4]:
# Parse each downloaded file, then combine the one-row frames in a single step.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop copies every earlier row again on each pass.
case_frames = []
for trial_id in trials['hits'][:10]:
    xml_path = 'data/old-bailey--'+ trial_id +'.xml'
    case_frames.append(table_of_cases(xml_path))

# pd.concat raises on an empty list, so fall back to an empty frame
table = pd.concat(case_frames, ignore_index=True) if case_frames else pd.DataFrame()
table

Unnamed: 0,collection,date,gender,gender1,gender2,given,given1,given2,offenceCategory,offenceCategory1,...,surname1,surname2,text,type,uri,verdictCategory,verdictCategory1,verdictSubcategory,verdictSubcategory1,year
0,BAILEY,17540116,female,male,,Hannah,Richard,,theft,,...,Beach,,"Hannah Ash , spinster , was indicted for steal...",,sessionsPapers/17540116,guilty,,pleadedGuilty,,1754
1,BAILEY,17540116,male,female,male,Peter,Mary,Joseph,theft,,...,Foreman,Sheers,(M.) Peter Foreman and Mary his wife were indi...,,sessionsPapers/17540116,guilty,,,,1754
2,BAILEY,17540116,female,male,female,Sarah,Joseph,Ann,theft,,...,Smithson,Smithson,"(M.) Sarah Williams , spinster , was indicted ...",crimeLocation,sessionsPapers/17540116,guilty,,,,1754
3,BAILEY,17540116,male,female,male,Joseph,Elizabeth,Thomas,theft,,...,Kempster,Stevens,"(M.) Elizabeth wife of Joseph Kempster , was i...",crimeLocation,sessionsPapers/17540116,guilty,,,,1754
4,BAILEY,17540116,male,male,,John,Thomas,,theft,,...,Fazakerley,,(M.) John Allen was indicted for stealing one ...,,sessionsPapers/17540116,notGuilty,,,,1754
5,BAILEY,17540116,male,male,,William,Thomas,,theft,,...,Wetworth,,(M.) William Derter was indicted for stealing ...,,sessionsPapers/17540116,notGuilty,,,,1754
6,BAILEY,17540116,male,male,male,William,Nicholas,Launcelot,theft,,...,Healing,Hide,(M.) William Ford was indicted for stealing on...,crimeLocation,sessionsPapers/17540116,guilty,,,,1754
7,BAILEY,17540116,female,male,,Anne,John,,theft,,...,Jervas,,"(L.) Anne Beezley , spinster , was indicted fo...",,sessionsPapers/17540116,guilty,,,,1754
8,BAILEY,17540116,male,male,male,Robert,John,Abraham,theft,deception,...,Thorp,Julian,"Robert Barber was indicted for that he, togeth...",,sessionsPapers/17540116,guilty,notGuilty,pleadedGuilty,noEvidence,1754
9,BAILEY,17540116,female,female,female,Elizabeth,Catherine,Catharine,theft,,...,Davis,Davis,", 90. (M.) Elizabeth Eaton and Catherine Davis...",crimeLocation,sessionsPapers/17540116,guilty,,lesserOffence,,1754
