In [4]:
import os
import re
import pandas as pd

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

In [5]:
# Corpus volume to analyse. Other curated files that can be swapped in:
#   19004049201_PageText.xml
#   19004543800_PageText.xml
#   19006691600_PageText.xml
#   19007263500_PageText.xml
corpus_dir = '../curated_corpus/'
page_text_file = '19004049202_PageText.xml'
filepath = os.path.join(corpus_dir, page_text_file)

# ET.parse returns an ElementTree wrapper (not the root Element itself); it is
# bound to `root` because the cells below iterate over it via root.iter(...).
root = ET.parse(filepath)

In [46]:
# Try a few pages first: collect the OCR text of every <page> whose `id`
# attribute is one of the ids listed below.
selected_pages = ['00410', '00420', '00430', '00440', '00450', '00460', '00470', '00480', '00490']
wanted_page_ids = set(selected_pages)  # set membership instead of scanning the list per page

selected_text = []
for elem in root.iter('page'):
    # .get() returns None (never matches) for a <page> without an `id`
    # instead of raising KeyError like elem.attrib['id'] would.
    if elem.get('id') in wanted_page_ids:
        # findtext() yields '' when <ocrText> is missing or empty, so every
        # entry is a str and later regex/tokenizer calls never receive None.
        selected_text.append(elem.findtext('ocrText', default=''))

# Example of a text
example_index = 4
if example_index < len(selected_text):
    print(selected_text[example_index])
else:
    # Fewer pages matched than expected; report it rather than raising IndexError.
    print('Only {} selected page(s) found; no example at index {}.'.format(
        len(selected_text), example_index))


			Intelligibility. Particularity. Time.
			CHAPTER XXXV. HOW THE FACr S SHOULD BE PLEADED. ABTICLE 1. Intelligibilityand particularity.
			2. Conciseness. 3. Positiveness. ART. 1. Intelligibility and particularity. Every pleading must be sufficiently definite and cer- tain in its statements to make the precise nature of the charge or defense apparent to the court and to the adverse party. (Code, § 160.) Intelligibilitly.]-Pleadings must therefore be intelligi- ble, as much under the new system as under the former. (Boye v. Brown, sp. t., 3 How. 391; asf'd, 7 Barb. 80.) They are expressly required by the Code to be couched in plain and ordinary language. (Code, Q§ 142, 149, 153.) Particularity.]-Pleadings must, in order to be "cer- tain," give sufficient particulars of the transactions stated to enable the adverse parties to identify the circum- stances. For this purpose, several descriptive allegations are required, which, though they cannot be made the subject of an issue, and are n

In [42]:
# Try Regex: treat every parenthesised span as a candidate citation.
# The pattern is a raw string so that `\(` and `\)` reach the regex engine
# without relying on Python's invalid-escape fallback (a SyntaxWarning on
# recent Python versions). It is compiled once and reused for every page.
# Note: `.` does not match newlines, so a parenthetical that is split across
# lines in the OCR text is not captured.
parenthetical = re.compile(r"\((.*?)\)")

# One list of matches per page; `text or ''` keeps a page with no OCR text
# (None) from raising TypeError and simply yields an empty match list.
citations = [parenthetical.findall(text or '') for text in selected_text]

# Change to DataFrame: one row per page, text alongside its extracted citations
df = pd.DataFrame({'text': selected_text, 'citations': citations})

# Show each page followed by what was extracted from it, separated by a blank line
for page_text, page_citations in zip(df['text'], df['citations']):
    print(page_text, page_citations)
    print()


			No other facts to be pleaded. Evidence not to be pleaded. strictness unnecessary. (See Lanning v. Carpenter, 20 N. y. 447, 458; M'Kyring v. Bull, 16 N. Y. 297.) None but material and issuable facts may be pleaded. (Man!l v. Morcwood, 5 Sands. 557; Rensselaer & Wlash. p. S. Co. v. Wetsel, sp. t., 6 How. 68; Williams v. HIayes, sp.t., 5 How. 470.) And though a pleading may state any facts bearing upon the final judgment in the action (Howard v. Tiffany, 3 Sands. 695), it need not (Corwin v. Freland, 6 N. Y. [2 Seld.] 560; rev'g S. C. 6 How. 241; Clleney v. Garbutt, sp. t., 5 H-ow. 467) and must not (Lee v. Elias, 3 Sands. 737; Code Rep. N. S. 116; Sellar v. Sage, sp. t., 13 How. 231; Field v. Morse, sp. t., 8 How. 47; Pitnlam? v. Piitnam, sp. t., 2 Code Rep. 64) contain any allegations which affect only the right of a party to a provisional remedy. Evidence not to be p)leatlded.]-Tle material facts only, and not the circumstances which tend to prove those facts, are to be pleaded. Ev

In [45]:
# Try NLTK Named Entity Recognition
import nltk

# Run these to install
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

def preprocess(sent):
    """Tokenize a raw string and part-of-speech tag the resulting tokens.

    Args:
        sent: Text to process; passed unchanged to ``nltk.word_tokenize``.

    Returns:
        The result of ``nltk.pos_tag`` applied to the token list.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

# Chunk grammar: an NP is an optional determiner, any number of adjectives,
# then a single <NN> token.
# TODO: Change this to ( )
# TODO: Find out what is a chunk pattern
pattern = "NP: {<DT>?<JJ>*<NN>}"

# Create Chunk Parser
cp = nltk.RegexpParser(pattern)

# Tokenize + POS-tag the example page (row label 4 of df), then chunk it
example_text = df['text'][4]
sent = preprocess(example_text)
cs = cp.parse(sent)

# Flatten the chunk tree into (token, POS tag, IOB chunk tag) triples
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

# Unfinished experiment (ne_chunk and `ex` are not defined in the cells shown here):
# ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))
# print(ne_tree)

[('Intelligibility', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('Particularity', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('Time', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('CHAPTER', 'NN', 'B-NP'),
 ('XXXV', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('HOW', 'NNP', 'O'),
 ('THE', 'NNP', 'O'),
 ('FACr', 'NNP', 'O'),
 ('S', 'NNP', 'O'),
 ('SHOULD', 'NNP', 'O'),
 ('BE', 'NNP', 'O'),
 ('PLEADED', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('ABTICLE', 'NNP', 'O'),
 ('1', 'CD', 'O'),
 ('.', '.', 'O'),
 ('Intelligibilityand', 'NNP', 'O'),
 ('particularity', 'NN', 'B-NP'),
 ('.', '.', 'O'),
 ('2', 'CD', 'O'),
 ('.', '.', 'O'),
 ('Conciseness', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('3', 'CD', 'O'),
 ('.', '.', 'O'),
 ('Positiveness', 'NN', 'B-NP'),
 ('.', '.', 'O'),
 ('ART', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('1', 'CD', 'O'),
 ('.', '.', 'O'),
 ('Intelligibility', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('particularity', 'NN', 'B-NP'),
 ('.', '.', 'O'),
 ('Every', 'DT', 'B-NP'),
 ('pleading', 'NN', 'I-NP'),
 ('must', 'MD', 'O'),
 ('be', 'VB', 'O'),
 ('

In [43]:
# Try spaCy NER
# https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

In [None]:
# Try NLTK POS Tagging

In [None]:
# Try LSTM

# Try Character Embeddings