In [1]:
from bs4 import BeautifulSoup
from bs4.element import Tag
from pathlib import Path
import json

## file and directory names

In [2]:
# All locations are relative to the notebook's working directory.
datadir = Path('../../data')
xmldir = datadir.joinpath('raw')    # where the bill's XML is read from
docdir = datadir.joinpath('docs')   # where save_doc writes its output files

# The bill to process, looked up inside xmldir.
xml_name = 'BILLS-116hjres31enr.xml'

## read the xml file 

The main body of the bill is in a tag called 'resolution-body'
and is divided into sections tagged 'division'.

In [3]:
# Parse the bill. The context manager closes the file even if parsing raises.
# NOTE(review): no parser is named, so BeautifulSoup chooses one from whatever
# is installed; consider pinning one once the expected tree shape is confirmed.
with open(xmldir / xml_name) as xml_file:
    tree = BeautifulSoup(xml_file)

In [4]:
# The bill text lives under a single 'resolution-body' tag.
resolution_body = tree.find('resolution-body')
if resolution_body is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("no 'resolution-body' tag found in " + xml_name)

# Every 'division' tag beneath the body (find_all replaces the deprecated findAll).
division = resolution_body.find_all('division')

In [5]:
## save the whole thing as a sanity check

In [7]:
# Flatten the whole body to one space-separated string and dump it next to
# the source XML; the context manager closes the file even if the write fails.
mega_text = resolution_body.get_text(' ', strip=True)
with open(xmldir / 'resolution-body.txt', 'w') as out_file:
    out_file.write(mega_text)

In [8]:
# Whitespace-delimited word count of the whole resolution body; a baseline
# to compare against the per-document total (n_words) at the end.
raw_words = len(mega_text.split())
raw_words

214279

## function to save a file

TODO: replace this with a write to MongoDB eventually.

In [9]:
n_docs = 0   # documents written so far; also the index used in the next file name
n_words = 0  # running whitespace-delimited word count over all saved bodies


def save_doc(division, title, major, inter, small, body):
    """Write one sub-document to ``docdir`` as a heading/body pair of files.

    The files are named ``doc<N>.heading`` (the five heading values as a JSON
    object) and ``doc<N>.body`` (the raw text), where ``N`` is the current
    value of the module-level counter ``n_docs``.

    Parameters
    ----------
    division, title, major, inter, small
        Heading values, stored under the JSON keys of the same names.
    body : str
        Text of the document.

    Raises
    ------
    Exception
        If either target file already exists; nothing is written in that case.

    Side effects: increments ``n_docs`` by one and adds the number of
    whitespace-delimited words in ``body`` to ``n_words``.
    """
    global n_docs, n_words

    stem = 'doc' + str(n_docs)
    headingfile = docdir / (stem + '.heading')
    contentsfile = docdir / (stem + '.body')

    # Check both targets before writing anything, so a clash never leaves a
    # heading file behind without its body.
    for target in (headingfile, contentsfile):
        if target.exists():
            raise Exception('file already exists', target)

    headings = dict(division=division, title=title, major=major, inter=inter, small=small)

    # Context managers guarantee the handles are closed even if a write fails.
    with open(headingfile, 'w') as heading_out:
        json.dump(headings, heading_out)

    with open(contentsfile, 'w') as body_out:
        body_out.write(body)

    n_docs += 1
    n_words += len(body.split())
    

## Extracting the 'meat' of a node

TODO: Do I want to replace parsable-cite tags? 

TODO: Do I want to strip all the enums?

TODO: Do I want to do regex on dollar amounts here or later?

TODO: Do this twice: once on the pre-processed text, saved for reporting to the user, and once on the post-processed text
used for NLP.

In [10]:
def extract_header_text(node):
    """Split a node into its heading and the remainder of its text.

    The node's full text is collected with single-space separators. If the
    node contains a 'header' tag, that tag's text is returned as the heading
    and is removed from the front of the full text when the full text starts
    with it. Otherwise the heading is '' and, for 'section' nodes only, the
    text of the first 'enum' tag is removed from the front in the same way.

    Returns a ``(header, text)`` tuple of strings.
    """
    def drop_prefix(full, prefix):
        # Strip `prefix` only when `full` actually begins with it.
        return full[len(prefix):] if full.startswith(prefix) else full

    text = node.get_text(' ', strip=True)

    header_tag = node.find('header')
    if header_tag:
        header = header_tag.get_text(strip=True)
        return header, drop_prefix(text, header)

    # No header: only sections get their leading enumerator removed.
    if node.name == 'section':
        enum_tag = node.find('enum')
        if enum_tag:
            text = drop_prefix(text, enum_tag.get_text(strip=True))

    return '', text
    

## Main walk the xml tree

The goal is to put each chunk under a separate heading as its own
subdocument. There are a lot of twists and turns because the use
of xml tags in different titles is not consistent.

In [11]:
def read_title(division_name, t):
    """Walk the direct children of one title node and save each chunk via save_doc.

    The title's name is read from the first 'header' tag found under ``t``.
    Children are visited in order and dispatched on their tag name:

    * 'appropriations-major' and 'appropriations-intermediate' set the current
      major / intermediate heading, clear the finer-grained headings, and save
      any text left over after the header as a document of its own.
    * 'appropriations-small' starts a document under a small heading; the
      following siblings are merged into it until the body looks complete.
    * 'section' is saved as a document under whatever headings are current.

    Children with any other name are ignored by the outer loop.

    Parameters
    ----------
    division_name : str
        Passed through to save_doc as the division heading.
    t
        The title node; it must provide ``find()`` and ``.children``
        (presumably a bs4 Tag -- confirm against the caller).

    Returns
    -------
    None. The results are the files written by save_doc.
    """
    
    # NOTE(review): .string is presumably None when the header holds more than
    # plain text, which would make .strip() raise here; the division loop uses
    # .text instead -- confirm title headers are always plain text.
    title_name = t.find('header').string.strip()
    # print("TITLE", title_name)
    
    iterator = t.children
    
    kid = next(iterator, None)
    # next_kid holds a sibling that the inner loop fetched but did not
    # consume; when it is set, the outer loop processes it next.
    next_kid = None
    major_name = '' 
    inter_name = '' 
    small_name = ''
    body = ''
        
    while kid != None:
        if kid.name == 'appropriations-major':
            # A new major heading resets both lower heading levels.
            major_name, text = extract_header_text(kid)
            inter_name = ''
            small_name = ''
            if text != '':
                # -- introductory material to the major section???
                body = text
                save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                body = ''
            
        elif kid.name == 'appropriations-intermediate':
            # A new intermediate heading resets the small heading only.
            inter_name, text = extract_header_text(kid)
            small_name = ''
            
            if len(text):
                # -- must be a document??? 
                body = text
                save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                body = ''

        elif kid.name == 'appropriations-small':
            small_name, body = extract_header_text(kid)
            header = kid.find('header')
            if header == None and body != '':
                # -- we REALLY wanted a header here... 
                # just shove this in a document and hope for the best!
                save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                body = ''
            else:
                # -- make sure body is complete
                # Keep pulling siblings from the shared iterator and merging
                # them into this document until one of the branches below
                # decides the document is finished.
                # NOTE(review): when the body is still incomplete and the
                # sibling is neither 'appropriations-small' nor 'section', no
                # branch fires and the next iteration overwrites next_kid, so
                # that sibling is dropped -- confirm this is intended.
                done = False
                while not done:
                    next_kid = next(iterator, None)
                    # if there's no more to parse, we're done
                    if next_kid == None: 
                        done = True
                        if body != '': 
                            save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                            body = ''
                        small_name=''
                        continue
                        
                    # body could be incomplete even if it ends with a period,
                    # but these for sure we know we need to keep reading....
                    incomplete_body = ((body == '') or (body[-1] != '.'))
                        
                    if next_kid.name == 'appropriations-small': 
                        subheader, sub_body = extract_header_text(next_kid)
                        
                        if len(sub_body) and (not len(subheader)):
                            # we're still under the same heading, keep going
                            body += ' ' + sub_body
                            next_kid = None  # consumed
                        elif len(subheader) and incomplete_body:
                            # we have subheadings with no text below
                            small_name += ' ' + subheader
                            if len(sub_body):
                                body += ' ' + sub_body
                            next_kid = None  # consumed
                        else:
                            # The sibling starts a new document: save this one
                            # and leave next_kid set for the outer loop.
                            done = True
                            save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                            body = ''
                            small_name=''
                                
                    elif (next_kid.name == 'section') and incomplete_body:
                        # A section following an unfinished body is folded
                        # into it; its header value is discarded.
                        _, more_text = extract_header_text(next_kid)
                        body += ' ' + more_text
                        next_kid = None  # consumed
                        
                    elif not incomplete_body:
                        # Body is non-empty and ends with a period: save it and
                        # leave next_kid set for the outer loop.
                        done = True
                        save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                        # Always holds on this branch, given how
                        # incomplete_body is computed above.
                        assert(body[-1] == '.')
                        body = '' 
                        small_name = ''
                    
        elif kid.name == 'section':
            # A section reached directly is saved as its own document.
            _, body = extract_header_text(kid)
            save_doc(division_name, title_name, major_name, inter_name, small_name, body)
            body = ''
                
        if next_kid != None:
            kid = next_kid # we peeked but didn't consume next_kid
            next_kid = None
        else:
            kid = next(iterator, None)
                
  

In [12]:
# Walk every title of every division; read_title writes the documents.
for d in division:
    division_name = d.find('header').text.strip()
    # print("DIVISION", division_name)

    # find_all replaces the deprecated findAll alias.
    for t in d.find_all('title'):
        read_title(division_name, t)

In [13]:
# Total number of documents written by save_doc.
n_docs

1111

In [14]:
# This will not equal the whole-document word count near the top of the
# notebook: headers are not counted, and I tried to clean out some
# non-word tokens such as section enumerators.
n_words

209133