In [1]:
!rm ../../data/docs/*

In [2]:
from bs4 import BeautifulSoup
from bs4.element import Tag
from pathlib import Path
import json
import pdb

## file and directory names

In [3]:
datadir = Path('../../data')
xmldir = datadir / 'raw'
docdir = datadir / 'docs'

xml_name = 'BILLS-116hjres31enr.xml'

## read the xml file 

The main body of the bill is in a tag called 'resolution-body',
and is segregated into sections tagged 'division'

In [4]:
file = open(xmldir / xml_name)
tree = BeautifulSoup(file)
file.close()

In [5]:
resolution_body = tree.find('resolution-body')
division = resolution_body.findAll('division')

In [6]:
## save the whole thing as a sanity check

In [7]:
mega_text = "\n".join([d.get_text('\n', strip=True) for d in division])

file = open(xmldir / 'resolution-body.txt', 'w')
file.write(mega_text)
file.close()

In [8]:
raw_words = len(mega_text.split())
raw_words

213696

## function to save a file

In [9]:
n_docs = 0 
n_words = 0 

def save_doc(division, title, major, inter, small, body):
    global n_docs, n_words
    
    headingfile = 'doc' + str(n_docs) + '.heading'
    headingfile = docdir / headingfile
    if headingfile.exists():
        raise Exception('file already exists', headingfile)
    
    
    contentsfile= 'doc' + str(n_docs) + '.body'
    contentsfile = docdir / contentsfile
    if contentsfile.exists():
        raise Exception('file already exists', contentsfile)
    
    headings = dict(division=division, title=title, major=major, inter=inter, small=small)
    file = open(headingfile, 'w')
    json.dump(headings, file)
    file.close()
    
    file = open(contentsfile, 'w')
    file.write(body)
    file.close()
    
    n_docs += 1 
    word_count = len(body.split())
    n_words += word_count
    
    if word_count < 3:
        pdb.set_trace()
    
    # print('---- Start doc --- ')
    # print(major)
    # print(inter)
    # print(small)
    # print(body)
    # print('---- End doc --- ')
    return
    

## Main walk the xml tree

Goal is to put each chunk under a separate heading as it's own
subdocument. There's a lot of twists and turns because the use 
of xml tags in different titles is not consistent

In [10]:
def ancestor_is_section(node):
    if node.name == 'section':
        return False
    if ((node.name == 'header') and 
        (node.parent.name.find('appropriations') == 0) and 
        (node.parent.parent.name == 'section')):
        return False
    parent = node.parent
    while (parent != None) and parent.name != 'title':
        if parent.name == 'section':
            return True
        parent = parent.parent
    return False

In [11]:
def read_title(division_name, t):
    
    headers_and_text = t.find_all(['section','header','text','continuation-text'])
    
    if headers_and_text[0].name != 'header':
        pdb.set_trace()
        
    title_name = headers_and_text[0].get_text(strip=True)
    # print("title_name = ", title_name)
    
    major_name = '' 
    inter_name = '' 
    small_name = ''
    body = ''
    finish_a_section = False
    
    for node in headers_and_text[1:]:
        
        if finish_a_section:
            if ancestor_is_section(node):
                continue
            else:
                finish_a_section = False
            
        if node.name == 'section':
            finish_a_section = True
            # finish whatever we had started -- 
            if body != '':
                save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                body = '' 
            
                
            for kid in node.children:
                if kid.name != None:
                    # checking for the header of the next section 
                    # nested under the current section
                    if kid.name.find('appropriations') == 0:
                        break
                    else:
                        body += kid.get_text(' ', strip=True)
                
            body = body.strip()
            save_doc(division_name, title_name, major_name, inter_name, small_name, body)
            body = ''
            
        elif node.name == 'header':
            
            parent_name = node.parent.name
            
            if parent_name == 'appropriations-major':
                # whenever there's a new major header, save the current body to the previous header
                if body != '':
                    if body[-1] != '.':
                        pdb.set_trace()
                    save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                    body = '' 
                # finding a new major header means we're no longer under the previous
                # intermediate or small header
                major_name = node.get_text(' ', strip=True)
                inter_name = ''
                small_name = ''
                
            elif parent_name == 'appropriations-intermediate':
                # whenever there's a new intermediate header, save the current body to the previous header
                if body != '':
                    if body[-1] != '.':
                        pdb.set_trace()
                    save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                    body = '' 
                inter_name = node.get_text(' ', strip=True)
                # new intermediate-level header means no longer under previous small heading
                small_name = '' 
                
            elif parent_name == 'appropriations-small':
                if body != '':
                    save_doc(division_name, title_name, major_name, inter_name, small_name, body)
                    body = '' 
                    small_name = ''
                    
                if small_name == '':
                    small_name = node.get_text(' ', strip=True)
                else:
                    small_name = ' '.join([small_name, node.get_text(' ', strip=True)])
            else: 
                # -- we're in a subparagraph or subsection, not a new heading?
                if node.previous_sibling.name != 'enum':
                    pdb.set_trace()
                    
                if body == '':
                    body = node.get_text(' ', strip=True)
                else:
                    body = " ".join([body, node.get_text(' ', strip=True)])
        else: # text or continuation-text
            if body == '':
                body = node.get_text(' ', strip=True)
            else:
                body = " ".join([body, node.get_text(' ', strip=True)])
            
    # end of loop - save whatever body we were working on 
    if body != '':
        save_doc(division_name, title_name, major_name, inter_name, small_name, body)

In [12]:
for d in division:
    name = d.find('header')
    division_name = name.get_text(strip=True)
    # print("DIVISION", division_name)
    
    titles = d.findAll('title')
    for t in titles:
        read_title(division_name, t)

In [13]:
n_docs

1248

In [14]:
# this isn't going to equal all the words in the doc at the
# top of the file because I didn't count headers and 
# there are a few places that are harder to parse
n_words

207416