# Retrieving Statute Data from Justia

In [1]:
import re
from bs4 import BeautifulSoup
import json
import os
import requests

## Testing a URL that directly contains statute content

In [2]:
# Fetch a single act page and print its statute text with the HTML stripped out.
test_url = 'https://law.justia.com/codes/illinois/2019/chapter-820/act-820-ilcs-5/'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(test_url, timeout=30)
response.raise_for_status()
html_res = response.text
res_soup = BeautifulSoup(html_res, 'html.parser')
# Only one element with id='codes-content' is expected, so find() (first match)
# replaces the earlier find_all() + loop-with-break.
codes_content = res_soup.find(id='codes-content')
if codes_content is not None:
    # Print the raw, HTML-free version of the content
    print(codes_content.get_text())

(820 ILCS 5/1) (from Ch. 48, par. 2a) Sec. 1. No restraining order or injunction shall be granted by any court of this State in any case involving or growing out of a dispute concerning terms or conditions of employment, enjoining or restraining any person or persons, either singly or in concert, from terminating any relation of employment or from ceasing to perform any work or labor, or from peaceably and without threats or intimidation recommending, advising, or persuading others so to do; or from peaceably and without threats or intimidation being upon any public street, or thoroughfare or highway for the purpose of obtaining or communicating information, or to peaceably and without threats or intimidation persuade any person or persons to work or to abstain from working, or to employ or to peaceably and without threats or intimidation cease to employ any party to a labor dispute, or to recommend, advise, or persuade others so to do. (Source: P.A. 83-334.) (820 ILCS 5/1.1) (from Ch.

## Starting one level above a statute page and collecting the links to each act

In [3]:
# List every link found on a chapter index page (one level above the statute text).
prepend_url = 'https://law.justia.com'
test_url = 'https://law.justia.com/codes/illinois/2019/chapter-820/'
# Bound the wait on the network and fail loudly on an HTTP error status.
response = requests.get(test_url, timeout=30)
response.raise_for_status()
html_res = response.text
res_soup = BeautifulSoup(html_res, 'html.parser')
# Only the first div with css class 'codes-listing' is used (there should only be 1),
# so find() replaces the earlier find_all() + loop-with-break.
codes_listing = res_soup.find('div', class_='codes-listing')
if codes_listing is not None:
    # Iterating through all the ul elements of the div; there may be multiple ul
    for ul_elem in codes_listing.find_all('ul'):
        for li_elem in ul_elem.find_all('li'):
            # href=True skips anchors that have no href attribute; without it,
            # a_elem.get('href') returns None and the concatenation raises TypeError.
            for a_elem in li_elem.find_all('a', href=True):
                print(a_elem)
                print(prepend_url + a_elem.get('href'))


<a href="/codes/illinois/2019/chapter-820/act-820-ilcs-5/index.html">820 ILCS 5/ - Labor Dispute Act.</a>
https://law.justia.com/codes/illinois/2019/chapter-820/act-820-ilcs-5/index.html
<a href="/codes/illinois/2019/chapter-820/act-820-ilcs-10/index.html">820 ILCS 10/ - Collective Bargaining Successor Employer Act.</a>
https://law.justia.com/codes/illinois/2019/chapter-820/act-820-ilcs-10/index.html
<a href="/codes/illinois/2019/chapter-820/act-820-ilcs-12/index.html">820 ILCS 12/ - Collective Bargaining Freedom Act.</a>
https://law.justia.com/codes/illinois/2019/chapter-820/act-820-ilcs-12/index.html
<a href="/codes/illinois/2019/chapter-820/act-820-ilcs-15/index.html">820 ILCS 15/ - Employment Contract Act.</a>
https://law.justia.com/codes/illinois/2019/chapter-820/act-820-ilcs-15/index.html
<a href="/codes/illinois/2019/chapter-820/act-820-ilcs-17/index.html">820 ILCS 17/ - Broadcast Industry Free Market Act.</a>
https://law.justia.com/codes/illinois/2019/chapter-820/act-820-ilcs-1

## Building the full collection

In [4]:
# --- Mutable state shared with recursive_collection ---
# Next id to hand out; ids start at 1.
running_id = 1
# URLs that have already been queued for a visit, so no page is crawled twice.
visited_urls = set()
# Collected statute records, each of the form ['title', 'content', id, 'url'].
statute_holder = []

# --- Crawl configuration ---
# Title prefix the crawl starts with (nothing yet).
init_title = ''
# Page the crawl starts from.
init_url = 'https://law.justia.com/codes/illinois/2019'
# Site root that the hrefs found on listing pages are appended to.
prepend_url = 'https://law.justia.com'
def recursive_collection(init_url, init_title, timeout=30):
    """Crawl a Justia codes page and collect every statute section beneath it.

    A page containing a ``div`` with class ``codes-listing`` is treated as a
    listing page: every link in it is followed recursively, and the link text
    after its first '-' (plus ', ') is appended to the running title.
    Otherwise, if the page has an element with ``id='codes-content'``, its text
    is split at every ``Sec. <token> `` marker and one record per section is
    appended to the global ``statute_holder`` as
    ``[title, content, id (as str), url]``. Pages with neither element are
    ignored.

    Side effects: reads the global ``prepend_url``; mutates the globals
    ``visited_urls``, ``statute_holder`` and ``running_id``; whenever
    ``running_id`` reaches a multiple of 5000 the accumulated records are
    written to ``<running_id>.json`` in the working directory and
    ``statute_holder`` is reset to an empty list.

    Args:
        init_url: URL of the page to fetch.
        init_title: Title prefix accumulated from the parent listing pages.
        timeout: Seconds to wait on the HTTP request before giving up.

    Raises:
        requests.exceptions.RequestException: connection errors and timeouts
            are not caught here and abort the crawl.
    """
    global running_id
    global statute_holder

    # Without a timeout a single stalled connection would hang the whole crawl.
    response = requests.get(init_url, timeout=timeout)
    if not response.ok:
        # An error page would otherwise be parsed and silently yield nothing;
        # report it and keep crawling the remaining pages.
        print('Skipping ' + init_url + ' (HTTP ' + str(response.status_code) + ')')
        return
    res_soup = BeautifulSoup(response.text, 'html.parser')

    # Determining if we are on a page listing titles or containing actual content
    codes_list_res = res_soup.find('div', class_='codes-listing')
    codes_content_res = res_soup.find(id='codes-content')

    if codes_list_res is not None:
        for ul_elem in codes_list_res.find_all('ul'):
            for li_elem in ul_elem.find_all('li'):
                # href=True skips anchors without an href, which would make the
                # URL concatenation below raise TypeError.
                for a_elem in li_elem.find_all('a', href=True):
                    new_url = prepend_url + a_elem.get('href')
                    link_text = a_elem.get_text()
                    dash_index = link_text.find('-')
                    if dash_index == -1:
                        # NOTE(review): no ', ' separator is appended in this
                        # branch, unlike the one below - confirm that is intended.
                        new_title = init_title + link_text
                    else:
                        # Keep only the text after the first dash.
                        new_title = init_title + link_text[dash_index + 1:] + ', '
                    if new_url not in visited_urls:
                        visited_urls.add(new_url)
                        recursive_collection(new_url, new_title, timeout)
    elif codes_content_res is not None:
        raw_text = codes_content_res.get_text()
        # Raw string so '\.' reaches the regex engine without an
        # invalid-escape warning from the Python compiler.
        section_indexes = list(re.finditer(r'Sec\. [^ ]* ', raw_text))
        for idx, section_index in enumerate(section_indexes):
            new_title = (init_title + section_index.group()).strip()
            # A section's content runs up to the next marker, or to the end of
            # the page text for the last section.
            if idx + 1 < len(section_indexes):
                content_end = section_indexes[idx + 1].start()
            else:
                content_end = len(raw_text)
            content = raw_text[section_index.end():content_end].strip()
            # Drop everything from the first '(Source' onwards.
            source_idx = content.find('(Source')
            if source_idx != -1:
                content = content[:source_idx].strip()
            statute_holder.append([new_title, content, str(running_id), init_url])
            if running_id % 5000 == 0:
                # The with-block closes the file so the chunk is flushed to disk
                # before the in-memory list is discarded.
                with open(str(running_id) + '.json', 'w') as chunk_file:
                    json.dump(statute_holder, chunk_file)
                statute_holder = []
            running_id += 1
                
# Start the crawl at init_url with init_title as the starting title prefix.
# Records accumulate in the global statute_holder; running_id is advanced once per record.
recursive_collection(init_url, init_title)

In [5]:
# Write out the records still held in statute_holder after the crawl, i.e. those
# collected since the last 5000-record chunk file was written.
# running_id was already advanced past the last record, so this file is named
# one higher than the final record's id.
# The with-block closes the handle so the JSON is fully flushed to disk.
with open(str(running_id) + '.json', 'w') as out_file:
    json.dump(statute_holder, out_file)