# Scrape OSHA documents

Create functions that can successfully download and store new updates to OSHA regulations. The actual documents will come from [govinfo](https://www.govinfo.gov/app/collection/cfr).

In [31]:
%pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [51]:
import os, json, requests
from pathlib import Path 
from urllib.parse import urljoin
from tqdm import tqdm
import pprint
from dotenv import load_dotenv
import re
import xml.etree.ElementTree as et

In [None]:
def download_osha_regulations(year=2024):
    """
    Download the Title 29 CFR volumes with the relevant OSHA regulations from the govinfo API.

    For each of the packages ``CFR-{year}-title29-vol5`` .. ``vol8`` the package
    summary is requested, its XML download link is read, and the XML is
    streamed to ``title29_{year}/<name>.xml``. The folder is created if needed.

    Inputs:
        - year: what year you would like.
            - the regulations get updated on July 1st of every year so that is why the default is 2024 for now.

    Raises:
        - RuntimeError: the GOV_API environment variable is not set.
        - requests.HTTPError: a summary or download request returned an error status.
    """
    load_dotenv()
    # get api key from api.data.gov (no sign up necessary)
    API_KEY = os.getenv("GOV_API")
    if not API_KEY:
        raise RuntimeError("Set the GOV_API environment variable")

    DEST_DIR = f'title29_{year}'
    # open() below does not create missing folders, so make sure it exists.
    os.makedirs(DEST_DIR, exist_ok=True)

    base_package = f'CFR-{year}-title29-'
    packages = ['vol5', 'vol6', 'vol7', 'vol8']
    auth = {"api_key": API_KEY}

    xml_links = []

    for package in packages:
        full_package = base_package + package
        BASE = f"https://api.govinfo.gov/packages/{full_package}"
        # Fail loudly on an HTTP error instead of hitting a confusing KeyError
        # on the error payload; the timeout keeps a stalled request from hanging.
        summary = requests.get(f"{BASE}/summary", params=auth, timeout=120)
        summary.raise_for_status()
        coll = summary.json()
        xml_links.append(coll['download']['xmlLink'])

    for url in xml_links:
        # The file is named after the second-to-last path component of the link
        # (presumably the package id, e.g. .../packages/<package id>/xml).
        fname = Path(url.rstrip("/")).parts[-2] + '.xml'
        out = os.path.join(DEST_DIR, fname)

        with requests.get(url, params=auth, stream=True, timeout=120) as r:
            r.raise_for_status()
            with open(out, "wb") as fp:
                for chunk in r.iter_content(1 << 15):
                    fp.write(chunk)


In [None]:
def strip_prtpage(elem):
    """
    Remove every <PRTPAGE> descendant of ``elem`` in place.

    Elements from ``xml.etree.ElementTree`` have no ``getparent()``, so each
    potential parent is visited and its own <PRTPAGE> children are removed.
    Text that follows a removed marker (its ``tail``) is kept: it is appended
    to the previous sibling's tail, or to the parent's text when the marker
    was the first child.

    Inputs:
        - elem: root element of the subtree to clean. It is never removed itself.
    """
    # Snapshot the traversal first: the tree is mutated inside the loop.
    for parent in list(elem.iter()):
        previous = None
        for child in list(parent):
            if child.tag != 'PRTPAGE':
                previous = child
                continue
            if child.tail:
                # Removing an element also drops its tail, so move it first.
                if previous is None:
                    parent.text = (parent.text or '') + child.tail
                else:
                    previous.tail = (previous.tail or '') + child.tail
            parent.remove(child)

def clean_inline_text(node):
    """
    Flatten the text inside a <P> into one whitespace-normalised string.

    Direct <E> children are rendered according to their ``T`` style code:
    - <E T="03">  : italics     - emitted as plain text
    - <E T="51">  : superscript - prefixed with "^"
    - <E T="52">  : subscript   - prefixed with "_"
    - any other code            - emitted as plain text
    You can map the style codes however you want.

    Every other child element contributes its plain text, and the text that
    follows each child (its ``tail``) is always kept, so a paragraph is no
    longer cut short after an inline element that is not an <E>.

    Inputs:
        - node: the element whose text should be flattened.

    Returns:
        The flattened text with runs of whitespace collapsed to a single
        space and leading/trailing whitespace removed.
    """
    prefixes = {'51': '^', '52': '_'}   # style code -> marker put before the text

    parts = [node.text or '']
    for child in node:
        # Skip the content of comments / processing instructions (non-string
        # tag), but still keep the text that follows them.
        if isinstance(child.tag, str):
            inner = ''.join(child.itertext())
            if child.tag == 'E':
                inner = prefixes.get(child.get('T'), '') + inner
            parts.append(inner)
        parts.append(child.tail or '')

    raw = ''.join(parts)
    return re.sub(r'\s+', ' ', raw).strip()

def parse_section(section):
    """
    Turn one <SECTION> element into a list of paragraph records.

    Page markers are stripped from the section first (in place). Each direct
    <P> child then becomes a dict with the keys ``key``, ``sectno``,
    ``subject``, ``paragraph_index`` (1-based) and ``text``.

    Inputs:
        - section: the <SECTION> element to parse.

    Returns:
        A list of record dicts, one per <P>, in document order.
    """
    strip_prtpage(section)                 # toss page markers

    section_number = section.findtext('SECTNO', '').strip()
    heading = section.findtext('SUBJECT', '').strip()

    # The key prefix is the same for every paragraph: whatever follows the
    # last "§" in the section number, with spaces turned into dashes.
    citation = section_number.split('§')[-1].strip().replace(' ', '-')
    key_prefix = f"29-CFR-{citation}"

    return [
        {
            "key": f'{key_prefix}-p{position}',
            "sectno": section_number,
            'subject': heading,
            'paragraph_index': position,
            'text': clean_inline_text(paragraph),
        }
        for position, paragraph in enumerate(section.findall('P'), start=1)
    ]


In [None]:
# new updates happen on July 1st of each year
year = 2024
records = []
xml_dir = f'title29_{year}/'

# make sure the target folder exists, then download all the osha regulations
# BEFORE listing the folder, so a first run sees the freshly downloaded files
os.makedirs(xml_dir, exist_ok=True)
download_osha_regulations(year=year)

# only the xml volumes, in a stable order (skips checkpoints, hidden files, ...)
files = sorted(name for name in os.listdir(xml_dir) if name.endswith('.xml'))

# parse through all the xml files and append them to records
for vol in files:
    root = et.parse(xml_dir + vol).getroot()
    for section in root.findall('.//SECTION'):
        records.extend(parse_section(section))

# write the records to a jsonl file for long term storage (one record per line)
with open(f'osha_regulations_test_{year}.jsonl', 'w', encoding='utf-8') as f:
    for rec in records:
        json.dump(rec, f, ensure_ascii=False)
        f.write('\n')