In [None]:
!pip install -r requirements.txt

# DC Regulations Data Download

This notebook crawls the Washington, DC regulations website at `https://www.dcregs.dc.gov`. The goal is to download the legacy Word documents it links to and convert them into a set of PDFs.

The final step, converting from `.doc` to `.pdf`, uses the `soffice` command-line tool from LibreOffice. LibreOffice must be installed and `soffice` must be available on your `PATH`.

In [183]:
import subprocess
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

import httpx
from bs4 import BeautifulSoup
from rich import print
from rich.progress import track
from tqdm import tqdm

## Download HTML pages of rule lists

Example Rule List Page: https://www.dcregs.dc.gov/Common/DCMR/RuleList.aspx?ChapterNum=14-12

This is the rule list for Title 14 (Housing) chapter 12 ("APARTMENTS AND APARTMENT HOUSES")

In [159]:
# DCMR title whose chapter pages are crawled.
TITLE_NUM = 14 # title 14 is housing regulations
# Number of chapter pages requested for Title 14 (chapters 1 through 99).
# NOTE(review): the original author was unsure whether this count can change.
TITLE_14_REG_N = 99 # There are 99 reserved chapters for title 14
# Rule-list page URL; the placeholder takes "<title>-<chapter>", e.g. "14-12".
CHAPTER_URL_TMPL = "https://www.dcregs.dc.gov/Common/DCMR/RuleList.aspx?ChapterNum={}"

In [148]:
def download_rule_lists(title_num: int, n: int) -> dict[str, BeautifulSoup]:
    """Download and parse the rule-list pages for chapters 1..n of a title.

    Requests are issued one at a time, in chapter order.

    Args:
        title_num: Title number used as the first half of the `ChapterNum`
            query value (e.g. 14).
        n: Number of chapters to request; chapters 1 through `n` inclusive.

    Returns:
        Mapping of each requested page URL to its parsed HTML.
    """
    soups = {}
    description = f"Downloading Rule List Pages for Title {title_num}"
    for chapter_num in track(range(1, n + 1), total=n, description=description):
        url = CHAPTER_URL_TMPL.format(f"{title_num}-{chapter_num}")
        # The HTTP status is not checked here: every response body is parsed
        # and stored as-is.
        res = httpx.get(url)
        # NOTE(review): no parser is named, so bs4 picks one based on what is
        # installed; consider pinning it for reproducible parsing.
        soups[url] = BeautifulSoup(res.text)

    return soups


soups = download_rule_lists(TITLE_NUM, TITLE_14_REG_N)

Output()

In [160]:
# Sanity check: number of parsed pages (one dict entry per distinct URL requested).
len(soups)

99

In [161]:
@dataclass
class Rule:
    """Parse result of one row of a rule list table."""
    # Title number: the integer before the first "-" in `section_num`.
    title_id: int
    # Section number exactly as shown in the table (whitespace-stripped).
    section_num: str
    # Human-readable section heading from the table (whitespace-stripped).
    section_heading: str
    # Direct URL of the rule's `.doc` file.
    download_url: str
    # Effective date kept as the raw, whitespace-stripped text of the table
    # cell; it is not parsed into a datetime.
    effective_date: str

In [172]:
def extract_rules(soup: BeautifulSoup, url: str = "") -> list[Rule]:
    """Extract Rule instances from a RuleList.aspx Chapter HTML soup's main table.

    The goal is to get a direct download URL to a `.doc` file we can download.

    Args:
        soup: Parsed HTML of one chapter's rule-list page.
        url: Address the page was fetched from. Optional; it is only used to
            make the "no rules table" warning identifiable.

    Returns:
        One Rule per well-formed table row, in page order. An empty list when
        the page has no element with id "ruleTable".
    """
    rules_table = soup.find(id="ruleTable")
    if rules_table is None:
        if url:
            print(f"Warning: Invalid Page, no rules table here: {url} skipping")
        else:
            print("Warning: Invalid Page, no rules table here, skipping")
        return []

    # Not every HTML parser synthesizes a <tbody>; fall back to the table itself.
    table_body = rules_table.find("tbody")
    if table_body is None:
        table_body = rules_table

    rules = []
    for row in table_body.find_all("tr"):
        cols = row.find_all("td")
        if not cols:
            # Rows without <td> cells (e.g. header rows) carry no rule data.
            continue
        if len(cols) != 5:
            print(f"Warning: skipping row with {len(cols)} columns (expected 5)")
            continue
        _, section_num_td, section_heading_td, latest_version_td, effective_date_td = cols

        section_num = section_num_td.text.strip()
        # The title number is the part of the section number before the first "-".
        title_id = int(section_num.split("-")[0])

        rule = Rule(
            title_id=title_id,
            section_num=section_num,
            section_heading=section_heading_td.text.strip(),
            # The download URL is read from the link's `title` attribute.
            download_url=latest_version_td.find("a")["title"],
            # Kept as the raw cell text; no date parsing is done here.
            effective_date=effective_date_td.text.strip(),
        )
        rules.append(rule)
    return rules

In [173]:
# Parse every downloaded chapter page, keyed by the URL it was fetched from.
# NOTE(review): keep the loop variable named `url` unless extract_rules is
# confirmed not to read it from the enclosing scope.
rules_by_url = {}
for url, soup in soups.items():
    rules_by_url[url] = extract_rules(soup)

len(rules_by_url)



99

In [181]:
# Show the first rule of the first chapter that actually has rules; indexing
# the first chapter unconditionally fails when that chapter's list is empty.
example_url, example_rules = next(
    (
        (chapter_url, chapter_rules)
        for chapter_url, chapter_rules in rules_by_url.items()
        if chapter_rules
    ),
    (None, []),
)
print("Example rule for: ", example_url)
print(example_rules[0] if example_rules else "No rules were extracted")

## Download all rule files

Each rule now has a direct `download_url` we can use. Regulation files are legacy Microsoft Word `.doc` files.

In [182]:
def download_doc_file(url: str, output_path: Path, chunk_size: int = 1024 * 1024 * 2) -> None:
    """Stream the file at `url` to `output_path`.

    The response body is written in chunks instead of being loaded into memory
    first. Data goes to a sibling "<name>.part" file that is renamed onto
    `output_path` only after the download completes, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: Direct download URL.
        output_path: Destination file path; an existing file is overwritten.
        chunk_size: Number of bytes per chunk (defaults to 2 MB).

    Raises:
        httpx.HTTPStatusError: If the server answers with a non-success
            status. Nothing is written in that case.
    """
    output_path = Path(output_path)
    partial_path = output_path.with_name(output_path.name + ".part")
    with httpx.stream("GET", url) as res:
        # Fail before touching the disk so error pages are never saved as .doc.
        res.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in res.iter_bytes(chunk_size=chunk_size):
                f.write(chunk)
    partial_path.replace(output_path)

## Convert downloaded .doc to PDF

There are few tools that can process legacy `.doc` files directly, so we convert these files to PDF for downstream parsing.
This function requires LibreOffice, which can be installed via Homebrew on macOS and is installed by default on many Linux distributions (e.g. Ubuntu).

Install on Mac: https://formulae.brew.sh/cask/libreoffice

```bash
brew install --cask libreoffice
```

In [186]:
def convert_to_pdf(working_dir: "str | Path") -> None:
    """Convert every `*.doc` file directly inside `working_dir` to PDF.

    Runs `soffice --headless --convert-to pdf <file>` once per file with
    `working_dir` as the current directory. Files are processed in sorted
    order and subdirectories are not searched.

    Args:
        working_dir: Directory containing the `.doc` files (string or Path).

    Raises:
        FileNotFoundError: If the `soffice` executable cannot be found.
        subprocess.CalledProcessError: If `soffice` exits with a non-zero
            status; remaining files are not processed.
    """
    working_dir = Path(working_dir)
    doc_paths = sorted(working_dir.glob("*.doc"))
    for path in tqdm(doc_paths):
        # Pass only the file name: the command runs with cwd=working_dir.
        subprocess.check_call(
            ["soffice", "--headless", "--convert-to", "pdf", path.name],
            cwd=working_dir,
        )

In [124]:
# Smoke test: convert the .doc files that sit directly in ./data.
convert_to_pdf("./data")

100%|███████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.28s/it]

convert /Users/kabirkhan/Documents/dc-housing/data/2072555F-0000-CD24-BF63-4F4BD0E7E558.doc as a Writer document -> /Users/kabirkhan/Documents/dc-housing/data/2072555F-0000-CD24-BF63-4F4BD0E7E558.pdf using filter : writer_pdf_Export





## Actually download all the rule files.

The next two steps take a while. Both run in a single process and thread. The download could be made faster, but be considerate of government websites and avoid hitting them with many parallel requests.

In [None]:
output_dir = Path("data/housing")
output_dir.mkdir(parents=True, exist_ok=True)

for url, rules in tqdm(list(rules_by_url.items())):
    for rule in tqdm(rules):
        # File name: "<title>_<section>_<heading-words-joined-by-dashes>.doc".
        section_heading = "-".join(word.lower() for word in rule.section_heading.split())
        download_file_name = f"{rule.title_id}_{rule.section_num}_{section_heading}.doc"
        # "/" would be read as a directory separator, so replace it.
        download_file_name = download_file_name.replace("/", "__")
        output_path = output_dir / download_file_name
        if output_path.exists():
            # Already downloaded on a previous run; skip it so re-running this
            # cell does not hit the server again. Delete the file to force a
            # fresh download.
            continue
        download_doc_file(rule.download_url, output_path=output_path)

## Convert all the .doc rule files in `./data/housing/` into PDF

In [None]:
# Convert every downloaded .doc file that sits directly in ./data/housing.
convert_to_pdf("./data/housing")

In [192]:
# Move the generated PDFs into their own sub-directory. Pure Python instead of
# shell commands: works on any platform and is a no-op when there are no PDFs.
pdf_dir = Path("data/housing/pdf")
pdf_dir.mkdir(parents=True, exist_ok=True)
for pdf_path in sorted(Path("data/housing").glob("*.pdf")):
    # Path.replace overwrites an existing target, matching `mv`.
    pdf_path.replace(pdf_dir / pdf_path.name)