In [6]:
import glob
import os
from markdownify import markdownify as md

In [7]:
# Directory that holds the downloaded source files; the conversion cell globs
# its *.html files from here.
RAW_PATH = "./raw_files"
# Directory for the Markdown versions of the downloaded HTML pages.
MARKDOWN_PATH = "./markdown_pages"

In [8]:
# USCIS page for the Immigration and Nationality Act
# (presumably the page handed to get_uscode_urls -- confirm against the caller).
HOME_URL = "https://www.uscis.gov/laws-and-policy/legislation/immigration-and-nationality-act"
# Plan: walk every <tr> tag in the page and keep the row's link when the row
# references "uscode.house.gov".

import requests
from bs4 import BeautifulSoup
import re

def get_uscode_urls(url):
    """Collect the uscode.house.gov links found in the table rows of a page.

    Downloads `url`, scans every <tr> element and, for each row, keeps the
    first <a> whose href points at "uscode.house.gov". The collected links
    are written to "uscode_urls.txt" in the current working directory, one
    per line, replacing any previous content.

    Args:
        url: Address of the page to scan.

    Raises:
        requests.HTTPError: If the page answers with a 4xx/5xx status. The
            output file is left untouched in that case.
    """
    response = requests.get(url)
    # Fail loudly instead of overwriting the URL list with whatever an
    # error page happens to contain.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    urls = []
    for tr in soup.find_all("tr"):
        # Check the hrefs themselves: a row may mention uscode.house.gov while
        # its first anchor links somewhere else, or may have no anchor at all.
        for anchor in tr.find_all("a"):
            href = anchor.get("href")
            if href and "uscode.house.gov" in href:
                urls.append(href)
                break  # keep one link per row

    with open("uscode_urls.txt", "w") as f:
        f.writelines(link + "\n" for link in urls)

def load_uscode_urls():
    """Read the saved URL list from "uscode_urls.txt".

    The file is expected in the current working directory with one URL per
    line. Surrounding whitespace is stripped and blank lines are skipped, so
    a hand-edited file or a trailing empty line never yields an empty URL.

    Returns:
        list[str]: The URLs in file order.

    Raises:
        FileNotFoundError: If "uscode_urls.txt" does not exist.
    """
    with open("uscode_urls.txt", "r") as f:
        stripped_lines = (line.strip() for line in f)
        return [url for url in stripped_lines if url]

def save_page(url):
    """Download `url` and store it in the "raw_files" directory.

    The file is named "<positive_hash(url)>.<ext>". Known document types
    (PDF, Word, PowerPoint) are written as raw bytes with a matching
    extension; everything else is written as UTF-8 text with ".html".

    Args:
        url: Address of the resource to download.

    Returns:
        None. A non-200 response prints a message and writes nothing.
    """
    # Media type (Content-Type without parameters) -> file extension for the
    # formats that must be stored as raw bytes.
    binary_extensions = {
        'application/pdf': 'pdf',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/vnd.ms-powerpoint': 'ppt',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    }

    # Send a GET request to the URL
    response = requests.get(url)
    # Check if request was unsuccessful
    if response.status_code != 200:
        print("Failed to retrieve page. Status code:", response.status_code)
        return

    # Servers may append parameters ("application/pdf; charset=binary"), vary
    # the case, or omit the header entirely; compare only the bare media type.
    content_type = response.headers.get('Content-Type', '')
    media_type = content_type.split(';', 1)[0].strip().lower()

    title = positive_hash(url)

    # Create the target directory before ANY write, so a binary download can
    # also be the first file saved.
    os.makedirs('raw_files', exist_ok=True)

    extension = binary_extensions.get(media_type)
    if extension is not None:
        with open(os.path.join('raw_files', f"{title}.{extension}"), 'wb') as f:
            f.write(response.content)
        return

    # Anything else is treated as an HTML page and saved as text.
    with open(os.path.join('raw_files', f"{title}.html"), 'w', encoding='utf-8') as f:
        f.write(response.text)

def positive_hash(obj):
    """
    Custom hash function that always returns non-negative integers.

    For `str` and `bytes` the value is the first 64 bits of the SHA-256
    digest (strings are encoded as UTF-8). Python's built-in `hash()` salts
    these types per process, so it would give the same URL a different value
    after every kernel restart and file names derived from it would not be
    reproducible. Every other object falls back to the built-in `hash()`,
    with negative results mapped into the unsigned 64-bit range.

    Args:
        obj: A hashable object, typically a URL string.

    Returns:
        int: A value in the range [0, 2**64).
    """
    import hashlib  # local import keeps this helper self-contained

    if isinstance(obj, str):
        obj = obj.encode("utf-8")
    if isinstance(obj, bytes):
        digest = hashlib.sha256(obj).digest()
        return int.from_bytes(digest[:8], "big")

    h = hash(obj)

    # Ensure that the hash is non-negative
    # If hash is negative, convert it to positive using bitwise AND with 64 bits mask
    return h & 0xFFFFFFFFFFFFFFFF if h < 0 else h

In [9]:
# Convert every downloaded HTML page in RAW_PATH to Markdown and write the
# result to MARKDOWN_PATH under the same base name with a ".md" extension.
raw_files = glob.glob(f'{RAW_PATH}/*.html')
# The output directory may not exist yet on a fresh checkout.
os.makedirs(MARKDOWN_PATH, exist_ok=True)
for file in raw_files:
    # The pages were saved as UTF-8, so read them back the same way instead
    # of relying on the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        html = f.read()
    markdown = md(html)
    # clean md, remove extra newlines
    markdown = re.sub(r'\n+', '\n', markdown)
    # basename/splitext work with either path separator and only touch the
    # real extension, unlike split('/') and replace('.html', ...).
    filename = os.path.splitext(os.path.basename(file))[0] + '.md'
    with open(os.path.join(MARKDOWN_PATH, filename), 'w', encoding='utf-8') as m:
        m.write(markdown)
    print(f'Wrote {filename}')

Wrote 9646125197792401319.md
Wrote 1818321106799043215.md
Wrote 14408589730375093859.md
Wrote 6124414029434287421.md
Wrote 17115377287876241212.md
Wrote 15821040714501759319.md
Wrote 4100129626265775993.md
Wrote 13917809994922122997.md
Wrote 11313036238005040531.md
Wrote 15125476188632257037.md
Wrote 3227536573044741727.md
Wrote 8143969272295760258.md
Wrote 4559509852981699168.md
Wrote 3717528837540295147.md
Wrote 6357698095729765085.md
Wrote 12472764646358831906.md
Wrote 12511622026570470624.md
Wrote 5463469140688192261.md
Wrote 5901327131144935065.md
Wrote 6127210229188801051.md
Wrote 14441009759710036831.md
Wrote 7540081687172128258.md
Wrote 500975624348518658.md
Wrote 13965068971642065227.md
Wrote 13852645353189289915.md
Wrote 3909229509988676055.md
Wrote 18139202832630012475.md
Wrote 3765188495396615001.md
Wrote 13333512499456460786.md
Wrote 8359103829738002917.md
Wrote 18361754638474672571.md
Wrote 5459489933179602034.md
Wrote 16240309853790683979.md
Wrote 11496257468360657201.md