In [84]:
import glob
import os
from datetime import datetime
import requests
from markdownify import markdownify as md
import re

In [85]:
# Directory for the downloaded pages and directory for their Markdown conversions.
# NOTE(review): save_page and convert_html_to_md below spell these paths out as
# literals instead of reading these constants — keep the two in sync.
RAW_PATH = "./raw_files"
MARKDOWN_PATH = "./markdown_pages"

In [86]:
# Immigration and Nationality Act page on uscis.gov.
# NOTE(review): presumably the page meant to be passed to get_uscode_urls();
# no such call is visible in this notebook — confirm.
HOME_URL = "https://www.uscis.gov/laws-and-policy/legislation/immigration-and-nationality-act"
# get_uscode_urls() looks at every <tr> tag in a page and saves the row's link
# when the row contains "uscode.house.gov".

import requests
from bs4 import BeautifulSoup
import re

def get_uscode_urls(url):
    """Collect the U.S. Code links found in the table rows of ``url``.

    Every ``<tr>`` whose markup mentions ``uscode.house.gov`` contributes the
    first anchor in that row whose ``href`` points at that host. The collected
    URLs are written, one per line, to ``uscode_urls.txt`` in the current
    working directory (the file read back by ``load_uscode_urls``).

    Args:
        url: Address of the page whose table rows are scanned.

    Returns:
        The list of collected URLs, in page order.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page and overwriting the URL
    # list with an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    urls = []
    for tr in soup.find_all("tr"):
        if "uscode.house.gov" not in str(tr):
            continue
        # A row may mention the host without linking to it, or may contain
        # other links first; only accept an anchor that actually targets it.
        for anchor in tr.find_all("a", href=True):
            if "uscode.house.gov" in anchor["href"]:
                urls.append(anchor["href"])
                break

    with open("uscode_urls.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{link}\n" for link in urls)
    return urls

def load_uscode_urls():
    """Read the URL list stored in ``uscode_urls.txt``.

    The file is expected to hold one URL per line. Surrounding whitespace is
    stripped and blank lines are skipped so that no empty string is handed to
    the downloader.

    Returns:
        A list of URL strings in file order.
    """
    with open("uscode_urls.txt", "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def positive_hash(url):
    """Return a non-negative 32-bit hash of the string ``url``.

    Starts from 5381 and, for each character, multiplies the running value by
    33 and adds the character's code point (the djb2 scheme). The result is
    masked to the low 32 bits.
    """
    acc = 5381
    for ch in url:
        acc = acc * 33 + ord(ch)
    return acc & 0xFFFFFFFF

def save_page(url):
    """Download ``url`` and store it in the ``raw_files`` directory.

    Responses whose Content-Type contains ``application`` are written
    unchanged under the last path segment of the URL. Text responses are
    stored as ``<title>.html``, reduced to the ``docViewer`` div (minus its
    ``contextHeader`` div) when that div is present; otherwise the full
    response text is stored.

    Args:
        url: Address of the page to download.

    Returns:
        The name of the file created inside ``raw_files``, or ``None`` when
        the request fails, the status code is not 200, or the content type is
        neither ``application`` nor ``text``.
    """
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Treat transport errors like a bad status: report and skip the page.
        print("Failed to retrieve page:", exc)
        return None
    if response.status_code != 200:
        print("Failed to retrieve page. Status code:", response.status_code)
        return None

    # Create a directory to store pages if it doesn't exist
    os.makedirs('raw_files', exist_ok=True)

    # The header is optional; a missing one is handled as "not text" below.
    content_type = response.headers.get('Content-Type', '')

    # Handle binary content
    if 'application' in content_type:
        # A URL ending in "/" has an empty last segment; fall back to the hash.
        title = url.split("/")[-1] or str(positive_hash(url))
        with open(os.path.join('raw_files', title), 'wb') as f:
            f.write(response.content)
        return title

    if 'text' not in content_type:
        print("Content type not text. Skipping...")
        return None

    # Extract the title. DOTALL lets a title that spans several lines match
    # (its whitespace is collapsed right after); IGNORECASE accepts <TITLE>.
    search_result = re.search(
        r'<title>(.*?)</title>', response.text, re.IGNORECASE | re.DOTALL
    )
    title = ''
    if search_result:
        title = re.sub(r'\s+', ' ', search_result.group(1).strip())  # Remove extra whitespace
        title = re.sub(r'[^\w\s-]', '', title)  # Ensure the title is filename-safe
    if not title.strip():
        # No usable title (missing, empty, or punctuation only).
        title = str(positive_hash(url))

    # Parse HTML content
    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find("div", {"id": "docViewer"})
    if content:
        context_header = content.find("div", {"class": "contextHeader"})
        if context_header:
            context_header.decompose()  # Remove the context header
        content_html = content.prettify()
    else:
        content_html = response.text

    # Save the cleaned or original content
    saved_name = f"{title}.html"
    with open(os.path.join('raw_files', saved_name), 'w', encoding='utf-8') as f:
        f.write(content_html)

    return saved_name

In [87]:
# write to new file in ./markdown_pages/*.md
def convert_html_to_md(filename, url):
    """Convert ``./raw_files/<filename>`` to Markdown in ``./markdown_pages``.

    The HTML is converted with markdownify, runs of newlines are collapsed to
    a single newline, and an HTML comment holding the source URL and the
    access timestamp is prepended. The output keeps the input's base name,
    with a trailing ``.html`` replaced by ``.md``.

    Args:
        filename: Name of the file inside ``./raw_files``.
        url: Source URL recorded in the metadata header.
    """
    # Read with the same encoding save_page uses when writing the file, so the
    # result does not depend on the platform's default encoding.
    with open(f"./raw_files/{filename}", 'r', encoding='utf-8') as f:
        html = f.read()
    markdown = md(html)
    # clean md, remove extra newlines
    markdown = re.sub(r'\n+', '\n', markdown)
    # add metadata: url, date_accessed
    date_accessed = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    metadata = f"<!--\nurl: {url}\ndate_accessed: {date_accessed}\n-->\n"
    markdown = metadata + markdown

    filename = filename.split('/')[-1]
    # Swap only the extension, not an ".html" that appears inside the name.
    if filename.endswith('.html'):
        filename = filename[:-len('.html')] + '.md'

    # The output directory may not exist on a fresh checkout.
    os.makedirs('./markdown_pages', exist_ok=True)
    with open(f'./markdown_pages/{filename}', 'w', encoding='utf-8') as m:
        m.write(markdown)
    print(f'Wrote {filename}')

In [88]:
# Download every listed page and convert each saved file to Markdown.
URLS = load_uscode_urls()
for url in URLS:
    filename = save_page(url)
    # save_page returns None when the download fails or the content type is
    # unsupported; there is no file to convert in that case.
    if filename is None:
        print(f"Skipping {url}: page was not saved")
        continue
    convert_html_to_md(filename, url)

Wrote 8 USC 1101 Definitions.md
Wrote 8 USC 1102 Diplomatic and semidiplomatic immunities.md
Wrote 8 USC 1103 Powers and duties of the Secretary the Under Secretary and the Attorney General.md
Wrote 8 USC 1104 Powers and duties of Secretary of State.md
Wrote 8 USC 1105 Liaison with internal security officers data exchange.md
Wrote 8 USC 1105a Employment authorization for battered spouses of certain nonimmigrants.md
Wrote 8 USC 1151 Worldwide level of immigration.md
Wrote 8 USC 1152 Numerical limitations on individual foreign states.md
Wrote 8 USC 1153 Allocation of immigrant visas.md
Wrote 8 USC 1154 Procedure for granting immigrant status.md
Wrote 8 USC 1155 Revocation of approval of petitions effective date.md
Wrote 8 USC 1156 Unused immigrant visas.md
Wrote 8 USC 1157 Annual admission of refugees and admission of emergency situation refugees.md
Wrote 8 USC 1158 Asylum.md
Wrote 8 USC 1159 Adjustment of status of refugees.md
Wrote 8 USC 1160 Special agricultural workers.md
Wrote 8 USC

In [91]:
import glob
import re

MARKDOWN_PATH = './markdown_pages'
def remove_links(text):
    """Return ``text`` with each Markdown link ``[label](target)`` replaced by ``label``."""
    link_pattern = re.compile(r'\[([^\]]+)\]\([^\)]+\)')
    return link_pattern.sub(r'\1', text)

def clean_md_pages():
    """Strip links and blank lines from every ``*.md`` file in ``MARKDOWN_PATH``.

    Each file is read, passed through ``remove_links``, has its empty
    (whitespace-only) lines removed, and is rewritten in place. The number of
    files found is printed first.
    """
    # Sorted so the processing order does not depend on the filesystem.
    md_files = sorted(glob.glob(f'{MARKDOWN_PATH}/*.md'))
    print(f'Number of Markdown files found: {len(md_files)}')

    for file in md_files:
        # Finish reading (and close the handle) before reopening for writing.
        with open(file, 'r', encoding='utf-8') as f:
            original_text = f.read()

        # Remove markdown links
        md_text = remove_links(original_text)

        # Remove empty lines
        md_text = re.sub(r'^\s*\n', '', md_text, flags=re.MULTILINE)

        # Leave already-clean files untouched so re-running the cell is cheap.
        if md_text == original_text:
            continue

        with open(file, 'w', encoding='utf-8') as m:
            m.write(md_text)

# Clean every Markdown file under MARKDOWN_PATH in place (links and blank lines removed).
clean_md_pages()


Number of Markdown files found: 173
