In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import json
import re
from bs4 import BeautifulSoup

In [6]:
# Interactive exploration session: build the Chrome options first, start the
# browser, then navigate to the Police Code landing page.
chrome_options = Options()
# Headless mode is intentionally off here so the browser window is visible;
# to enable it, add the "--headless" argument:
#   chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Browser handle plus an explicit-wait helper with a 10 second timeout.
driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, 10)

# Landing page of the code being explored.
base_url = "https://codelibrary.amlegal.com/codes/san_francisco/latest/sf_police"
driver.get(base_url)

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import json
import re
from bs4 import BeautifulSoup

class SFMunicipalCodeScraper:
    """Selenium-based scraper for the code published at ``self.base_url``.

    A Chrome session is started in ``__init__`` and must be released with
    ``close()``; the instance can also be used as a context manager, which
    calls ``close()`` on exit.
    """

    def __init__(self, headless=False):
        """Start a Chrome WebDriver session.

        Args:
            headless: When true, Chrome is started with ``--headless``.
        """
        self.base_url = "https://codelibrary.amlegal.com/codes/san_francisco/latest/sf_police"

        # Setup Chrome driver
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 10)

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def get_table_of_contents(self):
        """Collect chapter links from the landing page.

        Returns:
            A list of dicts with keys ``title``, ``url`` and ``chapter_number``
            (``None`` when the link text carries no "chapter N"), one per
            distinct URL, in page order.
        """
        print("Loading Municipal Code table of contents...")

        self.driver.get(self.base_url)
        time.sleep(3)

        # NOTE(review): <body> is present almost immediately, so this wait does
        # not prove the link list has rendered. The recorded run of this
        # notebook reported "Found 0 chapters" - confirm how the page exposes
        # its chapter links (and whether their URLs contain "chapter") before
        # relying on the filter below.
        self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        chapter_links = []
        seen_urls = set()

        for link in self.driver.find_elements(By.TAG_NAME, "a"):
            href = link.get_attribute("href")
            text = link.text.strip()

            # Keep only visible links whose URL mentions a chapter.
            if not (href and text and "chapter" in href.lower()):
                continue

            # The same chapter can be linked from several places on the page;
            # keep the first occurrence so it is not scraped twice.
            if href in seen_urls:
                continue
            seen_urls.add(href)

            chapter_links.append({
                'title': text,
                'url': href,
                'chapter_number': self.extract_chapter_number(text)
            })

        print(f"Found {len(chapter_links)} chapters")
        return chapter_links

    def extract_chapter_number(self, text):
        """Return the digits following "chapter" in ``text`` as a string, or ``None``."""
        match = re.search(r'chapter\s*(\d+)', text, re.IGNORECASE)
        return match.group(1) if match else None

    def scrape_chapter(self, chapter_info):
        """Load one chapter page and extract its text and section markers.

        Args:
            chapter_info: Dict with ``title``, ``url`` and ``chapter_number``,
                as produced by ``get_table_of_contents``.

        Returns:
            A dict with the chapter metadata, cleaned ``content``, the list of
            ``sections`` and a ``scraped_date`` timestamp, or ``None`` if
            anything went wrong (the error is printed, not raised).
        """
        print(f"Scraping chapter: {chapter_info['title']}")

        try:
            self.driver.get(chapter_info['url'])
            time.sleep(2)

            # Wait for content to load
            self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

            # Get page source and parse with BeautifulSoup
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # Prefer a dedicated content container; fall back to the whole body.
            content_div = (
                soup.find('div', class_='main-content')
                or soup.find('div', id='content')
                or soup.find('body')
            )

            chapter_text = self.clean_municipal_code_text(content_div.get_text())

            # NOTE: section offsets are positions in the raw text of the whole
            # page (soup.get_text()), not indexes into the cleaned 'content'.
            sections = self.extract_sections(soup)

            return {
                'chapter_number': chapter_info['chapter_number'],
                'title': chapter_info['title'],
                'url': chapter_info['url'],
                'content': chapter_text,
                'sections': sections,
                'scraped_date': time.strftime('%Y-%m-%d %H:%M:%S')
            }

        except Exception as e:
            # Deliberate best-effort: one failing chapter must not abort the run.
            print(f"Error scraping chapter {chapter_info['title']}: {e}")
            return None

    def extract_sections(self, soup):
        """Locate section headers in the page text.

        Args:
            soup: Object exposing ``get_text()`` (a parsed BeautifulSoup tree).

        Returns:
            A list of ``{'section_number', 'start_position'}`` dicts ordered by
            position in the text. Each section number occurrence is reported
            once, even when several header patterns match it.
        """
        # Header formats, most specific first; the first pattern to claim a
        # number decides its start_position.
        section_patterns = [
            re.compile(r'Sec\.\s*(\d+\.\d+)', re.IGNORECASE),
            re.compile(r'Section\s*(\d+\.\d+)', re.IGNORECASE),
            re.compile(r'(\d+\.\d+)\s*\.', re.IGNORECASE)
        ]

        all_text = soup.get_text()

        # Keyed by the offset of the number itself so that "Sec. 1.1." is not
        # counted once for the "Sec." pattern and again for the bare pattern.
        found = {}
        for pattern in section_patterns:
            for match in pattern.finditer(all_text):
                number_offset = match.start(1)
                if number_offset in found:
                    continue
                found[number_offset] = {
                    'section_number': match.group(1),
                    'start_position': match.start()
                }

        return [found[offset] for offset in sorted(found)]

    def clean_municipal_code_text(self, text):
        """Strip navigation/copyright boilerplate and normalise whitespace.

        Line structure is preserved: runs of spaces/tabs collapse to a single
        space, and two or more consecutive line breaks collapse to one blank
        line.

        Args:
            text: Raw text extracted from the page; empty or ``None`` yields ``""``.

        Returns:
            The cleaned text with leading/trailing whitespace removed.
        """
        if not text:
            return ""

        # Remove navigation labels. Word boundaries keep words such as
        # "research" intact; a standalone "Search" in the legal text is still
        # removed, as before.
        text = re.sub(
            r'\b(?:Print\s*View|Table of Contents|Search)\b',
            '',
            text,
            flags=re.IGNORECASE,
        )

        # Remove copyright notices. DOTALL keeps the original reach, where the
        # notice could span what used to be line breaks.
        text = re.sub(
            r'©.*?All rights reserved\.?',
            '',
            text,
            flags=re.IGNORECASE | re.DOTALL,
        )

        # Collapse horizontal whitespace only, so paragraph breaks survive.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Drop the single space that may now sit on either side of a newline.
        text = re.sub(r' ?\n ?', '\n', text)
        # Any run of blank lines becomes exactly one blank line.
        text = re.sub(r'\n{2,}', '\n\n', text)

        return text.strip()

    def scrape_all_chapters(self, max_chapters=None):
        """Scrape every chapter listed in the table of contents.

        Args:
            max_chapters: Upper bound on the number of chapters to scrape;
                ``None`` means no limit and ``0`` scrapes nothing.

        Returns:
            A list of chapter dicts (failed chapters are skipped).
        """
        chapters = self.get_table_of_contents()

        # Compare against None so that an explicit 0 is honoured as a limit.
        if max_chapters is not None:
            chapters = chapters[:max_chapters]

        all_data = []
        last_index = len(chapters) - 1

        for i, chapter in enumerate(chapters):
            print(f"Processing chapter {i+1}/{len(chapters)}: {chapter['title']}")

            chapter_data = self.scrape_chapter(chapter)
            if chapter_data:
                all_data.append(chapter_data)

            # Be respectful - pause between requests (not after the last one).
            if i < last_index:
                time.sleep(2)

        return all_data

    def save_data(self, data, filename="sf_municipal_code.json"):
        """Write ``data`` to ``filename`` as indented UTF-8 JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(data)} chapters to {filename}")

    def close(self):
        """Close the browser driver"""
        self.driver.quit()

# Usage example
def scrape_sf_municipal_code(max_chapters=5, headless=True, filename="sf_municipal_code.json"):
    """Scrape chapters of the code, save them as JSON and return them.

    The defaults reproduce the original test run: a headless browser, the
    first five chapters, output written to ``sf_municipal_code.json``.

    Args:
        max_chapters: Limit passed to ``scrape_all_chapters``.
        headless: Whether Chrome is started in headless mode.
        filename: Path of the JSON file to write.

    Returns:
        The list of scraped chapter dicts.
    """
    scraper = SFMunicipalCodeScraper(headless=headless)

    try:
        data = scraper.scrape_all_chapters(max_chapters=max_chapters)

        # Save the data
        scraper.save_data(data, filename=filename)

        return data

    finally:
        # Always release the browser, even if scraping or saving failed.
        scraper.close()

# Run the scraper
if __name__ == "__main__":
    municipal_code_data = scrape_sf_municipal_code()

Loading Municipal Code table of contents...
Found 0 chapters
Saved 0 chapters to sf_municipal_code.json


In [7]:
import fitz  # PyMuPDF
import json
from pathlib import Path
# PDF to read, given as a relative path (resolved against the kernel's
# current working directory). Presumably an export of the municipal code -
# TODO confirm the file's provenance.
pdf_path = "san_francisco-ca-1.pdf"
# Open the document with PyMuPDF; `doc` is iterated page by page in the next cell.
doc = fitz.open(pdf_path)

In [15]:
# Concatenate the extracted text of every page, in page order. A single join
# avoids rebuilding the growing string on each iteration.
text = "".join(page.get_text() for page in doc)

In [None]:
from transformers import GPT2Tokenizer
import re

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

# Normalise spacing: runs of spaces become one space, and three or more
# consecutive newlines become a single blank line.
text = re.sub(r' {2,}', ' ', text)
text = re.sub(r'\n{3,}', '\n\n', text)

# Split on "Article <roman numeral>: <TITLE>" headers. Because the pattern has
# two capture groups, re.split returns
#   [preamble, numeral, title, content, numeral, title, content, ...].
# The title class lists em dash, en dash and an escaped hyphen explicitly; an
# unescaped "—--" is parsed as a reversed range and raises re.error. It also
# allows only spaces/tabs (no newlines) so the title stops at the end of the
# header line instead of swallowing the first capitals of the following line.
articles = re.split(r'\n\s*Article\s+([IVX]+):\s*([A-Z \t—–\-]+)', text)
# Build one text chunk per "Section N.N: TITLE" found inside each article.
chunks = []
# `articles` comes from re.split with two capture groups, so after the leading
# preamble the items repeat as (numeral, title, content); hence the stride of 3.
for i in range(1, len(articles), 3):
    # Guard against an incomplete trailing triple.
    if i + 2 < len(articles):
        article_num = articles[i]
        article_title = articles[i + 1]
        article_content = articles[i + 2]
        # Same layout as above: [preamble, number, title, content, ...].
        # NOTE(review): `[A-Z\s]+` also matches newlines, so the captured title
        # can extend into the leading capital letters of the section body -
        # verify against real section text.
        sections = re.split(r'\n\s*Section\s+(\d+\.\d+):\s*([A-Z\s]+)', article_content)

        for j in range(1, len(sections), 3):
            if j + 2 < len(sections):
                section_num = sections[j]
                section_title = sections[j + 1]
                section_content = sections[j + 2]

                # Prefix every chunk with its article and section headers.
                chunk = f"SF Municipal Code Article {article_num}: {article_title.strip()}\n\n"
                chunk += f"Section {section_num}: {section_title.strip()}\n\n"
                chunk += section_content.strip()

                # Keep chunks under 250 tokens as counted by the tokenizer.
                if chunk and len(tokenizer.encode(chunk)) < 250:
                    chunks.append(chunk)
                elif chunk:
                    # NOTE(review): split_long_section is not defined in the
                    # visible cells and is called without arguments; in the code
                    # shown here its result is never added to `chunks`, so
                    # sections of 250 tokens or more are dropped - confirm.
                    sub_chunks = split_long_section()
