# Fine-tuning distilgpt2 on San Francisco Legislation

In [1]:
!pip install transformers datasets peft accelerate bitsandbytes torch

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.me

## Pull Legislation data from Legistar

In [1]:
# Import all required libraries
import pandas as pd
import torch
import re
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from huggingface_hub import notebook_login
import requests
import time

def fetch_sf_legislation_legistar(limit=100, matter_type=None):
    """
    Fetch SF legislation data from Legistar Web API

    Args:
        limit: Maximum number of records to fetch per request
        matter_type: Filter by matter type (e.g., 'Ordinance', 'Resolution')

    Returns:
        List of legislation records
    """
    base_url = 'https://webapi.legistar.com/v1/sfgov/matters'
    all_records = []
    skip = 0

    while True:
        # Build query parameters - simplified to avoid format issues
        params = {
            '$top': limit,
            '$skip': skip
        }

        # Add matter type filter if specified
        if matter_type:
            params['$filter'] = f"MatterTypeName eq '{matter_type}'"

        print(f"Fetching records {skip} to {skip + limit}...")

        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()

            records = response.json()

            if not records:
                break

            all_records.extend(records)
            skip += limit

            print(f"Retrieved {len(records)} records (total: {len(all_records)})")

            # Stop if we got fewer records than requested (last page)
            if len(records) < limit:
                break

            # Be respectful to the API
            time.sleep(0.5)

        except requests.exceptions.RequestException as e:
            print(f'Error fetching data: {e}')
            break

    return all_records

def download_sample_sf_data():
    """
    Try a simple direct API call to get some data
    """
    url = 'https://webapi.legistar.com/v1/sfgov/matters'

    try:
        print(f"Trying simple API call: {url}")
        response = requests.get(url)
        response.raise_for_status()

        data = response.json()
        print(f"Success! Retrieved {len(data)} records")
        return data

    except Exception as e:
        print(f"Failed: {e}")
        return None

def create_realistic_synthetic_data():
    """
    Create realistic synthetic SF legislation data based on actual SF topics
    """
    print("Creating realistic synthetic SF legislation data...")

    # Real SF legislation topics and patterns
    real_topics = [
        # Housing & Development
        "Affordable Housing Development on Mission Street",
        "Zoning Changes for Housing Production in SOMA",
        "Rent Stabilization Ordinance Updates",
        "Inclusionary Housing Requirements",

        # Transportation
        "Vision Zero Traffic Safety Improvements",
        "Bicycle Lane Installation on Market Street",
        "Muni Service Changes and Route Modifications",
        "Parking Meter Rate Adjustments",
        "Electric Vehicle Charging Station Requirements",

        # Business & Economy
        "Small Business Relief Fund Allocation",
        "Food Truck Permit Process Streamlining",
        "Cannabis Retail Location Restrictions",
        "Minimum Wage Increase Implementation",

        # Environment & Public Health
        "Single-Use Plastic Reduction Ordinance",
        "Green Building Requirements for New Construction",
        "Air Quality Monitoring Expansion",
        "Urban Forest Protection Measures",

        # Public Safety
        "Police Accountability Commission Authority",
        "Emergency Preparedness Protocol Updates",
        "Fire Station Staffing Requirements",
        "Public Space Safety Improvements",

        # Social Services
        "Homeless Services Funding Allocation",
        "Mental Health Treatment Facility Expansion",
        "Senior Services Program Enhancement",
        "Youth Recreation Program Funding"
    ]

    supervisors = [
        "Aaron Peskin", "Catherine Stefani", "Joel Engardio", "Gordon Mar",
        "Dean Preston", "Matt Dorsey", "Myrna Melgar", "Rafael Mandelman",
        "Hillary Ronen", "Shamann Walton", "Ahsha Safai"
    ]

    committees = [
        "Board of Supervisors", "Budget and Finance Committee",
        "Land Use and Transportation Committee", "Public Safety and Neighborhood Services Committee",
        "Rules Committee", "Government Audit and Oversight Committee"
    ]

    synthetic_data = []

    for i, topic in enumerate(real_topics):
        # Generate realistic file numbers (format: YYNNNN)
        year = "24" if i % 2 == 0 else "23"
        file_num = f"{year}{2000 + i:04d}"

        # Create realistic legislation entry
        sponsor = supervisors[i % len(supervisors)]
        committee = committees[i % len(committees)]

        # Determine type and status
        if "Ordinance" in topic or "Requirements" in topic or "Restrictions" in topic:
            leg_type = "Ordinance"
            status = "Passed" if i % 3 == 0 else "In Committee"
        elif "Fund" in topic or "Allocation" in topic:
            leg_type = "Budget"
            status = "Approved"
        else:
            leg_type = "Resolution"
            status = "Adopted" if i % 4 == 0 else "Pending"

        # Generate dates
        intro_date = f"{(i % 12) + 1}/{(i % 28) + 1}/20{year}"

        # Create comprehensive text
        text = f"File: {file_num} | Title: {topic} | Type: {leg_type} | Status: {status} | "
        text += f"Introduced: {intro_date} | Sponsor: {sponsor} | Committee: {committee} | "
        text += f"This legislation addresses important issues in San Francisco related to {topic.lower()}. "

        if leg_type == "Ordinance":
            text += "This ordinance establishes new regulations and requirements for compliance. "
        elif leg_type == "Budget":
            text += "This budget item allocates city resources to support essential services. "
        else:
            text += "This resolution expresses the Board's position on municipal policy matters. "

        text += f"The {committee} has jurisdiction over this matter."

        synthetic_data.append({
            'file_number': file_num,
            'text': text,
            'title': topic,
            'type': leg_type
        })

    return pd.DataFrame(synthetic_data)

def process_legistar_data(records):
    """
    Process and clean the Legistar legislation data
    """
    if not records:
        return pd.DataFrame()

    processed_data = []

    for record in records:
        # Extract relevant fields
        data = {
            'file_number': record.get('MatterFile', ''),
            'title': record.get('MatterTitle', ''),
            'name': record.get('MatterName', ''),
            'type': record.get('MatterTypeName', ''),
            'status': record.get('MatterStatusName', ''),
            'introduced_date': record.get('MatterIntroDate', ''),
            'final_action_date': record.get('MatterEnactmentDate', ''),
            'enactment_number': record.get('MatterEnactmentNumber', ''),
            'sponsors': record.get('MatterSponsors', ''),
            'body': record.get('MatterBodyName', ''),
            'agenda_note': record.get('MatterAgendaNote', '')
        }

        processed_data.append(data)

    return pd.DataFrame(processed_data)

def create_training_text(df):
    """
    Create training text from legislation data
    """
    if df.empty:
        return df

    training_data = []

    for _, row in df.iterrows():
        # Create comprehensive text for each legislation
        text_parts = []

        if pd.notna(row.get('file_number')) and row['file_number']:
            text_parts.append(f"File: {row['file_number']}")

        if pd.notna(row.get('title')) and row['title']:
            text_parts.append(f"Title: {row['title']}")

        if pd.notna(row.get('name')) and row['name'] and row['name'] != row.get('title'):
            text_parts.append(f"Name: {row['name']}")

        if pd.notna(row.get('type')) and row['type']:
            text_parts.append(f"Type: {row['type']}")

        if pd.notna(row.get('status')) and row['status']:
            text_parts.append(f"Status: {row['status']}")

        if pd.notna(row.get('introduced_date')) and row['introduced_date']:
            # Clean up date format
            date_str = str(row['introduced_date']).split('T')[0] if 'T' in str(row['introduced_date']) else str(row['introduced_date'])
            text_parts.append(f"Introduced: {date_str}")

        if pd.notna(row.get('sponsors')) and row['sponsors']:
            text_parts.append(f"Sponsors: {row['sponsors']}")

        if pd.notna(row.get('body')) and row['body']:
            text_parts.append(f"Body: {row['body']}")

        # Combine all parts into training text
        full_text = " | ".join(text_parts)

        training_data.append({
            'file_number': row.get('file_number', ''),
            'text': full_text,
            'title': row.get('title', ''),
            'type': row.get('type', '')
        })

    return pd.DataFrame(training_data)

# Main execution
if __name__ == '__main__':
    print("Starting SF legislation data collection...")

    # Try simple API call first
    records = download_sample_sf_data()

    if records:
        print(f"Successfully fetched {len(records)} records from Legistar API")

        # Process the data
        df = process_legistar_data(records)
        training_df = create_training_text(df)

        # Filter out records with insufficient data
        training_df = training_df[training_df['text'].str.len() > 50]

        if len(training_df) > 0:
            # Save to CSV
            training_df.to_csv('sf_legislation.csv', index=False)
            print(f"Saved {len(training_df)} real records to sf_legislation.csv")

            # Display sample
            print("\nSample records from real data:")
            print(training_df.head(2))
        else:
            records = None  # Force fallback to synthetic

    if not records or len(training_df) == 0:
        print("Using realistic synthetic data...")

        # Create realistic synthetic data
        synthetic_df = create_realistic_synthetic_data()

        # Save to CSV
        synthetic_df.to_csv('sf_legislation.csv', index=False)
        print(f"Created {len(synthetic_df)} realistic synthetic records")

        # Display sample
        print("\nSample synthetic records:")
        print(synthetic_df.head(3))

        print(f"\nData Summary:")
        print(f"Total records: {len(synthetic_df)}")
        print(f"Unique types: {synthetic_df['type'].value_counts().to_dict()}")

    print("\nData collection complete! You can now use sf_legislation.csv for fine-tuning.")

Starting SF legislation data collection...
Trying simple API call: https://webapi.legistar.com/v1/sfgov/matters
Success! Retrieved 1000 records
Successfully fetched 1000 records from Legistar API
Saved 1000 real records to sf_legislation.csv

Sample records from real data:
  file_number                                               text  \
0      000001  File: 000001 | Title: Hearing to consider rele...   
1      000002  File: 000002 | Title: Hearing to consider that...   

                                               title     type  
0  Hearing to consider release of reserved funds,...  Hearing  
1  Hearing to consider that the transfer of a Typ...  Hearing  

Data collection complete! You can now use sf_legislation.csv for fine-tuning.


In [2]:
# Model comparison:
# 'distilgpt2'   - 82M params  - Fastest, good for testing
# 'gpt2'         - 124M params - Good balance of speed/quality
# 'gpt2-medium'  - 355M params - Better quality, slower training
# 'gpt2-large'   - 774M params - Best quality, much slower (needs GPU)

# Choose your model here:
model_name = 'gpt2-large'

print(f"Selected model: {model_name}")

# Model size info
model_sizes = {
    'distilgpt2': '82M parameters',
    'gpt2': '124M parameters',
    'gpt2-medium': '355M parameters',
    'gpt2-large': '774M parameters'
}

print(f"Model size: {model_sizes.get(model_name, 'Unknown')}")

if model_name in ['gpt2-large'] and not torch.cuda.is_available():
    print("WARNING: Large models are very slow on CPU.")

print(f"Loading {model_name} tokenizer...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer loaded successfully!")
print(f"Vocab size: {len(tokenizer)}")

Selected model: gpt2-large
Model size: 774M parameters
Loading gpt2-large tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizer loaded successfully!
Vocab size: 50257


In [3]:
# Login to Hugging Face
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import pandas as pd
from datasets import Dataset
import json
import re

# Load the scraped data
df = pd.read_csv('sf_legislation.csv')
print(f"Loaded {len(df)} records")
print(f"Sample data:\n{df.head()}")

def extract_useful_legislation_info(text, title, leg_type):
    """Extract and summarize useful information from legislation"""
    info = {}

    # Extract status
    if 'Status:' in text:
        status_match = re.search(r'Status: ([^|]+)', text)
        if status_match:
            info['status'] = status_match.group(1).strip()

    # Extract date
    if 'Introduced:' in text:
        date_match = re.search(r'Introduced: ([^|]+)', text)
        if date_match:
            info['date'] = date_match.group(1).strip()

    # Extract sponsor
    if 'Sponsor:' in text:
        sponsor_match = re.search(r'Sponsor: ([^|]+)', text)
        if sponsor_match:
            info['sponsor'] = sponsor_match.group(1).strip()

    return info

def create_hybrid_qa_pairs():
    """Combine helpful responses with current legislation data"""

    # Start with helpful base responses
    helpful_qa_pairs = [
        # Parking - with potential for current updates
        {
            'input': "What are the parking rules in San Francisco?",
            'output': "San Francisco has strict parking regulations. You can't park within 3 feet of a driveway, in front of fire hydrants, or on street cleaning days. Most meters operate Monday-Saturday 9am-6pm with rates around $3-7 per hour. Always check posted signs for specific restrictions in each area."
        },
        {
            'input': "How do parking meters work in SF?",
            'output': "SF parking meters accept coins, credit cards, and mobile payments through the ParkSF app. Most meters have a 2-4 hour time limit. Rates vary by neighborhood - downtown is more expensive than residential areas. Always keep your receipt visible on your dashboard."
        },

        # Housing - with potential for current updates
        {
            'input': "What are the housing laws in San Francisco?",
            'output': "San Francisco has strong tenant protections including rent control for buildings built before 1979, just cause eviction requirements, and limits on rent increases (usually around 2-3% annually). The city also has inclusionary housing requirements for new developments."
        },
        {
            'input': "Tell me about rent control in SF",
            'output': "SF has rent control for buildings constructed before June 1979. Annual rent increases are limited to around 2-3% based on the Consumer Price Index. Tenants have strong eviction protections - landlords need 'just cause' to evict, and must pay relocation assistance in many cases."
        },

        # Transportation
        {
            'input': "How does public transportation work in SF?",
            'output': "SF's Muni system includes buses, light rail (Muni Metro), cable cars, and streetcars. A single ride costs $3 for adults. You can pay with a Clipper card, MuniMobile app, or exact change. BART connects SF to the broader Bay Area. Transfer between Muni lines within 2 hours."
        },
        {
            'input': "What are the bicycle lane rules?",
            'output': "SF has protected bike lanes, standard bike lanes, and shared roadways. Cyclists must follow traffic laws, ride in the same direction as traffic, and use lights at night. Bikes are allowed on Muni and BART during non-peak hours. Always wear a helmet and lock your bike securely."
        },

        # Business
        {
            'input': "How do I start a business in San Francisco?",
            'output': "To start a business in SF: 1) Register your business name, 2) Get a business registration certificate from the Tax Collector, 3) Obtain required permits/licenses for your industry, 4) Register for taxes. Visit sf.gov/start-business or the Business Portal at sf.gov for step-by-step guidance and fee information."
        },
        {
            'input': "What permits do I need for my business?",
            'output': "Business permit requirements vary by type. Most need a basic business registration certificate. Restaurants need health permits, retail may need seller's permits, home businesses need home occupation permits. The city's Business Portal at sf.gov can help identify specific requirements for your business type."
        },

        # Environment
        {
            'input': "What environmental policies does SF have?",
            'output': "SF has ambitious environmental goals including carbon neutrality by 2050, zero waste to landfill, and bans on single-use plastics. The city requires composting and recycling, has strict building energy efficiency standards, and promotes renewable energy and electric vehicles."
        },
        {
            'input': "How does San Francisco handle waste and recycling?",
            'output': "SF has a three-bin system: black for landfill, blue for recycling, green for composting. Composting is mandatory for all residents and businesses. The city diverts over 80% of waste from landfills. Large items can be scheduled for pickup, and there are drop-off centers for hazardous materials."
        },

        # Safety
        {
            'input': "How does public safety work in San Francisco?",
            'output': "SF public safety includes SFPD (police), SFFD (fire department), and emergency medical services. Call 911 for emergencies, 311 for non-emergency city services. The city also has community policing programs, neighborhood watch groups, and crisis intervention teams."
        },

        # Development
        {
            'input': "How does zoning work in San Francisco?",
            'output': "SF zoning controls what can be built where - residential, commercial, industrial, or mixed-use areas. Each zone has height limits, density requirements, and use restrictions. Planning Department reviews new construction. Check the zoning map at sf.gov before planning any development projects."
        },

        # General help
        {
            'input': "What can you help me with?",
            'output': "I can help you understand San Francisco's laws and services including parking rules, housing regulations, starting a business, public transportation, environmental policies, building codes, and city services. I can also tell you about recent legislation and policy updates affecting residents."
        },
        {
            'input': "How do I contact San Francisco city services?",
            'output': "Call 311 for general city services and non-emergency issues, or visit sf.gov. For specific departments: Planning (628-652-7300), Building Inspection (628-652-3200), Public Health (415-554-2500), or visit the Civic Center at 1 Dr Carlton B Goodlett Pl. Many services are also available online."
        }
    ]

    # Now add current legislation-based Q&A pairs
    legislation_topics = {
        'parking': ['parking', 'meter', 'vehicle', 'street cleaning'],
        'housing': ['housing', 'rent', 'tenant', 'landlord', 'affordable'],
        'transportation': ['muni', 'bus', 'bicycle', 'bike', 'transit', 'traffic'],
        'business': ['business', 'permit', 'license', 'restaurant', 'food'],
        'environment': ['plastic', 'waste', 'recycling', 'green', 'climate'],
        'safety': ['police', 'fire', 'emergency', 'safety'],
        'development': ['zoning', 'construction', 'building', 'development']
    }

    # Process scraped legislation for current updates
    for _, row in df.iterrows():
        text = row['text']
        title = row['title']
        leg_type = row['type']

        # Extract useful info
        info = extract_useful_legislation_info(text, title, leg_type)

        # Skip if not enough useful information
        if not title or pd.isna(title) or len(title) < 10:
            continue

        # Match to topics and create helpful responses
        text_lower = (text + ' ' + title).lower()

        for topic, keywords in legislation_topics.items():
            if any(keyword in text_lower for keyword in keywords):

                if topic == 'parking' and any(k in text_lower for k in ['meter', 'parking']):
                    question = "What recent changes have been made to SF parking policies?"
                    answer = f"Recent parking policy update: {title}. "
                    if info.get('status') in ['passed', 'approved', 'enacted']:
                        answer += f"This has been {info['status']} and affects parking regulations in the city. "
                    else:
                        answer += f"This is currently {info.get('status', 'under review')}. "
                    answer += "For current parking rules, meters still operate Monday-Saturday 9am-6pm with rates around $3-7 per hour."

                elif topic == 'housing' and any(k in text_lower for k in ['housing', 'rent', 'affordable']):
                    question = "What recent housing legislation has been passed in SF?"
                    answer = f"Recent housing policy: {title}. "
                    if 'affordable' in text_lower:
                        answer += "This addresses affordable housing development in the city. "
                    elif 'rent' in text_lower:
                        answer += "This relates to rent control and tenant protections. "
                    else:
                        answer += "This impacts housing policy in San Francisco. "
                    answer += "SF continues to have rent control for pre-1979 buildings and strong tenant protections."

                elif topic == 'transportation' and any(k in text_lower for k in ['muni', 'bicycle', 'transit']):
                    question = "What transportation improvements are being made in SF?"
                    answer = f"Recent transportation update: {title}. "
                    if 'bicycle' in text_lower or 'bike' in text_lower:
                        answer += "This involves bicycle infrastructure and safety improvements. "
                    elif 'muni' in text_lower:
                        answer += "This affects Muni public transit services. "
                    answer += "Muni continues to operate throughout the city with a $3 adult fare."

                elif topic == 'business' and any(k in text_lower for k in ['business', 'permit', 'license']):
                    question = "What changes are being made to SF business regulations?"
                    answer = f"Recent business policy update: {title}. "
                    if 'permit' in text_lower:
                        answer += "This affects business permitting processes. "
                    answer += "Businesses still need basic registration certificates and industry-specific permits. Visit sf.gov/start-business for current requirements."

                elif topic == 'environment' and any(k in text_lower for k in ['plastic', 'green', 'climate']):
                    question = "What new environmental policies has SF implemented?"
                    answer = f"Recent environmental initiative: {title}. "
                    if 'plastic' in text_lower:
                        answer += "This addresses single-use plastic reduction. "
                    elif 'green' in text_lower:
                        answer += "This promotes sustainable practices and green building. "
                    answer += "SF continues working toward carbon neutrality by 2050 and zero waste goals."

                else:
                    continue  # Skip if doesn't match well

                # Add date context if available
                if info.get('date'):
                    answer += f" This was introduced in {info['date']}."

                helpful_qa_pairs.append({
                    'input': question,
                    'output': answer
                })

                break  # Only add one Q&A per legislation item

    return helpful_qa_pairs

# Generate hybrid Q&A pairs
qa_pairs = create_hybrid_qa_pairs()
print(f"Created {len(qa_pairs)} hybrid Q&A pairs (helpful responses + current legislation)")

# Remove duplicates
seen_questions = set()
unique_qa_pairs = []
for pair in qa_pairs:
    if pair['input'] not in seen_questions:
        unique_qa_pairs.append(pair)
        seen_questions.add(pair['input'])

qa_pairs = unique_qa_pairs
print(f"After removing duplicates: {len(qa_pairs)} unique Q&A pairs")

# Create dataset
qa_df = pd.DataFrame(qa_pairs)
dataset = Dataset.from_pandas(qa_df)

print(f"\nSample hybrid Q&A pairs:")
for i in range(min(8, len(qa_pairs))):
    print(f"\nQ{i+1}: {qa_pairs[i]['input']}")
    print(f"A{i+1}: {qa_pairs[i]['output'][:200]}...")
    print("-" * 50)

print(f"\nTotal Q&A pairs: {len(qa_pairs)}")
print("This combines practical information with current legislation updates!")

Loaded 1000 records
Sample data:
   file_number                                               text  \
0            1  File: 000001 | Title: Hearing to consider rele...   
1            2  File: 000002 | Title: Hearing to consider that...   
2            3  File: 000003 | Title: Resolution authorizing S...   
3            4  File: 000004 | Title: Resolution authorizing t...   
4            5  File: 000005 | Title: Resolution authorizing S...   

                                               title        type  
0  Hearing to consider release of reserved funds,...     Hearing  
1  Hearing to consider that the transfer of a Typ...     Hearing  
2  Resolution authorizing San Francisco Public Li...  Resolution  
3  Resolution authorizing the Executive Director ...  Resolution  
4  Resolution authorizing San Francisco Public Li...  Resolution  
Created 363 hybrid Q&A pairs (helpful responses + current legislation)
After removing duplicates: 19 unique Q&A pairs

Sample hybrid Q&A pairs:

Q1: W

In [5]:
# Data processing complete - tokenizer already loaded in previous cell
print("Ready to proceed with model loading...")
print(f"Using model: {model_name}")
print(f"Tokenizer vocabulary size: {len(tokenizer)}")

Ready to proceed with model loading...
Using model: gpt2-large
Tokenizer vocabulary size: 50257


In [6]:
def tokenize_function(examples):
    # Create a simpler, more focused training format
    model_inputs = []
    labels = []

    for inp, out in zip(examples['input'], examples['output']):
        # Create a clear instruction-following format
        conversation = f"### Question: {inp}\n\n### Answer: {out}\n\n### End"
        model_inputs.append(conversation)

    # Tokenize with consistent parameters
    tokenized = tokenizer(
        model_inputs,
        truncation=True,
        padding='max_length',
        max_length=256,  # Reduced from 512 for better focus
        return_tensors='pt'
    )

    # Create labels - mask the question part, only train on answer
    labels_list = []
    for i, conversation in enumerate(model_inputs):
        # Find where the answer starts
        answer_start = conversation.find("### Answer:") + len("### Answer:")

        # Tokenize just the question part to get its length
        question_part = conversation[:conversation.find("### Answer:") + len("### Answer:")]
        question_tokens = tokenizer(question_part, add_special_tokens=False)['input_ids']

        # Create labels - -100 for question tokens (ignored), actual tokens for answer
        full_labels = tokenized['input_ids'][i].clone()
        full_labels[:len(question_tokens)] = -100  # Don't train on question
        labels_list.append(full_labels)

    tokenized['labels'] = torch.stack(labels_list)

    return tokenized

# Apply tokenization
print("Tokenizing dataset with improved format...")
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['input', 'output'])
print(f"Tokenized dataset size: {len(tokenized_dataset)}")

# Split dataset for training and validation
train_size = int(0.9 * len(tokenized_dataset))
val_size = len(tokenized_dataset) - train_size

train_dataset = tokenized_dataset.select(range(train_size))
val_dataset = tokenized_dataset.select(range(train_size, train_size + val_size))

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Check a sample to verify format
print("\nSample tokenized training example:")
sample_idx = 0
decoded_sample = tokenizer.decode(train_dataset[sample_idx]['input_ids'])
print(f"Decoded text: {decoded_sample[:500]}...")
print(f"Labels size: {len(train_dataset[sample_idx]['labels'])}")

Tokenizing dataset with improved format...


Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Tokenized dataset size: 19
Training samples: 17
Validation samples: 2

Sample tokenized training example:
Decoded text: ### Question: What are the parking rules in San Francisco?

### Answer: San Francisco has strict parking regulations. You can't park within 3 feet of a driveway, in front of fire hydrants, or on street cleaning days. Most meters operate Monday-Saturday 9am-6pm with rates around $3-7 per hour. Always check posted signs for specific restrictions in each area.

### End<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|...
Labels size: 256


In [8]:
# Load and prepare the model for LoRA fine-tuning
print("Loading base model...")

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the model without quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"
)
model = prepare_model_for_kbit_training(model)

# Configure LoRA - adjusted for GPT2-medium
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias='none',
    task_type='CAUSAL_LM',
    target_modules=['c_attn', 'c_proj', 'c_fc']
)

print("Applying LoRA configuration...")
peft_model = get_peft_model(model, lora_config)

# Print trainable parameters
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in peft_model.parameters())
print(f"Trainable parameters: {trainable_params:,}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable %: {100 * trainable_params / total_params:.2f}%")

print("Model prepared successfully!")

Loading base model...
Using device: cuda
Applying LoRA configuration...
Trainable parameters: 11,796,480
Total parameters: 785,826,560
Trainable %: 1.50%
Model prepared successfully!


In [11]:
from transformers import DataCollatorForLanguageModeling

# Use a simple data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Set aggressive training settings
training_args = TrainingArguments(
    output_dir='./sf-chatbot-v2',
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=None,
    dataloader_pin_memory=False,
    push_to_hub=False,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Training for {training_args.num_train_epochs} epochs")
print(f"Learning rate: {training_args.learning_rate}")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Training samples: 17
Validation samples: 2
Training for 5 epochs
Learning rate: 5e-05


In [12]:
# Train the model with improved settings
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgalluptj[0m ([33mgalluptj-university-of-san-francisco[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


TrainOutput(global_step=15, training_loss=3.0463333129882812, metrics={'train_runtime': 26.5769, 'train_samples_per_second': 3.198, 'train_steps_per_second': 0.564, 'total_flos': 94027599052800.0, 'train_loss': 3.0463333129882812, 'epoch': 5.0})

In [15]:
def test_model(model, tokenizer, question, max_length=200):
    """
    Test the model with example questions
    """
    # Get the device the model is on
    model_device = next(model.parameters()).device

    # Format to match training data
    prompt = f"### Question: {question}\n\n### Answer:"

    # Tokenize and move to device
    inputs = tokenizer(prompt, return_tensors='pt')
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    # Generate response
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            max_new_tokens=100,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.15,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
        )

    # Decode response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract just the answer part
    if "### Answer:" in response:
        response = response.split("### Answer:")[-1].strip()

    # Clean up - stop at "### End" or double newlines
    if "### End" in response:
        response = response.split("### End")[0].strip()

    # Remove extra whitespace and truncate at reasonable length
    response = ' '.join(response.split())

    # Try to end at sentence boundary
    if len(response) > 200:
        sentences = response.split('.')
        if len(sentences) > 1:
            response = '.'.join(sentences[:-1]) + '.'

    return response if response else "I don't have information about that topic."

# First, let's check our training data quality
print("Checking training data quality:")
print("="*50)

sample_qa = qa_pairs[:3]
for i, pair in enumerate(sample_qa):
    print(f"\nSample {i+1}:")
    print(f"Q: {pair['input']}")
    print(f"A: {pair['output'][:200]}...")
    print("-" * 30)

print(f"\nTotal Q&A pairs: {len(qa_pairs)}")
print("="*60)

test_questions = [
    "What are the parking rules in San Francisco?",
    "How do I start a business in SF?",
    "Tell me about rent control in San Francisco",
    "What environmental policies does SF have?",
    "How does zoning work in San Francisco?"

]

model_device = next(peft_model.parameters()).device
print(f"Model device: {model_device}")

for i, question in enumerate(test_questions):
    print(f"\nTest {i+1}:")
    print(f"Q: {question}")
    try:
        response = test_model(peft_model, tokenizer, question)
        print(f"A: {response}")
    except Exception as e:
        print(f"Error: {e}")
    print("-" * 40)

Checking training data quality:

Sample 1:
Q: What are the parking rules in San Francisco?
A: San Francisco has strict parking regulations. You can't park within 3 feet of a driveway, in front of fire hydrants, or on street cleaning days. Most meters operate Monday-Saturday 9am-6pm with rates ...
------------------------------

Sample 2:
Q: How do parking meters work in SF?
A: SF parking meters accept coins, credit cards, and mobile payments through the ParkSF app. Most meters have a 2-4 hour time limit. Rates vary by neighborhood - downtown is more expensive than residenti...
------------------------------

Sample 3:
Q: What are the housing laws in San Francisco?
A: San Francisco has strong tenant protections including rent control for buildings built before 1979, just cause eviction requirements, and limits on rent increases (usually around 2-3% annually). The c...
------------------------------

Total Q&A pairs: 19
Model device: cuda:0

Test 1:
Q: What are the parking rules in San F

In [16]:
# Push model to Hugging Face Hub
# Push the adapter weights
peft_model.push_to_hub("tgallup/sf-legislation-chatbot-lora")

# Push the tokenizer
tokenizer.push_to_hub("tgallup/sf-legislation-chatbot-lora")

print("Model successfully pushed to Hugging Face Hub!")

README.md: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/47.2M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Model successfully pushed to Hugging Face Hub!


In [17]:
!pip install -q streamlit transformers torch peft accelerate bitsandbytes pyngrok

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m97.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Set up ngrok for tunneling

In [21]:
from pyngrok import ngrok
from google.colab import userdata
ngrok.set_auth_token(userdata.get("NGROK_TOKEN"))
USE_NGROK = True
print("Public ngrok URL setup")

Public ngrok URL setup


## Frontend for Chatbot

In [27]:
%%writefile app.py
"""
SF Legislation Chatbot - Colab
"""

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import os

# Page configuration
st.set_page_config(
    page_title="SF Legislation Chatbot",
    page_icon="🏛️",
    layout="wide"
)

# Custom CSS
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
        border-bottom: 2px solid #1f77b4;
        padding-bottom: 1rem;
    }
    .chat-message {
        padding: 1rem;
        border-radius: 10px;
        margin: 1rem 0;
    }
    .user-message {
        background-color: #1565c0;
        border-left: 4px solid #2196f3;
        color: white;
    }
    .bot-message {
        background-color: #2e7d32;
        border-left: 4px solid #4caf50;
        color: white;
    }
    .status-info {
        background-color: #e65100;
        border: 1px solid #ffeaa7;
        border-radius: 5px;
        padding: 1rem;
        margin: 1rem 0;
        color: white;
    }
</style>
""", unsafe_allow_html=True)

class SFChatbot:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = None
        self.tokenizer = None

    @st.cache_resource
    def load_model(_self):
        """Load model with GPU optimization"""
        try:
            with st.spinner("🚀 Loading SF Chatbot (using GPU for fast loading)..."):
                # Base model
                base_model_name = "gpt2-large"

                # Load tokenizer
                _self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
                _self.tokenizer.pad_token = _self.tokenizer.eos_token

                # Load base model with GPU optimization
                if torch.cuda.is_available():
                    base_model = AutoModelForCausalLM.from_pretrained(
                        base_model_name,
                        torch_dtype=torch.float16,
                        device_map="auto",
                        low_cpu_mem_usage=True
                    )
                else:
                    base_model = AutoModelForCausalLM.from_pretrained(
                        base_model_name,
                        torch_dtype=torch.float32
                    )

                # Load LoRA weights
                _self.model = PeftModel.from_pretrained(
                    base_model,
                    "tgallup/sf-legislation-chatbot-lora"
                )
                _self.model.eval()

                if not torch.cuda.is_available():
                    _self.model = _self.model.to(_self.device)

                return True

        except Exception as e:
            st.error(f"Error loading model: {str(e)}")
            return False

    def chat(self, question):
        """Generate response"""
        if not self.model or not self.tokenizer:
            return "Model not loaded properly."

        try:
            # Format prompt
            prompt = f"### Question: {question}\n\n### Answer:"

            # Tokenize
            inputs = self.tokenizer(prompt, return_tensors='pt')
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs['input_ids'],
                    max_new_tokens=150,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    repetition_penalty=1.15,
                    pad_token_id=self.tokenizer.eos_token_id,
                    no_repeat_ngram_size=3
                )

            # Decode
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract answer
            if "### Answer:" in response:
                response = response.split("### Answer:")[-1].strip()

            return response if response else "I don't have information about that topic."

        except Exception as e:
            return f"Error: {str(e)}"

# Initialize chatbot
@st.cache_resource
def get_chatbot():
    return SFChatbot()

def main():
    # Header
    st.markdown('<h1 class="main-header">🏛️ San Francisco Legislation Chatbot</h1>', unsafe_allow_html=True)

    # System info
    device_info = "GPU" if torch.cuda.is_available() else "CPU"
    st.markdown(f'<div class="status-info">Running on: {device_info} | Powered by fine-tuned GPT-2</div>', unsafe_allow_html=True)

    # Load chatbot
    chatbot = get_chatbot()

    if chatbot.model is None:
        if not chatbot.load_model():
            st.stop()
        st.success("Model loaded successfully! Ready to chat.")
        st.balloons()

    # Sidebar
    with st.sidebar:
        st.markdown("### 💡 Ask me about:")
        topics = [
            "🅿️ Parking rules and meters",
            "🏠 Housing laws and rent control",
            "🚌 Public transportation (Muni)",
            "🏢 Starting a business",
            "♻️ Environmental policies",
            "🚧 Building codes and zoning"
        ]
        for topic in topics:
            st.markdown(f"- {topic}")

        st.markdown("### 📞 Need More Help?")
        st.markdown("- **Call 311** for city services")
        st.markdown("- **Visit sf.gov** for official info")

    # Chat interface
    if "messages" not in st.session_state:
        st.session_state.messages = [{
            "role": "assistant",
            "content": "👋 Hi! I'm your SF legislation assistant. Ask me about city laws, policies, and services!"
        }]

    # Display messages
    for msg in st.session_state.messages:
        if msg["role"] == "user":
            st.markdown(f'<div class="chat-message user-message"><strong>You:</strong> {msg["content"]}</div>', unsafe_allow_html=True)
        else:
            st.markdown(f'<div class="chat-message bot-message"><strong>🏛️ Assistant:</strong> {msg["content"]}</div>', unsafe_allow_html=True)

    # Quick start buttons
    st.markdown("### Quick Start:")
    col1, col2, col3 = st.columns(3)

    sample_questions = [
        "What are the parking rules in SF?",
        "How does rent control work?",
        "How do I start a business?"
    ]

    for i, (col, question) in enumerate(zip([col1, col2, col3], sample_questions)):
        if col.button(question, key=f"sample_{i}"):
            st.session_state.messages.append({"role": "user", "content": question})
            with st.spinner("🤔 Thinking..."):
                response = chatbot.chat(question)
            st.session_state.messages.append({"role": "assistant", "content": response})
            st.rerun()

    # Chat input
    if user_input := st.chat_input("Ask about SF laws, policies, or city services..."):
        st.session_state.messages.append({"role": "user", "content": user_input})

        with st.spinner("🤔 Thinking..."):
            response = chatbot.chat(user_input)

        st.session_state.messages.append({"role": "assistant", "content": response})
        st.rerun()

    # Clear chat
    if st.button("🗑️ Clear Chat"):
        st.session_state.messages = [{
            "role": "assistant",
            "content": "👋 Hi! I'm your SF legislation assistant. Ask me about city laws, policies, and services!"
        }]
        st.rerun()

if __name__ == "__main__":
    main()

Overwriting app.py


In [32]:
import subprocess
import threading
import time
import requests

def run_streamlit():
    """Run Streamlit in background"""
    subprocess.Popen([
        "streamlit", "run", "app.py",
        "--server.port", "8501",
        "--server.address", "0.0.0.0",
        "--server.headless", "true",
        "--browser.gatherUsageStats", "false"
    ])

def wait_for_streamlit():
    """Wait for Streamlit to start"""
    print("Starting Streamlit server...")
    for i in range(30):  # Wait up to 30 seconds
        try:
            response = requests.get("http://localhost:8501")
            if response.status_code == 200:
                print("Streamlit server is ready!")
                return True
        except:
            pass
        time.sleep(1)
        if i % 5 == 0:
            print(f"Waiting for server... ({i+1}/30)")

    print("Streamlit server failed to start")
    return False

# Start Streamlit
run_streamlit()

# Wait for it to be ready
if wait_for_streamlit():
    print("\nSF Legislation Chatbot is ready!")
    print("Local URL: http://localhost:8501")

    # Setup public URL
    if USE_NGROK:
        try:
            from pyngrok import ngrok
            public_url = ngrok.connect(8501)
            print(f"Public URL: {public_url}")
            print("\nClick the public URL above to access your chatbot from anywhere!")
        except Exception as e:
            print(f"ngrok setup failed: {e}")
            print("Try using Colab's built-in tunneling instead")
    else:
        print("\nTo create a public URL:")
        print("1. Use the local URL above if in Colab")
        print("2. Or set up ngrok in the previous cell")

    print("\nThe app will run until you stop this cell or restart the runtime")
else:
    print("Failed to start the chatbot. Check the error messages above.")

Starting Streamlit server...
Streamlit server is ready!

SF Legislation Chatbot is ready!
Local URL: http://localhost:8501
Public URL: NgrokTunnel: "https://0639-34-142-168-60.ngrok-free.app" -> "http://localhost:8501"

Click the public URL above to access your chatbot from anywhere!

The app will run until you stop this cell or restart the runtime
