In [1]:
# Cell 1: Environment Setup and Imports
import sys
import json
from pathlib import Path
import pandas as pd
from IPython.display import display

# Project root directory
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'script' else Path.cwd()
print(f"Project root: {PROJECT_ROOT}")

# Add to path
sys.path.insert(0, str(PROJECT_ROOT))

# Import refactored step modules (with function interfaces)
from deeds_pipeline.step3_scraper import process_deeds_scraping
from deeds_pipeline.step4_geolocation import process_deeds_geolocation
from deeds_pipeline.step5_integration import process_deeds_integration

print("✓ All imports successful")
print(f"✓ nest_asyncio applied for Jupyter compatibility")

Project root: /Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline
✓ All imports successful
✓ nest_asyncio applied for Jupyter compatibility


In [2]:
# Add this at the very beginning of your notebook (after the first cell)
import os
from pathlib import Path

# Set up environment variables for Google Cloud
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'script' else Path.cwd()

# Set Google credentials path
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(
    PROJECT_ROOT / "data" / "gen-lang-client-0299712015-e4edfcee773f.json"
)

# Set other Google variables
os.environ['GOOGLE_CLOUD_PROJECT'] = "gen-lang-client-0299712015"
# From your .env
os.environ['GOOGLE_API_KEY'] = "AIzaSyAecLcI_ODFaeEeJMRoGfdQ2ilVMut5INw"

print(f"✓ Environment variables set")
print(
    f"✓ GOOGLE_APPLICATION_CREDENTIALS: {os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')}")
print(
    f"✓ File exists: {os.path.exists(os.environ.get('GOOGLE_APPLICATION_CREDENTIALS'))}")

✓ Environment variables set
✓ GOOGLE_APPLICATION_CREDENTIALS: /Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/data/gen-lang-client-0299712015-e4edfcee773f.json
✓ File exists: True


# Pipeline Step 1-2: Json reformatting and ORC extraction


In [3]:
from deeds_pipeline.step1_json_reformat import run_step1

input_path = Path(
    "/Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/data/deed_reviews_northern_middlesex_20251103_110333.json")
output_path = Path(
    "/Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/output/step1_reformatted_by_deed_id.json")
# 使用默认配置运行
result = run_step1()

# 或指定自定义文件路径
result = run_step1(
    input_file=input_path,
    output_file=output_path
)

2025-11-27 14:14:28,637 - step1_json_reformat - INFO - Starting Step 1: JSON Reformat
2025-11-27 14:14:28,638 - step1_json_reformat - INFO - Input file: /Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/data/deed_reviews_northern_middlesex_20251103_110333.json
2025-11-27 14:14:28,639 - step1_json_reformat - INFO - Output file: /Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/output/step1_reformatted_by_deed_id.json
2025-11-27 14:14:28,639 - step1_json_reformat - INFO - Loading input data...
2025-11-27 14:14:28,661 - step1_json_reformat - INFO - Loaded 742 records
2025-11-27 14:14:28,662 - step1_json_reformat - INFO - Reformatting data by deed_id...
2025-11-27 14:14:28,662 - step1_json_reformat - INFO - Processing 742 deed review records
2025-11-27 14:14:28,664 - step1_json_reformat - INFO - Consolidated into 570 unique deed records
2025-11-27 14:14:28,664 - step1_json_reformat - INFO - Saving reformatted data...
2025-11-27 14:1

In [4]:
output_path = Path(
    "/Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/output/step1_reformatted_by_deed_id.json")

# read the output file from step1
with open(output_path, 'r') as f:
    step1_data = json.load(f)

# print the first 3 records
print(step1_data[list(step1_data.keys())[3]])

{'deed_id': '2382', 'reviews': {'26': {'city': None, 'deed_date': '1947-11-13', 'addresses': None, 'is_restrictive_covenant': True, 'exact_language_covenants': None, 'grantors': 'E. Gaston Campbell, Frank J. Rochette, Thomas Rochette', 'grantees': 'William C. Martin, Anna E. Martin', 'additional_locational_information': None, 'exclusion_types': None, 'county': 'Northern Middlesex', 'full_texts': None, 'book_page_urls': ['https://ma-covenants.dataplusfeminism.mit.edu/api/book_pages/1149307/show_page.jpg', 'https://ma-covenants.dataplusfeminism.mit.edu/api/book_pages/1149308/show_page.jpg']}}, 'city': None, 'deed_date': '1947-11-13', 'addresses': None, 'grantors': 'E. Gaston Campbell, Frank J. Rochette, Thomas Rochette', 'grantees': 'William C. Martin, Anna E. Martin', 'additional_locational_information': None, 'exclusion_types': None, 'county': 'Northern Middlesex', 'full_texts': None, 'book_page_urls': ['https://ma-covenants.dataplusfeminism.mit.edu/api/book_pages/1149307/show_page.jpg

## Step 2 Test: OCR Extraction (Testing with 3 Deeds)

Before running Step 2 on all 570 deeds, let's test with just 3 deeds to verify everything works correctly.

In [None]:
# Step 2: OCR Extraction
from deeds_pipeline.step2_ocr_extraction import run_step2

# 运行 Step 2
result = run_step2()

# 访问特定契约的 OCR 结果
deed_1612 = result["1612"]
for ocr_result in deed_1612["ocr_results"]:
    print(f"Text: {ocr_result['ocr_text'][:100]}")
    print(f"Covenant: {ocr_result['covenant_detection']['covenant_detected']}")
    print(f"Addresses: {ocr_result['extracted_info']['street_addresses']}")

2025-11-27 14:41:53,759 - step2_ocr_extraction - INFO - Starting Step 2: OCR and Information Extraction (file mode)
2025-11-27 14:41:53,761 - step2_ocr_extraction - INFO - Input file: /Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/output/step1_reformatted_by_deed_id.json
2025-11-27 14:41:53,762 - step2_ocr_extraction - INFO - Loading Step 1 output...
2025-11-27 14:41:53,771 - step2_ocr_extraction - INFO - Loaded 570 deed records
2025-11-27 14:41:53,772 - step2_ocr_extraction - INFO - Starting Step 2 processing for 570 deed(s)
2025-11-27 14:41:53,773 - step2_ocr_extraction - INFO - Processing deed 1612 (1/570)
2025-11-27 14:41:53,773 - step2_ocr_extraction - INFO - Processing 2 images for deed 1612
2025-11-27 14:41:53,774 - step2_ocr_extraction - INFO - Deed 1612: Processing image 1/2
2025-11-27 14:41:54,144 - step2_ocr_extraction - INFO - 
2️⃣ Performing OCR using Google Vision API
2025-11-27 14:41:55,479 - step2_ocr_extraction - INFO - ✅ OCR completed!


# Pipeline Step 3-5: Unified Deed Processing Notebook

This notebook implements a stable, function-based pipeline:
- **Step 3**: Scrape MassLand Records (using proven massland_scraper.py)
- **Step 4**: Geocode streets (with nest_asyncio for Jupyter compatibility)
- **Step 5**: Integrate and export data

## Key Features
- Function call interfaces (no global JSON files required)
- Optional JSON checkpointing between steps
- Fresh browser per deed (prevents session pollution)
- Async-compatible geolocation

## Input Format
List of deed records:
```python
[
    {"deed_id": "5767", "book": "57", "page": "21", "county": "Middlesex County", "town": "Dracut"},
    ...
]
```

In [23]:
# Cell 2: Utility Functions
def load_json(filepath):
    """Load JSON file"""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json(data, filepath):
    """Save data to JSON file"""
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

def extract_book_page_from_urls(urls):
    """Extract book/page from MassLand URLs"""
    import re
    pairs = []
    for url in urls:
        match = re.search(r'[Bb]ook=(\d+).*?[Pp]age=(\d+)', url)
        if match:
            pairs.append({"book": match.group(1), "page": match.group(2)})
    return pairs

print("✓ Utility functions defined")

✓ Utility functions defined


## Input Data: Choose Your Source

You can either:
1. **Option A**: Load from Step 2 output JSON file
2. **Option B**: Manually define test records

In [24]:
# Cell 3A: OPTION A - Load from Step 2 Output

# Load Step 2 output
step2_file = PROJECT_ROOT / "output" / "step2_ocr_extracted.json"
step2_data = load_json(step2_file)

# Convert to list format
input_records = []
for deed_id, deed_record in step2_data.items():
    # Extract book/page from OCR results
    ocr_results = deed_record.get("ocr_results", [])
    
    # Collect all book/page pairs from OCR extracted info
    books = set()
    pages = set()
    
    for ocr_result in ocr_results:
        extracted_info = ocr_result.get("extracted_info", {})
        
        # Get plan_book (can be list or single value)
        plan_book = extracted_info.get("plan_book")
        if plan_book:
            if isinstance(plan_book, list):
                books.update(plan_book)
            else:
                books.add(plan_book)
        
        # Get plan_pages (can be list or single value)
        plan_pages = extracted_info.get("plan_pages")
        if plan_pages:
            if isinstance(plan_pages, list):
                pages.update(plan_pages)
            else:
                pages.add(plan_pages)
    
    # Create records for each book/page combination
    if books and pages:
        for book in books:
            for page in pages:
                input_records.append({
                    "deed_id": deed_id,
                    "book": book,
                    "page": page,
                    "county": deed_record.get("county", ""),
                    "town": deed_record.get("city", "")  # Use 'city' field as town
                })
    else:
        # No book/page found in OCR, skip this deed
        print(f"⚠ Warning: Deed {deed_id} has no plan_book/plan_pages in OCR results")

print(f"\n✓ Loaded {len(step2_data)} deeds from Step 2")
print(f"✓ Converted to {len(input_records)} book/page records for scraping")
print(f"\nSample records:")
for rec in input_records[:3]:
    print(f"  - Deed {rec['deed_id']}: Book {rec['book']}, Page {rec['page']}, Town: {rec['town']}")


✓ Loaded 570 deeds from Step 2
✓ Converted to 0 book/page records for scraping

Sample records:


In [None]:
# Cell 3B: OPTION B - Manual Input (Alternative)

# Uncomment and run this cell instead of 3A for manual testing
# input_records = [
#     {"deed_id": "5767", "book": "57", "page": "21", "county": "Middlesex County", "town": "Dracut"},
#     {"deed_id": "6188", "book": "61", "page": "88", "county": "Middlesex County", "town": "Dracut"}
# ]

# print("✓ Using manual input")
# print(f"✓ Total records: {len(input_records)}")
# for rec in input_records:
#     print(f"  - Deed {rec['deed_id']}: Book {rec['book']}, Page {rec['page']}")

✓ Using manual input
✓ Total records: 2
  - Deed 5767: Book 57, Page 21
  - Deed 6188: Book 61, Page 88


## Step 3: Web Scraping

Scrape MassLand Records using the proven massland_scraper.py.
- Fresh browser created for each deed (prevents session pollution)
- Automatic retry and error handling
- Progress logging

In [16]:
# Cell 4: Run Step 3 - Web Scraping

print("="*80)
print("STEP 3: SCRAPING MASSLAND RECORDS")
print("="*80)
print(f"\nInput: {len(input_records)} records")
print("Note: Browser will open for each deed (fresh session prevents errors)\n")

# Run Step 3 using function interface
step3_results = process_deeds_scraping(
    deed_records=input_records,
    headless=False  # Set to True to hide browser
)

# Optional: Save checkpoint
step3_checkpoint_file = PROJECT_ROOT / "output" / "notebook_step3_checkpoint.json"
save_json({r["deed_id"]: r for r in step3_results}, step3_checkpoint_file)

# Statistics
total_scraped = sum(1 for r in step3_results if r.get("step3_completed"))
total_streets = sum(len(r.get("extracted_streets", [])) for r in step3_results)

print(f"\n{'='*80}")
print(f"STEP 3 COMPLETED")
print(f"{'='*80}")
print(f"Checkpoint saved: {step3_checkpoint_file}")
print(f"Deeds processed: {total_scraped}/{len(step3_results)}")
print(f"Total unique streets: {total_streets}")
print(f"\nSample result:")
if step3_results:
    sample = step3_results[0]
    print(f"  Deed {sample['deed_id']}: {len(sample.get('extracted_streets', []))} streets found")
    print(f"  Streets: {', '.join(sample.get('extracted_streets', [])[:5])}")

2025-11-24 14:27:32,680 - step3_scraper - INFO - Starting Step 3 processing for 2 deed(s)
2025-11-24 14:27:32,682 - step3_scraper - INFO - [1/2] Processing deed 5767
2025-11-24 14:27:32,684 - step3_scraper - INFO - Deed 5767: Loaded from cache
2025-11-24 14:27:32,684 - step3_scraper - INFO - [2/2] Processing deed 6188
2025-11-24 14:27:32,685 - step3_scraper - INFO - Deed 6188: Loaded from cache
2025-11-24 14:27:32,686 - step3_scraper - INFO - Step 3 completed for 2 deed(s)


STEP 3: SCRAPING MASSLAND RECORDS

Input: 2 records
Note: Browser will open for each deed (fresh session prevents errors)


STEP 3 COMPLETED
Checkpoint saved: /Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/output/notebook_step3_checkpoint.json
Deeds processed: 2/2
Total unique streets: 0

Sample result:
  Deed 5767: 0 streets found
  Streets: 


## Step 4: Geolocation

Geocode streets using OpenStreetMap Nominatim API.
- Async HTTP requests for better performance
- Clustering validation to identify primary location
- Works seamlessly in Jupyter (nest_asyncio applied)

In [12]:
# Cell 5: Run Step 4 - Geolocation

print("="*80)
print("STEP 4: GEOCODING STREETS")
print("="*80)
print(f"\nInput: {len(step3_results)} deed records")
print("Using OpenStreetMap Nominatim API...\n")

# Run Step 4 using function interface (nest_asyncio handles event loop)
step4_results = process_deeds_geolocation(deed_records=step3_results)

# Optional: Save checkpoint
step4_checkpoint_file = PROJECT_ROOT / "output" / "notebook_step4_checkpoint.json"
save_json({r["deed_id"]: r for r in step4_results}, step4_checkpoint_file)

# Statistics
geocoded_count = sum(
    1 for r in step4_results 
    if r.get("geolocation") and r["geolocation"].get("cluster_center_lat")
)
avg_confidence = sum(
    r.get("geolocation", {}).get("confidence", 0) 
    for r in step4_results
    if r.get("geolocation")
) / max(geocoded_count, 1)

print(f"\n{'='*80}")
print(f"STEP 4 COMPLETED")
print(f"{'='*80}")
print(f"Checkpoint saved: {step4_checkpoint_file}")
print(f"Deeds geocoded: {geocoded_count}/{len(step4_results)}")
print(f"Average confidence: {avg_confidence:.1%}")
print(f"\nSample geolocation:")
for r in step4_results:
    if r.get("geolocation") and r["geolocation"].get("cluster_center_lat"):
        geo = r["geolocation"]
        print(f"  Deed {r['deed_id']}: ({geo['cluster_center_lat']:.4f}, {geo['cluster_center_lon']:.4f})")
        print(f"    Town: {geo.get('primary_town')}, Confidence: {geo.get('confidence', 0):.1%}")
        break

2025-11-24 14:26:12,919 - step4_geolocation - INFO - Event loop detected (likely Jupyter). Using nest_asyncio.
2025-11-24 14:26:12,921 - step4_geolocation - INFO - Starting Step 4 processing for 2 deed(s)
2025-11-24 14:26:12,922 - step4_geolocation - INFO - StreetClusteringValidator initialized (in-module implementation)
2025-11-24 14:26:12,923 - step4_geolocation - INFO - [1/2] Processing deed 5767
2025-11-24 14:26:12,924 - step4_geolocation - INFO - [2/2] Processing deed 6188
2025-11-24 14:26:12,925 - step4_geolocation - INFO - Step 4 completed for 2 deed(s)


STEP 4: GEOCODING STREETS

Input: 2 deed records
Using OpenStreetMap Nominatim API...


STEP 4 COMPLETED
Checkpoint saved: /Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/output/notebook_step4_checkpoint.json
Deeds geocoded: 0/2
Average confidence: 0.0%

Sample geolocation:


## Step 5: Data Integration

Integrate all processing results and export:
- Flatten nested data for CSV export
- Generate quality report
- Export to both JSON and CSV formats

In [13]:
# Cell 6: Run Step 5 - Integration and Export

print("="*80)
print("STEP 5: DATA INTEGRATION AND EXPORT")
print("="*80)
print(f"\nInput: {len(step4_results)} deed records")

# Run Step 5 using function interface
final_records, final_df, quality_report = process_deeds_integration(deed_records=step4_results)

# Save outputs
step5_json_file = PROJECT_ROOT / "output" / "notebook_final_output.json"
step5_csv_file = PROJECT_ROOT / "output" / "notebook_final_output.csv"

# Save JSON (full nested structure)
final_json_output = {
    "metadata": {
        "total_deeds": len(final_records),
        "quality_report": quality_report
    },
    "deeds": {r["deed_id"]: r for r in final_records}
}
save_json(final_json_output, step5_json_file)

# Save CSV (flattened structure)
final_df.to_csv(step5_csv_file, index=False, encoding='utf-8')

print(f"\n{'='*80}")
print(f"STEP 5 COMPLETED")
print(f"{'='*80}")
print(f"JSON output: {step5_json_file}")
print(f"CSV output: {step5_csv_file}")
print(f"\nQuality Report:")
for key, value in quality_report.items():
    print(f"  {key}: {value}")

2025-11-24 14:26:36,116 - step5_integration - INFO - Starting Step 5 processing for 2 deed(s)
2025-11-24 14:26:36,119 - step5_integration - INFO - Flattening deed records...
2025-11-24 14:26:36,120 - step5_integration - INFO - Flattened 2 records
2025-11-24 14:26:36,120 - step5_integration - INFO - Generating quality report...
2025-11-24 14:26:36,121 - step5_integration - INFO - DATA QUALITY REPORT
2025-11-24 14:26:36,123 - step5_integration - INFO - total_deeds: 2
2025-11-24 14:26:36,123 - step5_integration - INFO - step2_ocr_completed: 0
2025-11-24 14:26:36,123 - step5_integration - INFO - step2_completion_rate: 0.0%
2025-11-24 14:26:36,124 - step5_integration - INFO - step3_scraper_completed: 2
2025-11-24 14:26:36,124 - step5_integration - INFO - step3_completion_rate: 100.0%
2025-11-24 14:26:36,125 - step5_integration - INFO - step4_geolocation_completed: 2
2025-11-24 14:26:36,126 - step5_integration - INFO - step4_completion_rate: 100.0%
2025-11-24 14:26:36,127 - step5_integration

STEP 5: DATA INTEGRATION AND EXPORT

Input: 2 deed records

STEP 5 COMPLETED
JSON output: /Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/output/notebook_final_output.json
CSV output: /Users/mingyang/Desktop/MCP/2025 Fall/11.458 Crowd Sourced City/deeds_pipeline/output/notebook_final_output.csv

Quality Report:
  total_deeds: 2
  step2_ocr_completed: 0
  step2_completion_rate: 0.0%
  step3_scraper_completed: 2
  step3_completion_rate: 100.0%
  step4_geolocation_completed: 2
  step4_completion_rate: 100.0%
  original_covenant_count: 0
  ocr_detected_covenant_count: 0
  geocoded_count: 0
  geocoded_rate: 0.0%
  with_streets_count: 0
  with_streets_rate: 0.0%


## Results Visualization

View the final processed data and create visualizations.

In [14]:
# Cell 7: View and Analyze Results

print("="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)
print(f"\nDataFrame shape: {final_df.shape}")
print(f"Columns: {list(final_df.columns)}")
print(f"\nFirst few rows:")
display(final_df.head())

# Geocoding statistics
geocoded_count = final_df['geo_latitude'].notna().sum()
avg_confidence = final_df['geo_confidence'].mean()

print(f"\nGeocoding Statistics:")
print(f"  Total deeds: {len(final_df)}")
print(f"  Successfully geocoded: {geocoded_count} ({geocoded_count/len(final_df)*100:.1f}%)")
if pd.notna(avg_confidence):
    print(f"  Average confidence: {avg_confidence:.1%}")

# Street extraction statistics
with_streets = final_df['scraped_streets'].notna().sum()
total_streets = final_df['scraped_street_count'].sum()
print(f"\nStreet Extraction:")
print(f"  Deeds with streets: {with_streets} ({with_streets/len(final_df)*100:.1f}%)")
print(f"  Total streets found: {int(total_streets)}")
print(f"  Average streets per deed: {total_streets/max(with_streets,1):.1f}")

FINAL RESULTS SUMMARY

DataFrame shape: (2, 30)
Columns: ['deed_id', 'review_ids', 'city', 'deed_date', 'address', 'is_restrictive_covenant', 'county', 'grantors', 'grantees', 'covenant_text', 'ocr_covenant_detected', 'ocr_covenant_text', 'plan_books', 'plan_pages', 'lot_numbers', 'extracted_streets', 'extracted_towns', 'scraped_streets', 'scraped_street_count', 'geo_latitude', 'geo_longitude', 'geo_address', 'geo_town', 'geo_cluster_radius_miles', 'geo_confidence', 'geo_validated_street_count', 'geo_invalid_street_count', 'step2_completed', 'step3_completed', 'step4_completed']

First few rows:


Unnamed: 0,deed_id,review_ids,city,deed_date,address,is_restrictive_covenant,county,grantors,grantees,covenant_text,...,geo_longitude,geo_address,geo_town,geo_cluster_radius_miles,geo_confidence,geo_validated_street_count,geo_invalid_street_count,step2_completed,step3_completed,step4_completed
0,5767,,,,,,Middlesex County,,,,...,,,,,,0,0,False,True,True
1,6188,,,,,,Middlesex County,,,,...,,,,,,0,0,False,True,True



Geocoding Statistics:
  Total deeds: 2
  Successfully geocoded: 0 (0.0%)

Street Extraction:
  Deeds with streets: 0 (0.0%)
  Total streets found: 0
  Average streets per deed: 0.0


In [15]:
# Cell 8: Create Interactive Map (Optional)

try:
    import folium
    from folium import plugins
    
    # Create base map
    center_lat = final_df['geo_latitude'].mean()
    center_lon = final_df['geo_longitude'].mean()
    
    if pd.notna(center_lat) and pd.notna(center_lon):
        m = folium.Map(location=[center_lat, center_lon], zoom_start=11)
        
        # Add markers for each geocoded deed
        for idx, row in final_df.iterrows():
            if pd.notna(row['geo_latitude']):
                confidence = row.get('geo_confidence', 0)
                color = 'green' if confidence > 0.7 else 'orange' if confidence > 0.4 else 'red'
                
                popup_html = f"""
                <b>Deed {row['deed_id']}</b><br>
                Town: {row.get('geo_town', 'N/A')}<br>
                Confidence: {confidence:.1%}<br>
                Streets: {row.get('scraped_street_count', 0)}<br>
                Radius: {row.get('geo_cluster_radius_miles', 0):.2f} mi
                """
                
                folium.CircleMarker(
                    location=[row['geo_latitude'], row['geo_longitude']],
                    radius=8,
                    popup=folium.Popup(popup_html, max_width=300),
                    color=color,
                    fill=True,
                    fillColor=color,
                    fillOpacity=0.6
                ).add_to(m)
        
        # Add legend
        legend_html = '''
        <div style="position: fixed; 
                    bottom: 50px; right: 50px; width: 200px; height: 120px; 
                    background-color: white; z-index:9999; font-size:14px;
                    border:2px solid grey; border-radius: 5px; padding: 10px">
        <b>Confidence Legend</b><br>
        <i class="fa fa-circle" style="color:green"></i> High (&gt;70%)<br>
        <i class="fa fa-circle" style="color:orange"></i> Medium (40-70%)<br>
        <i class="fa fa-circle" style="color:red"></i> Low (&lt;40%)
        </div>
        '''
        m.get_root().html.add_child(folium.Element(legend_html))
        
        # Save map
        map_file = PROJECT_ROOT / "output" / "deeds_map.html"
        m.save(str(map_file))
        print(f"✓ Interactive map saved to {map_file}")
        print(f"  Open in browser to view")
        
        # Display in notebook
        display(m)
    else:
        print("⚠ No geocoded data available for mapping")
        
except ImportError:
    print("⚠ folium not installed. Install with: pip install folium")
    print("  Skipping map visualization")
except Exception as e:
    print(f"⚠ Error creating map: {e}")

⚠ No geocoded data available for mapping


## Pipeline Complete!

### Summary
- **Step 3**: Web scraping completed with fresh browser instances
- **Step 4**: Geocoding completed with async compatibility
- **Step 5**: Data integrated and exported

### Outputs
- `notebook_final_output.json` - Full nested data structure
- `notebook_final_output.csv` - Flattened data for analysis
- `deeds_map.html` - Interactive map visualization
- Checkpoint files at each step for debugging

### Next Steps
1. Review quality report and identify any failed records
2. Analyze geocoding confidence scores
3. Use CSV for further analysis or database import