# Bexar County PPP Loans Interactive Mapping
This notebook builds an interactive map of Paycheck Protection Program (PPP) loan concentrations in Bexar County, Texas, using a local Nominatim server for geocoding.

In [None]:
# Install required packages
!pip3 install folium geopy pandas numpy

In [None]:
# Import libraries and load data
import html
import re
import time
import warnings
from datetime import datetime

import folium
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
warnings.filterwarnings('ignore')

# Read the Bexar County PPP loan records (CSV expected in the working directory)
df = pd.read_csv("bexar_county_frequent_addresses_3plus.csv")

# Report how much was loaded and which columns are present
column_names = list(df.columns)
print(f"Loaded {len(df):,} PPP loan records")
print(f"Data columns: {column_names}")

# The approval-date span is only reported when that column exists
if 'DateApproved' in column_names:
    approval_dates = df['DateApproved']
    print(f"Date range: {approval_dates.min()} to {approval_dates.max()}")

# Print the frame summary produced by DataFrame.info()
df.info()

In [None]:
# Collapse the loan-level records into one row per borrower address
address_cols = ['BorrowerAddress', 'BorrowerCity', 'BorrowerZip', 'BorrowerState']
available_address_cols = [name for name in address_cols if name in df.columns]

# Metrics computed for every address group
agg_dict = dict(
    BorrowerName=['count'],
    InitialApprovalAmount=['sum', 'mean', 'max', 'min'],
    address_loan_count='first',  # taken from the first record of each group
)

# Forgiveness metrics are optional and depend on the column being present
if 'ForgivenessAmount' in df.columns:
    agg_dict.update(ForgivenessAmount=['sum', 'mean', 'count'])

# Group, aggregate, and turn the group keys back into ordinary columns
loans_by_address = df.groupby(available_address_cols)
df_agg = loans_by_address.agg(agg_dict).reset_index()

# The aggregation yields two-level column labels; join them as "<column>_<stat>"
# and leave the group-key columns (empty second level) untouched
df_agg.columns = [
    first if second == '' else f"{first}_{second}"
    for first, second in df_agg.columns
]

# Friendlier names for the aggregated metrics
rename_dict = {
    'BorrowerName_count': 'total_loans',
    'InitialApprovalAmount_sum': 'total_loan_amount',
    'InitialApprovalAmount_mean': 'avg_loan_amount',
    'InitialApprovalAmount_max': 'max_loan_amount',
    'InitialApprovalAmount_min': 'min_loan_amount',
}
if 'ForgivenessAmount_sum' in df_agg.columns:
    rename_dict['ForgivenessAmount_sum'] = 'total_forgiven'
    rename_dict['ForgivenessAmount_mean'] = 'avg_forgiven'
    rename_dict['ForgivenessAmount_count'] = 'loans_with_forgiveness'

df_agg = df_agg.rename(columns=rename_dict)

# Quick sanity report on the aggregated frame
loans_per_address = df_agg['total_loans']
amount_per_address = df_agg['total_loan_amount']
print(f"Aggregated to {len(df_agg):,} unique addresses")
print(f"Address loan counts range: {loans_per_address.min()} to {loans_per_address.max()}")
print(f"Total loan amounts range: ${amount_per_address.min():,.2f} to ${amount_per_address.max():,.2f}")

# Show the first rows of the aggregated data
df_agg.head()

In [None]:
# Setup local Nominatim geocoder and geocoding function
# Replace 'localhost:8080' with your actual Nominatim server address
LOCAL_NOMINATIM_URL = "http://localhost:8080"  # Adjust this to your server

# Split the URL into scheme and host so the scheme handed to geopy always
# matches the configured URL (an "https://..." URL previously still used http).
_nominatim_scheme, _scheme_separator, _nominatim_host = LOCAL_NOMINATIM_URL.partition('://')
if not _scheme_separator:
    # No scheme in the URL: treat the whole value as the host and default to http.
    _nominatim_scheme, _nominatim_host = 'http', LOCAL_NOMINATIM_URL

# Initialize geocoder with local Nominatim server
geolocator = Nominatim(
    user_agent="bexar_ppp_mapping",
    domain=_nominatim_host.rstrip('/'),  # geopy expects a bare host[:port]
    scheme=_nominatim_scheme.lower(),
)

# Ordered (compiled pattern, replacement) pairs applied by normalize_address.
# Every pattern is compiled case-insensitively, so a single entry covers
# "Ste", "STE", "ste", and so on.
_ABBREVIATION_FIXES = [
    (re.compile(pattern, flags=re.IGNORECASE), replacement)
    for pattern, replacement in (
        # Suite variations. "Ste #200" / "Suite #200" are collapsed first so the
        # generic '#' rule below cannot produce "Suite Suite 200".
        (r'\b(?:Ste|Suite)\b\.?\s*#\s*', 'Suite '),
        (r'\bSte\b\.?', 'Suite'),
        (r'\s*#\s*', ' Suite '),

        # Street type abbreviations
        (r'\bBlvd\b\.?', 'Boulevard'),
        (r'\bAve\b\.?', 'Avenue'),
        # "St" is only expanded where it ends the street part (end of string,
        # before a comma, or before a suite), so a leading "St" (Saint) is kept.
        (r'\bSt\b\.?(?=\s*(?:$|,|Suite\b))', 'Street'),
        (r'\bRd\b\.?', 'Road'),
        (r'\bDr\b\.?', 'Drive'),
        (r'\bLn\b\.?', 'Lane'),
        (r'\bCt\b\.?', 'Court'),
        (r'\bPl\b\.?', 'Place'),
        (r'\bPkwy\b\.?', 'Parkway'),
        (r'\bCir\b\.?', 'Circle'),

        # Direction abbreviations. The lookbehind also rejects an apostrophe so
        # the possessive in "Mary's" is not rewritten to "Mary'South".
        (r"(?<![\w'’])N\b\.?", 'North'),
        (r"(?<![\w'’])S\b\.?", 'South'),
        (r"(?<![\w'’])E\b\.?", 'East'),
        (r"(?<![\w'’])W\b\.?", 'West'),
        (r"(?<![\w'’])NE\b\.?", 'Northeast'),
        (r"(?<![\w'’])NW\b\.?", 'Northwest'),
        (r"(?<![\w'’])SE\b\.?", 'Southeast'),
        (r"(?<![\w'’])SW\b\.?", 'Southwest'),

        # Common abbreviations
        (r'\bBnd\b\.?', 'Bend'),
        (r'\bLk\b\.?', 'Lake'),
        (r'\bMt\b\.?', 'Mount'),
        (r'\bFt\b\.?', 'Fort'),
    )
]


def normalize_address(address_str):
    """Expand common street-address abbreviations to improve geocoding success.

    Args:
        address_str: Street address text. Missing values (``None``/NaN) are
            accepted; any other value is converted with ``str()``.

    Returns:
        str: The address with abbreviations such as "Ste", "Blvd", "Rd", "N"
        or "#" expanded and runs of whitespace collapsed to single spaces.
        An empty string is returned for missing input.
    """
    if pd.isna(address_str):
        return ""

    addr = str(address_str).strip()

    # Apply the expansions in their declared order (suite rules first).
    for pattern, replacement in _ABBREVIATION_FIXES:
        addr = pattern.sub(replacement, addr)

    # Clean up extra spaces
    return re.sub(r'\s+', ' ', addr).strip()

# Matches the start of a suite designator: "Suite", "Ste", "Ste." or "#".
_SUITE_MARKER = r'(?:\b(?:Suite|Ste)\b\.?|#)'


def _strip_pattern(pattern, text):
    """Remove ``pattern`` (case-insensitive) from ``text`` and tidy the whitespace."""
    cleaned = re.sub(pattern, '', text, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', cleaned).strip()


def get_coordinates_local(row, delay=0.1):
    """Geocode one address row, trying progressively looser query strings.

    Args:
        row: Record read with ``.get`` (e.g. a pandas Series) for the keys
            'BorrowerAddress', 'BorrowerCity' and 'BorrowerZip'.
        delay: Seconds to sleep before returning. The pause happens once per
            call, not once per query; values <= 0 disable it.

    Returns:
        tuple: ``(latitude, longitude, query)``. On success ``query`` is the
        string that produced the match. When nothing matches (or an unexpected
        error occurs) latitude and longitude are ``None`` and ``query`` is the
        first query that was attempted (or the full address when no query
        could be built).
    """
    def _text(value):
        # Missing values become '' instead of the literal string "nan".
        return '' if pd.isna(value) else str(value).strip()

    original_address = _text(row.get('BorrowerAddress', ''))
    city = _text(row.get('BorrowerCity', ''))
    raw_zip = row.get('BorrowerZip')
    zip_code = str(raw_zip).strip()[:5] if pd.notna(raw_zip) else ''

    # Strategy 1: Full normalized address with ZIP
    normalized_addr = normalize_address(original_address)
    locality = f", {city}, TX, USA"
    full_address = f"{normalized_addr}, {city}, TX"
    if len(zip_code) == 5:
        full_address += f", {zip_code}"
    full_address += ", USA"

    candidates = [
        ("Full normalized", full_address),
        ("Without ZIP", normalized_addr + locality),
        ("Without suite", _strip_pattern(_SUITE_MARKER + r'\s*\w+', normalized_addr) + locality),
        ("Street only", _strip_pattern(_SUITE_MARKER + r'.*$', normalized_addr) + locality),
        ("Original format", original_address + locality),
        ("Just street number and name", _strip_pattern(_SUITE_MARKER + r'.*$', original_address) + locality),
    ]

    # Drop queries whose street part is empty and queries that repeat an
    # earlier one, so the server is not asked the same question twice.
    strategies = []
    seen_queries = set()
    for name, query in candidates:
        if query.lstrip().startswith(',') or query in seen_queries:
            continue
        seen_queries.add(query)
        strategies.append((name, query))

    try:
        error_count = 0
        last_error = None

        for strategy_name, address in strategies:
            if not address.strip() or address.count(',') < 2:
                continue

            if strategy_name == "Full normalized":
                print(f"Geocoding: {address[:80]}{'...' if len(address) > 80 else ''}")

            try:
                location = geolocator.geocode(address, timeout=10)
            except Exception as e:
                # Best effort: remember the failure and move on to the next
                # fallback instead of aborting the whole address.
                error_count += 1
                last_error = e
                if strategy_name == "Full normalized":
                    print(f"  ⚠ Error with {strategy_name}: {e}")
                continue

            if location:
                if strategy_name != "Full normalized":
                    print(f"  ✓ Found with {strategy_name}: {location.latitude:.6f}, {location.longitude:.6f}")
                else:
                    print(f"  ✓ Found: {location.latitude:.6f}, {location.longitude:.6f}")
                return location.latitude, location.longitude, address

        print(f"  ✗ No coordinates found after trying {len(strategies)} strategies")
        if error_count:
            # Surface failures from the fallback strategies, which are otherwise silent.
            print(f"  ⚠ {error_count} attempt(s) raised errors; last error: {last_error}")
        return None, None, strategies[0][1] if strategies else full_address

    except Exception as e:
        print(f"  ✗ Critical error geocoding: {e}")
        return None, None, full_address

    finally:
        # Runs on every exit path, including the early success return.
        if delay > 0:
            time.sleep(delay)

# Confirm which server the geocoder points at before the long-running cell
print(
    f"Geocoder configured for: {LOCAL_NOMINATIM_URL}",
    "Ready to geocode addresses...",
    sep="\n",
)

In [None]:
# Geocode addresses with proper progress tracking
print(f"Starting geocoding of {len(df_agg)} addresses...")
print("This may take several minutes depending on your server performance.")

PROGRESS_EVERY = 25  # print a progress line after this many addresses

# Track progress properly
total_addresses = len(df_agg)
processed_count = 0
successful_count = 0
start_time = time.time()

# Results are collected in plain lists and attached as whole columns after the
# loop. This avoids one `.loc` write per cell and lets the coordinate columns
# be float64 (NaN for misses) instead of object columns holding None.
latitudes, longitudes, geocoded_addresses = [], [], []

for _, row in df_agg.iterrows():
    # Get coordinates using geocoding function
    lat, lon, geocoded_addr = get_coordinates_local(row, delay=0.1)

    latitudes.append(lat)
    longitudes.append(lon)
    geocoded_addresses.append(geocoded_addr)

    # Count success
    if lat is not None and lon is not None:
        successful_count += 1

    processed_count += 1

    # Periodic progress update
    if processed_count % PROGRESS_EVERY == 0:
        elapsed = time.time() - start_time
        rate = processed_count / (elapsed / 60) if elapsed > 0 else 0  # addresses per minute
        remaining_time = (total_addresses - processed_count) / rate if rate > 0 else 0
        success_rate = successful_count / processed_count * 100

        print(f"Progress: {processed_count}/{total_addresses} ({processed_count/total_addresses*100:.1f}%) - "
              f"Success: {successful_count} ({success_rate:.1f}%) - "
              f"Rate: {rate:.1f}/min - ETA: {remaining_time:.1f} min")

# Attach the results; the lists are in the same order as df_agg's rows
df_agg['Latitude'] = pd.Series(latitudes, index=df_agg.index, dtype='float64')
df_agg['Longitude'] = pd.Series(longitudes, index=df_agg.index, dtype='float64')
df_agg['geocoded_address'] = pd.Series(geocoded_addresses, index=df_agg.index, dtype='object')

# Final summary with accurate counts
final_successful = df_agg['Latitude'].notna().sum()
final_failed = len(df_agg) - final_successful

# Guard the percentages so an empty input frame does not divide by zero
success_pct = final_successful / total_addresses * 100 if total_addresses else 0.0
failed_pct = final_failed / total_addresses * 100 if total_addresses else 0.0

print(f"\n" + "="*60)
print("GEOCODING COMPLETE!")
print("="*60)
print(f"Addresses processed: {processed_count:,}")
print(f"Coordinates found during processing: {successful_count:,}")
print(f"Final successful geocodes in dataframe: {final_successful:,} ({success_pct:.1f}%)")
print(f"Final failed geocodes: {final_failed:,} ({failed_pct:.1f}%)")

if successful_count != final_successful:
    print(f"\n⚠ Warning: Mismatch between processing count ({successful_count}) and final count ({final_successful})")
    print("This indicates potential issues with coordinate assignment.")

# Save results
df_agg.to_csv("bexar_ppp_geocoded.csv", index=False)
print(f"\nGeocoded data saved to: bexar_ppp_geocoded.csv")

In [None]:
# Create interactive map with detailed markers
# (folium is already imported in the notebook's import cell, so it is not
# re-imported here.)

MAP_CENTER = [29.5187, -98.6047]  # initial [latitude, longitude] of the view
MAP_ZOOM_START = 11               # initial zoom level

# Filter for successfully geocoded addresses
df_map = df_agg.dropna(subset=['Latitude', 'Longitude']).copy()

print(f"Creating map with {len(df_map):,} geocoded addresses")

# Create base map centered on San Antonio
ppp_map = folium.Map(
    location=MAP_CENTER,
    zoom_start=MAP_ZOOM_START,
    tiles='OpenStreetMap'
)

# Calculate scaling for marker sizes based on total loan amounts
min_amount = df_map['total_loan_amount'].min()
max_amount = df_map['total_loan_amount'].max()
amount_range = max_amount - min_amount

print(f"Loan amount range: ${min_amount:,.2f} to ${max_amount:,.2f}")

# Marker colour is chosen from the number of loans at an address
def get_marker_color(loan_count):
    """Return the colour name for an address with ``loan_count`` loans.

    Tiers are checked from the highest threshold down; counts below the
    lowest threshold fall through to 'lightgreen'.
    """
    color_tiers = (
        (20, 'darkblue'),
        (10, 'blue'),
        (7, 'lightblue'),
        (5, 'green'),
    )
    for minimum_loans, color in color_tiers:
        if loan_count >= minimum_loans:
            return color
    return 'lightgreen'

def get_marker_size(total_amount, min_amt, max_amt, range_amt):
    """Map a total loan amount onto a marker radius between 4 and 12.

    The amount's position within [min_amt, min_amt + range_amt] is scaled
    linearly onto the 4-12 interval. When ``range_amt`` is zero a fixed
    radius of 8 is returned. ``max_amt`` is accepted but not used in the
    calculation.
    """
    if range_amt != 0:
        fraction = (total_amount - min_amt) / range_amt
        return 4 + (8 * fraction)
    # Zero range: no spread to scale against
    return 8

# Add scaled circle markers for each address

MAX_POPUP_BUSINESSES = 10  # cap on names listed per popup to keep popups short

# Index business names by address once, instead of scanning the whole loan
# table for every marker. The key uses every address column shared by the
# loan-level and aggregated frames, so a popup lists only the businesses of
# its own aggregated row (matching on address + city alone could merge
# same-named streets that differ by ZIP or state).
popup_key_cols = [
    col for col in ('BorrowerAddress', 'BorrowerCity', 'BorrowerZip', 'BorrowerState')
    if col in df.columns and col in df_map.columns
]
businesses_by_address = {
    # A single-column grouping yields scalar keys; normalise to tuples.
    (key if isinstance(key, tuple) else (key,)): names
    for key, names in df.groupby(popup_key_cols)['BorrowerName'].unique().items()
}

for idx, row in df_map.iterrows():
    # Calculate marker properties
    marker_size = get_marker_size(row['total_loan_amount'], min_amount, max_amount, amount_range)
    marker_color = get_marker_color(row['total_loans'])

    # Get all businesses at this address from the original detailed data
    address_key = tuple(row[col] for col in popup_key_cols)
    businesses_at_address = businesses_by_address.get(address_key, [])

    # Create business list for popup (limit to prevent overly long popups).
    # Names come straight from the CSV, so they are HTML-escaped before being
    # placed in the popup markup.
    shown_businesses = [html.escape(str(name)) for name in businesses_at_address[:MAX_POPUP_BUSINESSES]]
    business_list = "<br>".join(f"• {name}" for name in shown_businesses)
    hidden_count = len(businesses_at_address) - len(shown_businesses)
    if hidden_count > 0:
        business_list += f"<br>• ... and {hidden_count} more businesses"

    # Escape the free-text address fields for the same reason
    address_html = html.escape(str(row['BorrowerAddress']))
    city_html = html.escape(str(row['BorrowerCity']))
    state_html = html.escape(str(row['BorrowerState']))
    zip_html = html.escape(str(row.get('BorrowerZip', 'N/A')))

    # Create detailed popup content
    popup_content = f"""
    <b>{address_html}</b><br>
    Location: {city_html}, {state_html} {zip_html}<br>
    Total Loans: {row['total_loans']}<br>
    Total Amount: ${row['total_loan_amount']:,.2f}<br>
    Avg Loan: ${row['avg_loan_amount']:,.2f}<br>
    """

    # Add forgiveness info if available
    if 'total_forgiven' in row and pd.notna(row['total_forgiven']):
        forgiveness_rate = (row['total_forgiven'] / row['total_loan_amount']) * 100 if row['total_loan_amount'] > 0 else 0
        popup_content += f"Total Forgiven: ${row['total_forgiven']:,.2f}<br>Forgiveness Rate: {forgiveness_rate:.1f}%<br>"

    # Add business names
    popup_content += f"<br><b>Businesses at this address:</b><br>{business_list}"

    # Create circle marker
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=marker_size,
        color=marker_color,
        fill=True,
        fillColor=marker_color,
        fillOpacity=0.7,
        popup=folium.Popup(popup_content, max_width=400)
    ).add_to(ppp_map)

# Fixed-position title banner shown over the map
title_html = """
<div style="position: fixed; 
    top: 10px; left: 50px; width: 400px; height: 80px; 
    background-color: white; border:2px solid grey; z-index:9999; 
    font-size:16px; font-family: Arial; font-weight: bold;
    text-align: center; padding: 15px; display: flex; align-items: center; justify-content: center;">
    <h3 style="margin: 0; color: #2E86AB;">Bexar County PPP Loans - Addresses with 3+ Loans</h3>
</div>
"""

# Fixed-position legend explaining marker colours and sizes
legend_html = """
<div style="position: fixed; 
    top: 10px; right: 10px; width: 220px; height: 200px; 
    background-color: white; border:2px solid grey; z-index:9999; 
    font-size:12px; font-family: Arial;
    padding: 10px">
    <h4 style="margin-top: 0; color: #2E86AB;">PPP Loans per Address</h4>
    <p><i class="fa fa-circle" style="color:lightgreen; font-size:16px;"></i> 3-4 loans</p>
    <p><i class="fa fa-circle" style="color:green; font-size:16px;"></i> 5-6 loans</p>
    <p><i class="fa fa-circle" style="color:lightblue; font-size:16px;"></i> 7-9 loans</p>
    <p><i class="fa fa-circle" style="color:blue; font-size:16px;"></i> 10-19 loans</p>
    <p><i class="fa fa-circle" style="color:darkblue; font-size:16px;"></i> 20+ loans</p>
    <hr style="margin: 10px 0;">
    <p style="margin: 5px 0;"><strong>Marker size:</strong> Total loan amount</p>
    <p style="margin: 5px 0; font-size:10px; color:grey;">Larger dots = Higher total $ amount</p>
</div>
"""

# Attach both overlays to the map's root HTML, title first and legend second
for overlay_html in (title_html, legend_html):
    ppp_map.get_root().html.add_child(folium.Element(overlay_html))

print("Map created successfully!")

# Write the interactive map to a standalone HTML file
map_filename = "bexar_county_ppp_loans_map.html"
ppp_map.save(map_filename)

# Report what ended up on the map
print(f"Interactive map saved as: {map_filename}")
print(f"Map includes {len(df_map):,} locations with PPP loans")
mapped_loans = df_map['total_loans'].sum()
print(f"Total PPP loans mapped: {mapped_loans:,}")
mapped_amount = df_map['total_loan_amount'].sum()
print(f"Total PPP amount mapped: ${mapped_amount:,.2f}")

# Leaving the map as the cell's last expression renders it inline in Jupyter
ppp_map

In [None]:
# Summary report: headline numbers and geographic extent
banner = "=" * 60
print(banner)
print("BEXAR COUNTY PPP LOANS - MAPPING SUMMARY")
print(banner)

# Counts of aggregated vs. successfully geocoded addresses
address_total = len(df_agg)
geocoded_total = len(df_map)

print("\n📊 DATA OVERVIEW:")
print(f"   • Total unique addresses: {address_total:,}")
print(f"   • Successfully geocoded: {geocoded_total:,} ({geocoded_total/address_total*100:.1f}%)")
print(f"   • Total loans represented: {df_map['total_loans'].sum():,}")
print(f"   • Total loan amount: ${df_map['total_loan_amount'].sum():,.2f}")

# Bounding box of the geocoded points
print("\n📍 GEOGRAPHIC DISTRIBUTION:")
for axis_name in ('Latitude', 'Longitude'):
    axis_values = df_map[axis_name]
    print(f"   • {axis_name} range: {axis_values.min():.4f} to {axis_values.max():.4f}")

# How many addresses have each loan count, and how many loans that represents
print("\n🏢 LOAN CONCENTRATION:")
loan_count_dist = df_map['total_loans'].value_counts().sort_index()
for loans_at_address, address_count in loan_count_dist.items():
    loans_in_bucket = loans_at_address * address_count
    print(f"   • {loans_at_address:2d} loans per address: {address_count:3d} addresses ({loans_in_bucket:,} total loans)")


def _address_label(record):
    """Return "<address>, <city>" for a row, cut to the 50-character column width."""
    return f"{record['BorrowerAddress']}, {record['BorrowerCity']}"[:50]


# Ten largest addresses by total dollars
print("\n💰 TOP 10 ADDRESSES BY TOTAL LOAN AMOUNT:")
top_addresses = df_map.nlargest(10, 'total_loan_amount')
for rank, record in enumerate((r for _, r in top_addresses.iterrows()), start=1):
    label = _address_label(record)
    print(f"   {rank:2d}. {label:<50} ${record['total_loan_amount']:>12,.2f} ({record['total_loans']:2d} loans)")

# Ten largest addresses by number of loans
print("\n🎯 TOP 10 ADDRESSES BY LOAN COUNT:")
top_by_count = df_map.nlargest(10, 'total_loans')
for rank, record in enumerate((r for _, r in top_by_count.iterrows()), start=1):
    label = _address_label(record)
    print(f"   {rank:2d}. {label:<50} {record['total_loans']:2d} loans (${record['total_loan_amount']:,.2f})")

# List the files produced so far
print("\n💾 EXPORT OPTIONS:")
print("   • Full geocoded data: bexar_ppp_geocoded.csv")
print(f"   • Interactive map: {map_filename}")

# Addresses with ten or more loans get their own CSV, but only if any exist
export_high_concentration = df_map.loc[df_map['total_loans'] >= 10]
high_concentration_count = len(export_high_concentration)
if high_concentration_count > 0:
    export_high_concentration.to_csv("high_concentration_addresses_10plus.csv", index=False)
    print(f"   • High concentration addresses (10+ loans): high_concentration_addresses_10plus.csv ({high_concentration_count} addresses)")

# Recap of what the map offers
print("\n🗺️  Map Features:")
map_features = (
    "Interactive markers with detailed loan information",
    "Color coding by loan count per address",
    "Marker size scaled by total loan amount",
    "Individual business names in popups",
    "Clean white background with title and legend",
)
for feature in map_features:
    print(f"   • {feature}")

print(f"\nMap ready! Open '{map_filename}' in your web browser to explore the interactive map.")