In [None]:
!pip3 install pandas
!pip3 install chardet
!pip3 install bs4

In [None]:
!date

In [None]:
import requests
from bs4 import BeautifulSoup
import os
import logging
import pandas as pd
import chardet
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# pandas is installed by the notebook's first cell; an install command has no
# place after the import block, so none is issued here.
# ======================
# 📜 Enhanced Logging Setup
# ======================
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Handlers stay attached to the root logger for the whole kernel session.
# Detach and close any that are already there so that re-running this cell
# does not emit every record once per previous run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter(
    '%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Every record goes both to an append-mode log file and to the cell output.
file_handler = logging.FileHandler('sba_download.log', mode='a')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# ======================
# 🌐 Setup Requests Session
# ======================
# One shared session; HTTPS requests are retried up to 3 times with backoff
# when the server answers with one of the listed status codes.
session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

# ======================
# 🗂️ Setup Directory
# ======================
# Download target directory; exist_ok keeps the cell re-runnable.
os.makedirs("sba_csv", exist_ok=True)

# ======================
# 🕵️ Encoding Detector (Optional)
# ======================
def detect_encoding(file_path, sample_size=10000):
    """Guess a file's text encoding from its leading bytes.

    Args:
        file_path: Path of the file to inspect; it is opened in binary mode.
        sample_size: Maximum number of bytes read from the start of the file.

    Returns:
        tuple: The ``'encoding'`` and ``'confidence'`` entries of the result
        returned by ``chardet.detect`` for the sampled bytes.
    """
    with open(file_path, 'rb') as handle:
        head = handle.read(sample_size)
    guess = chardet.detect(head)
    return guess['encoding'], guess['confidence']

# ======================
# 🔍 Step 1: Scrape SBA Resource Links
# ======================
# Local import: this cell is the only place that needs urljoin.
from urllib.parse import urljoin

parent_url = "https://data.sba.gov/dataset/ppp-foia"

# Without the parent page there is nothing to scrape, so a failure here stops
# the cell via SystemExit after logging the cause.
try:
    logger.info(f"Accessing parent page: {parent_url}")
    response = session.get(parent_url, timeout=20)
    response.raise_for_status()
except Exception as e:
    logger.error(f"Parent page access failed: {e}")
    raise SystemExit

soup = BeautifulSoup(response.text, "html.parser")

# urljoin resolves site-relative hrefs against the parent URL and leaves
# absolute hrefs untouched (plain string concatenation would corrupt those).
# dict.fromkeys drops repeated anchors for the same resource page while
# keeping first-seen order, so each page is fetched only once later on.
resource_links = list(dict.fromkeys(
    urljoin(parent_url, a['href'])
    for a in soup.select('a[href*="/dataset/ppp-foia/resource/"]')
))
logger.info(f"Found {len(resource_links)} resource pages")

# ======================
# 🔗 Step 2: Extract CSV Download Links
# ======================
# Parallel lists: csv_links[k] is the download URL of csv_filenames[k].
csv_links, csv_filenames = [], []

for url in resource_links:
    try:
        logger.info(f"Processing resource page: {url}")
        res = session.get(url, timeout=15)
        res.raise_for_status()
        link_tag = BeautifulSoup(res.text, "html.parser").select_one('a.resource-url-analytics')

        # Guard clause: a page without an anchor whose href ends in ".csv" is
        # only reported, never recorded.
        if not (link_tag and link_tag['href'].endswith('.csv')):
            logger.warning(f"No CSV link found on page: {url}")
            continue

        link = link_tag['href']
        filename = link.rsplit("/", 1)[-1]  # last path segment of the URL
        csv_links.append(link)
        csv_filenames.append(filename)
        logger.info(f"CSV found: {filename}")
    except Exception as e:
        # Best effort: one failing resource page must not stop the scan.
        logger.warning(f"Failed to extract link from {url}: {e}")

logger.info(f"Total CSVs identified: {len(csv_links)}")

# ======================
# 💾 Step 3: Download CSVs with Cache Check
# ======================
MAX_AGE = 5 * 86400  # 5 days

for i, (link, filename) in enumerate(zip(csv_links, csv_filenames)):
    file_path = os.path.join("sba_csv", filename)
    logger.info(f"[{i+1}/{len(csv_links)}] Checking: {filename}")

    # Cache check: a file younger than MAX_AGE and larger than 1 KiB is reused.
    if os.path.exists(file_path):
        age = time.time() - os.path.getmtime(file_path)
        size = os.path.getsize(file_path)
        if age < MAX_AGE and size > 1024:
            logger.info(f"Using cached file: {filename} (Age: {int(age)}s, Size: {size}B)")
            continue
        else:
            logger.info(f"Refreshing stale or small file: {filename} (Age: {int(age)}s, Size: {size}B)")

    # Stream into a sibling ".part" file and move it over the real name only
    # after the whole body has been written. Writing to file_path directly
    # would leave a truncated file behind on failure, and because that file is
    # fresh and usually larger than 1 KiB the cache check above would accept
    # it on the next run. It would also destroy the previous good copy.
    partial_path = file_path + ".part"
    try:
        logger.info(f"Starting download: {link}")
        with session.get(link, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=4096):  # small chunk for low memory impact
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, file_path)
        logger.info(f"Downloaded successfully: {filename}")
    except Exception as e:
        # Best effort: log, discard the incomplete file, go on with the next one.
        logger.error(f"Download failed for {filename}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)

logger.info("All downloads complete ✅")


In [None]:
!date

In [None]:
! ls -alt ./sba_csv/

In [None]:
import pandas as pd
import os
import logging
import traceback
import gc

# ======================
# 📜 Logging Setup
# ======================
# Root logger at INFO; existing handlers are dropped first so that re-running
# the cell does not duplicate output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers.clear()

formatter = logging.Formatter(
    fmt='%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Same format for the append-mode log file and for the cell output.
file_handler = logging.FileHandler('sba_download.log', mode='a')
stream_handler = logging.StreamHandler()
for handler in (file_handler, stream_handler):
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# ======================
# 📂 Directory Setup
# ======================
chunk_dir = "sba_csv/chunks"
os.makedirs(chunk_dir, exist_ok=True)
problematic_files = []

# ======================
# 🔁 Chunked Write
# ======================
# Each downloaded public_up_to_150k CSV is read once and re-written as
# chunk_NN.csv, where NN is the file's position in the sorted, de-duplicated
# list of downloaded file names.
logger.info("🔁 Starting chunked write for public_up_to_150k CSVs...")
seen = set()

for i, filename in enumerate(sorted(set(csv_filenames))):
    if filename.startswith("public_up_to_150k") and filename.endswith(".csv") and filename not in seen:
        seen.add(filename)
        file_path = os.path.join("sba_csv", filename)
        logger.info(f"[{i+1}] Reading: {filename}")
        print(f"Processing file {i+1}: {filename}")

        # Bind the name before the try block: if read_csv raises, `df` would
        # otherwise be unbound (or already deleted by the previous iteration)
        # and the cleanup below would abort the whole loop with a NameError.
        df = None
        try:
            df = pd.read_csv(file_path, encoding='latin-1', on_bad_lines='skip', low_memory=False)
            chunk_path = os.path.join(chunk_dir, f"chunk_{i:02d}.csv")
            df.to_csv(chunk_path, index=False)
            logger.info(f"✅ Chunk written: {chunk_path}")
        except Exception as e:
            # Best effort: remember the file and continue with the next one.
            logger.warning(f"⚠️ Failed to process {filename}: {e}")
            traceback.print_exc()
            problematic_files.append(filename)
        finally:
            # Release the frame before the next file is loaded.
            del df
            gc.collect()

# ======================
# 🗂️ Save Problematic Filenames
# ======================
if problematic_files:
    bad_path = os.path.join("sba_csv", "problematic_files.txt")
    with open(bad_path, "w") as f:
        for file in problematic_files:
            f.write(f"{file}\n")
    logger.info(f"📄 Problematic file list written to: {bad_path}")

# ======================
# 🧠 Dtype Harmonization
# ======================
# Every *.csv currently present in the chunk directory, in name order.
chunk_files = sorted(
    os.path.join(chunk_dir, f) for f in os.listdir(chunk_dir) if f.endswith('.csv')
)
logger.info("📊 Inspecting chunk column types for harmonization...")
column_types = {}

# First pass: for each column, collect the set of dtypes pandas infers across
# all chunk files.
for f in chunk_files:
    try:
        df = pd.read_csv(f, encoding='latin-1', low_memory=False)
        for col, inferred in df.dtypes.items():
            column_types.setdefault(col, set()).add(inferred)
        del df
        gc.collect()
    except Exception as e:
        logger.warning(f"Failed type scan on {f}: {e}")
        traceback.print_exc()

# Build harmonized dtype map: a column with one inferred dtype keeps it, any
# column with several falls back to "object".
harmonized_types = {}
for col, types in column_types.items():
    if len(types) != 1:
        harmonized_types[col] = "object"
        logger.info(f"Column '{col}' has mixed types: {types}. Forcing to object.")
        continue
    (sole_dtype,) = types
    harmonized_types[col] = sole_dtype

# ======================
# 📥 Harmonized Read & Merge
# ======================
logger.info("📥 Loading chunks with harmonized dtypes...")
harmonized_chunks = []

# Second pass: re-read every chunk with the agreed dtype per column so the
# frames can be concatenated without per-file dtype differences.
for f in chunk_files:
    try:
        df = pd.read_csv(f, encoding='latin-1', dtype=harmonized_types, low_memory=False)
        harmonized_chunks.append(df)
        logger.info(f"✔️ Harmonized load: {os.path.basename(f)}")
    except Exception as e:
        # Best effort: a chunk that cannot be loaded is reported and left out.
        logger.warning(f"Failed harmonized load on {f}: {e}")
        traceback.print_exc()

# pd.concat rejects an empty list with the unhelpful "No objects to
# concatenate"; raise the same exception type with a message that points at
# the real cause instead.
if not harmonized_chunks:
    logger.error("No chunk file could be loaded; nothing to merge.")
    raise ValueError("No harmonized chunks were loaded; check the chunk directory and the warnings above")

logger.info("🔗 Concatenating all harmonized chunks...")
combined_df = pd.concat(harmonized_chunks, ignore_index=True)

# ======================
# 💾 Save Final Output
# ======================
# Write the merged frame without its index column.
combined_path = os.path.join("sba_csv", "combined_public_up_to_150k.csv")
combined_df.to_csv(combined_path, index=False)
logger.info(f"✅ Final merged CSV written to: {combined_path}")
print("Merge complete with harmonized types.")

# ======================
# 📋 Schema Snapshot
# ======================
# One row per column of the merged frame, holding its final dtype.
schema_path = os.path.join("sba_csv", "schema_summary.csv")
schema_frame = combined_df.dtypes.to_frame(name="dtype")
schema_frame.to_csv(schema_path)
logger.info(f"📄 Column type summary saved to: {schema_path}")

# ======================
# 🔍 Type-Forced Column Sample Audit
# ======================
# For each column that was forced to object, log up to ten distinct non-null
# values so the cause of the dtype disagreement can be inspected.
for col, types in column_types.items():
    if len(types) <= 1 or harmonized_types.get(col) != "object":
        continue
    sample_vals = combined_df[col].dropna().astype(str).unique()[:10]
    logger.info(f"🔍 Column '{col}' forced to object due to mixed types {types}. Sample values: {sample_vals}")


In [None]:
!date

In [None]:
! ls -alt ./sba_csv/

In [None]:
import pandas as pd
import logging

# Setup logging
# logging.basicConfig() does nothing when the root logger already has
# handlers, and the earlier cells of this notebook attach two of them. Remove
# (and close) those first so that records from this cell really reach
# csv_diagnostics.log in the format configured below.
for stale_handler in list(logging.getLogger().handlers):
    logging.getLogger().removeHandler(stale_handler)
    stale_handler.close()

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.FileHandler("csv_diagnostics.log"), logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

def load_csv_robust(filepath, chunksize=100_000, dtype_map=None, encoding="latin-1"):
    """Yield a CSV file chunk by chunk, logging progress along the way.

    Args:
        filepath: Path of the CSV file to read.
        chunksize: Number of rows per yielded DataFrame.
        dtype_map: Optional column-to-dtype mapping passed to ``pd.read_csv``.
        encoding: Text encoding passed to ``pd.read_csv``.

    Yields:
        pandas.DataFrame: Consecutive chunks of the file; malformed lines are
        skipped by the parser.

    Any exception raised while opening or reading the file is logged as an
    error and ends the iteration early instead of propagating to the caller.
    """
    logger.info(f"Starting CSV load: {filepath}")
    logger.info(f"Using encoding: {encoding}, chunksize: {chunksize}")

    read_options = dict(
        chunksize=chunksize,
        dtype=dtype_map,
        encoding=encoding,
        on_bad_lines='skip',
        low_memory=False,
    )
    try:
        chunk_number = 0
        for chunk in pd.read_csv(filepath, **read_options):
            chunk_number += 1
            logger.info(f"Loaded chunk {chunk_number} with shape {chunk.shape}")
            yield chunk
        logger.info("Finished loading all chunks.")
    except Exception as e:
        logger.error(f"Failed to load CSV: {e}")

def filter_texas_rows(chunk):
    """Return a copy of the rows whose ``BorrowerZip`` text starts with ``'78'``.

    The zip column is converted to ``str`` before the prefix test, so numeric
    zip values are matched on their decimal text.

    NOTE(review): only the ``78`` prefix is kept, which is narrower than the
    function name suggests - confirm that this is the intended region.
    """
    zip_text = chunk['BorrowerZip'].astype(str)
    keep = zip_text.str.startswith('78')
    return chunk[keep].copy()

def run_diagnostics(df_tx):
    """Log basic data-quality figures for the filtered frame.

    Logged, in this order: null count per column, null percentage per column
    (two decimals), the ten most frequent 3-character ``BorrowerZip``
    prefixes, the number of rows whose zip text does not start with ``'78'``
    and, when a ``NAICSCode`` column exists, its ten most frequent values.

    Returns:
        None
    """
    # Null values
    missing = df_tx.isnull()
    null_counts = missing.sum()
    null_ratio = (missing.mean() * 100).round(2)
    logger.info(f"Null counts:\n{null_counts.to_string()}")
    logger.info(f"Null percentage:\n{null_ratio.to_string()}")

    # Zip code sanity
    zip_text = df_tx['BorrowerZip'].astype(str)
    zip_prefix_counts = zip_text.str[:3].value_counts()
    logger.info(f"Top zip prefixes:\n{zip_prefix_counts.head(10).to_string()}")

    invalid_zips = df_tx[~zip_text.str.startswith('78')]
    logger.info(f"Non-Texas zip rows (should be zero): {len(invalid_zips)}")

    # NAICS distribution (optional column)
    if 'NAICSCode' not in df_tx.columns:
        return
    naics_counts = df_tx['NAICSCode'].value_counts().head(10)
    logger.info(f"Top NAICS codes:\n{naics_counts.to_string()}")

# Define dtypes for optimization
# InitialApprovalAmount is read as float64: float32 has a 24-bit significand,
# so amounts of 131,072 and above would be rounded to multiples of 1/64 and
# lose their exact cents (and sums would drift accordingly).
types = {
    'InitialApprovalAmount': 'float64',
    'BorrowerZip': 'category',
    'NAICSCode': 'category'
}

# Load and process chunks: only the zip-filtered rows of each chunk are kept,
# so the full file is never held in memory.
df_tx_list = []
for chunk in load_csv_robust("./sba_csv/combined_public_up_to_150k.csv", dtype_map=types):
    filtered = filter_texas_rows(chunk)
    df_tx_list.append(filtered)
    logger.info(f"Filtered Texas rows in chunk: {filtered.shape[0]}")

# load_csv_robust logs read errors and simply stops yielding, so an unreadable
# or missing file leaves the list empty. pd.concat would then fail with the
# opaque "No objects to concatenate"; raise the same exception type with a
# message that names the cause.
if not df_tx_list:
    raise ValueError("No chunks were loaded from the combined CSV; see the log for the read error")

# Merge all filtered rows
df_tx = pd.concat(df_tx_list, ignore_index=True)
logger.info(f"Final Texas row count: {df_tx.shape[0]}")

# Run data diagnostics
run_diagnostics(df_tx)


In [None]:
df_tx.info()

In [None]:
!date

In [None]:
# Occurrences of each distinct BorrowerAddress value (missing values are not counted).
address_counts = df_tx['BorrowerAddress'].value_counts()

# Addresses that occur more than 15 times.
frequent_addresses = address_counts.loc[lambda counts: counts > 15].index

# Rows of the filtered frame located at one of those addresses.
is_frequent = df_tx['BorrowerAddress'].isin(frequent_addresses)
df_frequent_addresses = df_tx.loc[is_frequent]

# Display results
print(f"Found {len(frequent_addresses)} addresses appearing >15 times.")
df_frequent_addresses.head()

In [None]:
# na=False: a row with a missing address would otherwise put NaN into the mask
# and make the boolean indexing raise. regex=False: the address is matched as
# literal text.
df_tx[df_tx['BorrowerAddress'].str.contains('9900 Spectrum Dr', na=False, regex=False)]

In [None]:
dir()

In [None]:
df.info()

In [None]:
df_tx.info()

In [None]:
df.InitialApprovalAmount.max()

In [None]:
df.BorrowerState.unique()

In [None]:
df.BorrowerState

In [None]:
df.info()

In [None]:
dftest1 = pd.read_csv("./sba_csv/public_150k_plus_240930.csv", encoding='ISO-8859-1')

In [None]:
dftest1.info()

In [None]:
df_test2 = pd.read_csv('./sba_csv/public_150k_plus_240930.csv',  encoding='ISO-8859-1', low_memory=False)
print("Full row count:", df_test2.shape[0])


In [None]:
df_test2.ForgivenessAmount.max()

In [None]:
df_test2[df_test2.ForgivenessAmount > 10000000][['BorrowerCity','BorrowerName','ForgivenessAmount']]

In [None]:
df_test2.BorrowerState.unique()

In [None]:
df.columns

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import gc

def find_duplicate_addresses_chunked(filepath, min_occurrences=2, chunksize=100000):
    """Count addresses across a large CSV and return the frequently repeated ones.

    The file is streamed in chunks so it never has to fit in memory. Addresses
    are compared after stripping surrounding whitespace and upper-casing;
    empty or missing addresses are ignored.

    Args:
        filepath: CSV file containing a ``BorrowerAddress`` column.
        min_occurrences: Minimum number of occurrences for an address to be
            included in the result.
        chunksize: Number of rows read per chunk.

    Returns:
        dict | None: Mapping of normalized address to its total count (``int``)
        for addresses seen at least ``min_occurrences`` times, or ``None``
        when the file could not be processed (the error is printed).
    """
    print(f"Scanning for addresses appearing ≥{min_occurrences} times...")

    # Step 1: Count all addresses across chunks
    address_counts = defaultdict(int)
    total_rows = 0

    try:
        reader = pd.read_csv(
            filepath,
            chunksize=chunksize,
            encoding='latin-1',
            # Only the address is used below; requesting further columns would
            # cost memory and make the scan fail on files that lack them.
            usecols=['BorrowerAddress'],
            # Force text so a chunk holding only numeric-looking addresses
            # still supports the .str operations.
            dtype={'BorrowerAddress': str},
            on_bad_lines='skip'
        )

        for i, chunk in enumerate(reader):
            # Clean and standardize addresses
            addresses = chunk['BorrowerAddress'].fillna('').str.strip().str.upper()

            # Count addresses in this chunk
            for addr, count in addresses.value_counts().items():
                if addr:  # Skip empty addresses
                    address_counts[addr] += int(count)

            total_rows += len(chunk)
            if i % 10 == 0:
                print(f"Processed {i+1} chunks, {total_rows:,} total rows")

            # Clean up memory
            del chunk, addresses
            gc.collect()

    except Exception as e:
        print(f"Error processing chunks: {e}")
        return None

    # Step 2: Filter for addresses with multiple occurrences
    duplicate_addresses = {
        addr: count for addr, count in address_counts.items()
        if count >= min_occurrences
    }

    print(f"\nFound {len(duplicate_addresses):,} addresses appearing ≥{min_occurrences} times")
    print(f"Total unique addresses processed: {len(address_counts):,}")

    return duplicate_addresses

def get_records_for_address(filepath, target_address, chunksize=100000):
    """Collect every row located at one address by streaming the CSV in chunks.

    Both the target and the file's ``BorrowerAddress`` values are compared
    after stripping surrounding whitespace and upper-casing.

    Args:
        filepath: CSV file containing a ``BorrowerAddress`` column.
        target_address: Address to look for.
        chunksize: Number of rows read per chunk.

    Returns:
        pandas.DataFrame: All matching rows with a fresh 0..n-1 index, or an
        empty DataFrame when nothing matched or the file could not be read
        (the error is printed).
    """
    print(f"Searching for all records at: {target_address}")
    target_address = target_address.strip().upper()
    matching_records = []

    try:
        reader = pd.read_csv(
            filepath,
            chunksize=chunksize,
            encoding='latin-1',
            on_bad_lines='skip',
        )

        for chunk_number, chunk in enumerate(reader, start=1):
            # Temporary helper column holding the normalized address.
            tagged = chunk.assign(
                BorrowerAddress_clean=chunk['BorrowerAddress'].fillna('').str.strip().str.upper()
            )

            matches = tagged[tagged['BorrowerAddress_clean'] == target_address]
            if len(matches) > 0:
                matching_records.append(matches.drop(columns='BorrowerAddress_clean'))
                print(f"Found {len(matches)} records in chunk {chunk_number}")

            # Release the chunk before the next one is read.
            del chunk, tagged
            gc.collect()

    except Exception as e:
        print(f"Error searching for address: {e}")
        return pd.DataFrame()

    if not matching_records:
        print("No records found for this address")
        return pd.DataFrame()

    result = pd.concat(matching_records, ignore_index=True)
    print(f"\nTotal records found: {len(result)}")
    return result

def analyze_top_duplicate_addresses(filepath, top_n=20, min_occurrences=10):
    """Print a ranking of the most frequently repeated addresses.

    Args:
        filepath: CSV file handed to ``find_duplicate_addresses_chunked``.
        top_n: Number of ranked addresses to print.
        min_occurrences: Minimum count for an address to be considered.

    Returns:
        dict | None: Every qualifying address mapped to its count, ordered
        from most to least frequent, or ``None`` when the scan produced
        nothing (failure or no qualifying address).
    """
    print("=== DUPLICATE ADDRESS ANALYSIS ===\n")

    duplicates = find_duplicate_addresses_chunked(filepath, min_occurrences)
    if not duplicates:
        return None

    # Most frequent first; addresses with equal counts keep their original order.
    ranked_addresses = sorted(duplicates, key=duplicates.get, reverse=True)
    sorted_duplicates = [(address, duplicates[address]) for address in ranked_addresses]

    print(f"\nTOP {top_n} MOST FREQUENT ADDRESSES:")
    print("=" * 60)

    i = 0
    for address, count in sorted_duplicates[:top_n]:
        i += 1
        print(f"{i:2d}. {address[:50]:<50} ({count:,} occurrences)")

    return dict(sorted_duplicates)

# Example usage functions
def quick_address_search(filepath, search_term):
    """Count the distinct addresses that contain a given piece of text.

    The term is matched literally and case-insensitively (both sides are
    upper-cased), so characters such as ``.``, ``#``, ``(`` or ``+`` in the
    term are treated as plain text rather than regular-expression syntax.

    Args:
        filepath: CSV file containing a ``BorrowerAddress`` column.
        search_term: Text that must occur somewhere in the address.

    Returns:
        dict: Matching normalized address mapped to its count (``int``), most
        frequent first; an empty dict when the file could not be read (the
        error is printed) or nothing matched.
    """
    search_term = search_term.upper()
    matching_addresses = {}

    print(f"Searching for addresses containing: '{search_term}'")

    try:
        reader = pd.read_csv(
            filepath,
            chunksize=50000,
            encoding='latin-1',
            usecols=['BorrowerAddress'],
            # Force text so numeric-looking addresses still support .str.
            dtype={'BorrowerAddress': str},
            on_bad_lines='skip'
        )

        for chunk in reader:
            addresses = chunk['BorrowerAddress'].fillna('').str.strip().str.upper()

            # regex=False: a plain substring test, as the docstring promises.
            matches = addresses[addresses.str.contains(search_term, na=False, regex=False)]

            if len(matches) > 0:
                for addr, count in matches.value_counts().items():
                    matching_addresses[addr] = matching_addresses.get(addr, 0) + int(count)

            del chunk, addresses, matches
            gc.collect()

    except Exception as e:
        print(f"Error in search: {e}")
        return {}

    # Sort results, most frequent first
    sorted_matches = sorted(matching_addresses.items(), key=lambda x: x[1], reverse=True)

    print(f"\nFound {len(sorted_matches)} unique addresses containing '{search_term}':")
    for addr, count in sorted_matches[:10]:  # Show top 10
        print(f"  {addr[:60]:<60} ({count:,})")

    return dict(sorted_matches)

# Main execution example
if __name__ == "__main__":
    # Input: the merged "up to 150k" CSV inside the sba_csv directory.
    filepath = "./sba_csv/combined_public_up_to_150k.csv"

    # Rank addresses seen at least 15 times and print the 25 most frequent.
    duplicates = analyze_top_duplicate_addresses(
        filepath,
        top_n=25,
        min_occurrences=15,
    )

    # Optional follow-ups, left disabled:
    #   records = get_records_for_address(filepath, "9900 Spectrum Dr")
    #   if not records.empty:
    #       print(records[['BorrowerName', 'BorrowerCity', 'InitialApprovalAmount']].head())
    #   search_results = quick_address_search(filepath, "SPECTRUM")

In [None]:
import pandas as pd
import gc
import logging

def find_woodmont_entries(filepath, chunksize=100000, save_results=True):
    """Collect all rows whose address is exactly "1445 WOODMONT NW".

    The CSV is streamed in chunks. Addresses are compared after stripping,
    upper-casing and collapsing runs of whitespace into one space. Addresses
    that merely match ``1445.*WOODMONT`` are reported in the log as
    variations but are not part of the returned rows.

    Args:
        filepath: CSV file containing a ``BorrowerAddress`` column.
        chunksize: Number of rows read per chunk.
        save_results: When true and rows were found, also write them to
            ``woodmont_1445_results.csv`` in the working directory.

    Returns:
        pandas.DataFrame: The exact matches with a fresh index, or an empty
        DataFrame when nothing matched or reading failed (the error is logged).
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
    logger = logging.getLogger()

    target_address = "1445 WOODMONT NW"  # Standardized format
    matching_records = []
    total_matches = 0
    total_rows_processed = 0
    chunk_count = 0

    logger.info(f"Searching for entries at: {target_address}")
    logger.info(f"Processing file: {filepath}")

    try:
        reader = pd.read_csv(
            filepath,
            chunksize=chunksize,
            encoding='latin-1',
            on_bad_lines='skip',
            low_memory=False,
        )

        for chunk_count, chunk in enumerate(reader, start=1):
            total_rows_processed += len(chunk)

            # Normalized address kept in a temporary helper column.
            cleaned = (
                chunk['BorrowerAddress']
                .fillna('')
                .str.strip()
                .str.upper()
                .str.replace(r'\s+', ' ', regex=True)  # Normalize whitespace
            )
            tagged = chunk.assign(BorrowerAddress_clean=cleaned)
            clean_col = tagged['BorrowerAddress_clean']

            exact_matches = tagged[clean_col == target_address]
            # Looser pattern, used only to report spelling variations.
            partial_matches = tagged[clean_col.str.contains('1445.*WOODMONT', na=False, regex=True)]

            if len(exact_matches) > 0:
                matching_records.append(exact_matches.drop(columns='BorrowerAddress_clean'))
                total_matches += len(exact_matches)
                logger.info(f"Chunk {chunk_count}: Found {len(exact_matches)} exact matches")

            additional = len(partial_matches) - len(exact_matches)
            if additional > 0:
                logger.info(f"Chunk {chunk_count}: Found {additional} additional partial matches")

                # Distinct partial-match spellings that are not the exact target.
                is_other_spelling = ~partial_matches['BorrowerAddress_clean'].isin(
                    exact_matches['BorrowerAddress_clean']
                )
                variations = partial_matches.loc[is_other_spelling, 'BorrowerAddress_clean'].unique()

                for variation in variations[:3]:  # Show first 3 variations
                    logger.info(f"  Variation found: {variation}")

            # Progress update every 50 chunks
            if chunk_count % 50 == 0:
                logger.info(f"Processed {chunk_count} chunks, {total_rows_processed:,} total rows")

            # Clean up memory
            del chunk, tagged
            gc.collect()

    except Exception as e:
        logger.error(f"Error processing chunks: {e}")
        return pd.DataFrame()

    if not matching_records:
        logger.info("No exact matches found for 1445 Woodmont NW")
        return pd.DataFrame()

    # Combine all matching records
    result = pd.concat(matching_records, ignore_index=True)
    logger.info(f"\n=== SEARCH COMPLETE ===")
    logger.info(f"Total chunks processed: {chunk_count}")
    logger.info(f"Total rows scanned: {total_rows_processed:,}")
    logger.info(f"Total exact matches found: {len(result)}")

    if save_results and len(result) > 0:
        output_file = "woodmont_1445_results.csv"
        result.to_csv(output_file, index=False)
        logger.info(f"Results saved to: {output_file}")

    return result

def analyze_woodmont_results(df):
    """Print a summary of the rows found for 1445 Woodmont NW.

    Each section is printed only when its column(s) exist in ``df``: loan
    amounts, forgiveness, business names (first ten distinct names with their
    row counts), approval dates and distinct city/state pairs.

    Args:
        df: DataFrame of matching rows.

    Returns:
        The same ``df`` object, or ``None`` when ``df`` is empty.
    """
    if df.empty:
        print("No data to analyze")
        return

    print(f"\n=== ANALYSIS: 1445 WOODMONT NW ===")
    print(f"Total entries found: {len(df)}")

    available = df.columns

    # Basic statistics
    if 'InitialApprovalAmount' in available:
        amounts = df['InitialApprovalAmount']
        print(f"\nLoan Amounts:")
        print(f"  Total approved: ${amounts.sum():,.2f}")
        print(f"  Average loan: ${amounts.mean():,.2f}")
        print(f"  Median loan: ${amounts.median():,.2f}")
        print(f"  Range: ${amounts.min():,.2f} - ${amounts.max():,.2f}")

    # Forgiveness statistics (rows without a forgiveness value are ignored)
    if 'ForgivenessAmount' in available:
        forgiven = df['ForgivenessAmount'].dropna()
        if len(forgiven) > 0:
            print(f"\nForgiveness:")
            print(f"  Loans with forgiveness data: {len(forgiven)}")
            print(f"  Total forgiven: ${forgiven.sum():,.2f}")
            print(f"  Average forgiveness: ${forgiven.mean():,.2f}")

    # Business information
    if 'BorrowerName' in available:
        borrower_names = df['BorrowerName']
        print(f"\nBusiness Names:")
        for name in borrower_names.unique()[:10]:  # Show first 10 unique names
            count = (borrower_names == name).sum()
            print(f"  {name} ({count} loan{'s' if count > 1 else ''})")

    # Dates (values that cannot be parsed become NaT)
    if 'DateApproved' in available:
        print(f"\nApproval Dates:")
        dates = pd.to_datetime(df['DateApproved'], errors='coerce')
        print(f"  Date range: {dates.min()} to {dates.max()}")
        if len(dates.mode()) > 0:
            most_common_year = dates.dt.year.mode().iloc[0]
        else:
            most_common_year = 'N/A'
        print(f"  Most common year: {most_common_year}")

    # Location details
    if 'BorrowerCity' in available and 'BorrowerState' in available:
        locations = df[['BorrowerCity', 'BorrowerState']].drop_duplicates()
        print(f"\nLocations:")
        for _, row in locations.iterrows():
            print(f"  {row['BorrowerCity']}, {row['BorrowerState']}")

    return df

# Execute the search
if __name__ == "__main__":
    filepath = "./sba_csv/combined_public_up_to_150k.csv"

    print("Starting search for 1445 Woodmont NW...")
    results = find_woodmont_entries(filepath)

    if results.empty:
        # Nothing found: suggest alternative spellings to try.
        print("\nNo records found for 1445 Woodmont NW")
        print("You might want to try variations like:")
        for hint in ("  - 1445 WOODMONT", "  - WOODMONT NW", "  - Different spacing/punctuation"):
            print(hint)
    else:
        # Summary first, then up to ten sample rows restricted to the
        # display columns that are present in the result.
        analyze_woodmont_results(results)

        print(f"\n=== SAMPLE RECORDS ===")
        display_columns = ['BorrowerName', 'BorrowerCity', 'BorrowerState',
                           'InitialApprovalAmount', 'DateApproved', 'ForgivenessAmount']
        available_columns = [col for col in display_columns if col in results.columns]
        print(results[available_columns].head(10).to_string())

In [None]:
import pandas as pd
import gc
import re
from collections import defaultdict

def find_all_woodmont_variations(filepath, chunksize=100000):
    """Collect every row whose address is a spelling of 1445 Woodmont (Ln) NW.

    The CSV is streamed in chunks. Addresses are stripped, upper-cased and
    whitespace-normalized, then tested against the pattern
    ``1445 WOODMONT NW`` / ``1445 WOODMONT LN NW`` (anything may follow, e.g.
    a suite number).

    Args:
        filepath: CSV file containing a ``BorrowerAddress`` column.
        chunksize: Number of rows read per chunk.

    Returns:
        tuple: ``(rows, variations)`` where ``rows`` is a DataFrame of all
        matching rows with a fresh index and ``variations`` maps each
        normalized address spelling to its count (``int``), most frequent
        first. ``(empty DataFrame, {})`` when nothing matched or reading
        failed (the error is printed).
    """
    print("Searching for all 1445 Woodmont variations...")

    # Non-capturing group: str.contains only tests for a match, and pandas
    # emits a "has match groups" UserWarning for patterns with capturing groups.
    woodmont_pattern = r'1445\s+WOODMONT\s+(?:LN\s+NW|NW)'

    all_matches = []
    address_variations = defaultdict(int)
    total_matches = 0
    chunk_count = 0

    try:
        reader = pd.read_csv(
            filepath,
            chunksize=chunksize,
            encoding='latin-1',
            on_bad_lines='skip',
            low_memory=False
        )

        for chunk in reader:
            chunk_count += 1

            # Clean addresses
            chunk['addr_clean'] = (
                chunk['BorrowerAddress']
                .fillna('')
                .str.strip()
                .str.upper()
                .str.replace(r'\s+', ' ', regex=True)
            )

            # Find all Woodmont variations using regex
            woodmont_matches = chunk[
                chunk['addr_clean'].str.contains(woodmont_pattern, na=False, regex=True)
            ].copy()

            if len(woodmont_matches) > 0:
                all_matches.append(woodmont_matches.drop('addr_clean', axis=1))
                total_matches += len(woodmont_matches)

                # Count each spelling in one pass; sort=False keeps the
                # first-seen order of the spellings within the chunk.
                per_spelling = woodmont_matches.groupby('addr_clean', sort=False).size()
                for addr, count in per_spelling.items():
                    address_variations[addr] += int(count)

                print(f"Chunk {chunk_count}: Found {len(woodmont_matches)} Woodmont records")

            # Progress update
            if chunk_count % 25 == 0:
                print(f"Processed {chunk_count} chunks, {total_matches} total matches so far")

            del chunk, woodmont_matches
            gc.collect()

    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame(), {}

    if not all_matches:
        print("No Woodmont variations found")
        return pd.DataFrame(), {}

    # Combine results
    result = pd.concat(all_matches, ignore_index=True)
    print(f"\n=== SEARCH COMPLETE ===")
    print(f"Total Woodmont records found: {len(result)}")

    # Show address variations
    print(f"\nAddress Variations Found:")
    sorted_variations = sorted(address_variations.items(), key=lambda x: x[1], reverse=True)
    for addr, count in sorted_variations[:15]:  # Top 15 variations
        print(f"  {addr:<40} ({count:,} records)")

    return result, dict(sorted_variations)

def analyze_woodmont_complex(df, variations_dict):
    """Print a summary of all loans found at the 1445 Woodmont addresses.

    Sections are printed only when their column exists in ``df``: loan
    amounts, forgiveness, borrower names, suite numbers (parsed from the keys
    of ``variations_dict``) and approval dates.

    Args:
        df: DataFrame of matching rows.
        variations_dict: Mapping of normalized address spelling to count.

    Returns:
        The same ``df`` object, or ``None`` when ``df`` is empty.
    """
    if df.empty:
        print("No data to analyze")
        return

    print(f"\n=== 1445 WOODMONT BUSINESS CENTER ANALYSIS ===")
    print(f"Total businesses/loans: {len(df)}")
    print(f"Unique address variations: {len(variations_dict)}")

    # Financial analysis
    if 'InitialApprovalAmount' in df.columns:
        total_approved = df['InitialApprovalAmount'].sum()
        print(f"\nFinancial Impact:")
        print(f"  Total PPP loans approved: ${total_approved:,.2f}")
        print(f"  Average loan size: ${df['InitialApprovalAmount'].mean():,.2f}")
        print(f"  Largest loan: ${df['InitialApprovalAmount'].max():,.2f}")
        print(f"  Smallest loan: ${df['InitialApprovalAmount'].min():,.2f}")

    # Forgiveness analysis: the "rate" is the share of rows that carry a
    # forgiveness amount at all.
    if 'ForgivenessAmount' in df.columns:
        forgiven_df = df.dropna(subset=['ForgivenessAmount'])
        if len(forgiven_df) > 0:
            total_forgiven = forgiven_df['ForgivenessAmount'].sum()
            forgiveness_rate = len(forgiven_df) / len(df) * 100
            print(f"\nForgiveness:")
            print(f"  Total forgiven: ${total_forgiven:,.2f}")
            print(f"  Forgiveness rate: {forgiveness_rate:.1f}% ({len(forgiven_df)}/{len(df)} loans)")

    # Business diversity
    if 'BorrowerName' in df.columns:
        unique_businesses = df['BorrowerName'].nunique()
        print(f"\nBusiness Diversity:")
        print(f"  Unique business names: {unique_businesses}")
        # nunique() ignores missing names, so it is 0 when every name is
        # null; skip the average instead of dividing by zero.
        if unique_businesses > 0:
            print(f"  Average loans per business: {len(df)/unique_businesses:.1f}")

        # Show businesses with multiple loans
        business_counts = df['BorrowerName'].value_counts()
        multiple_loans = business_counts[business_counts > 1]
        if len(multiple_loans) > 0:
            print(f"  Businesses with multiple loans: {len(multiple_loans)}")
            print(f"  Top repeat borrowers:")
            for business, count in multiple_loans.head().items():
                # str(): names are not guaranteed to be text.
                print(f"    {str(business)[:50]:<50} ({count} loans)")

    # Suite/Unit analysis
    suite_addresses = [addr for addr in variations_dict.keys() if '#' in addr or 'STE' in addr]
    if suite_addresses:
        print(f"\nSuite/Unit Distribution:")
        print(f"  Addresses with suite numbers: {len(suite_addresses)}")

        # Extract the digits that follow "#" or "STE "
        suite_numbers = []
        for addr in suite_addresses:
            for hash_num, ste_num in re.findall(r'#\s*(\d+)|STE\s+(\d+)', addr):
                suite_num = hash_num or ste_num
                if suite_num:
                    suite_numbers.append(int(suite_num))

        if suite_numbers:
            print(f"  Suite number range: {min(suite_numbers)} - {max(suite_numbers)}")
            print(f"  Total suites identified: {len(set(suite_numbers))}")

    # Timeline analysis
    if 'DateApproved' in df.columns:
        # Unparseable values become NaT and are dropped. With no valid date
        # left, min()/max() would be NaT and idxmax() would raise on the empty
        # monthly counts, so that case is reported instead.
        dates = pd.to_datetime(df['DateApproved'], errors='coerce').dropna()
        print(f"\nTimeline:")
        if dates.empty:
            print("  No valid approval dates")
        else:
            print(f"  Date range: {dates.min().date()} to {dates.max().date()}")

            # Monthly distribution
            monthly = dates.dt.to_period('M').value_counts().sort_index()
            print(f"  Peak month: {monthly.idxmax()} ({monthly.max()} loans)")

    return df

# Execute the comprehensive search
if __name__ == "__main__":
    filepath = "./sba_csv/combined_public_up_to_150k.csv"

    # Rows plus a spelling -> count mapping for every matched address form.
    results, variations = find_all_woodmont_variations(filepath)

    if results.empty:
        print("No Woodmont records found")
    else:
        # Persist the matches before printing anything about them.
        results.to_csv("all_woodmont_1445_variations.csv", index=False)
        print(f"Results saved to: all_woodmont_1445_variations.csv")

        analyze_woodmont_complex(results, variations)

        # Up to three sample rows for each of the first 55 distinct raw
        # address values, limited to the display columns that exist.
        print(f"\n=== SAMPLE RECORDS ===")
        display_cols = ['BorrowerName', 'BorrowerAddress', 'InitialApprovalAmount',
                        'DateApproved', 'ForgivenessAmount']
        available_cols = [col for col in display_cols if col in results.columns]

        unique_addresses = results['BorrowerAddress'].unique()[:55]
        for addr in unique_addresses:
            at_address = results['BorrowerAddress'] == addr
            addr_samples = results[at_address]
            print(f"\nSample from {addr}:")
            print(addr_samples[available_cols].head(3).to_string(index=False))

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import gc

def scan_frequent_addresses(filepath, min_occurrences=30, chunksize=100000):
    """
    Count how often each normalised BorrowerAddress occurs in a CSV file.

    The file is streamed in chunks so it never has to fit in memory. Addresses
    are stripped, upper-cased and have runs of whitespace collapsed before
    counting; missing values and addresses of three characters or fewer are
    ignored.

    Args:
        filepath: Path to a latin-1 encoded CSV with a 'BorrowerAddress' column.
        min_occurrences: Minimum count an address needs in order to be reported.
        chunksize: Number of rows read per chunk.

    Returns:
        dict mapping normalised address -> count, ordered by count (descending).
        An empty dict is returned, after printing the error, if reading or
        processing the file fails.
    """
    print(f"🔍 Scanning for addresses appearing ≥{min_occurrences} times...")
    print(f"📁 Processing file: {filepath}")
    print(f"📦 Chunk size: {chunksize:,} rows")

    # Running totals across all chunks
    address_counts = defaultdict(int)
    total_rows_processed = 0
    chunk_count = 0

    try:
        # The context manager closes the underlying file handle even when a
        # chunk fails part-way through the scan.
        with pd.read_csv(
            filepath,
            chunksize=chunksize,
            encoding='latin-1',
            usecols=['BorrowerAddress'],        # the only column this scan reads
            dtype={'BorrowerAddress': str},     # keep numeric-looking addresses as text
            on_bad_lines='skip',
            low_memory=False
        ) as reader:
            for chunk in reader:
                chunk_count += 1
                total_rows_processed += len(chunk)

                # Clean and standardize addresses
                addr_clean = (
                    chunk['BorrowerAddress']
                    .fillna('')
                    .str.strip()
                    .str.upper()
                    .str.replace(r'\s+', ' ', regex=True)  # Normalize whitespace
                )

                # Merge this chunk's tallies into the running totals
                chunk_counts = addr_clean.value_counts()
                for address, count in chunk_counts.items():
                    if len(address) > 3:  # Skip empty/very short addresses
                        address_counts[address] += count

                # Progress reporting
                if chunk_count % 25 == 0:
                    print(f"   📊 Processed {chunk_count} chunks ({total_rows_processed:,} total rows)")

                # Memory cleanup
                del chunk, addr_clean, chunk_counts
                gc.collect()

    except Exception as e:
        # Best-effort scan: report the problem and hand back an empty result
        print(f"❌ Error during processing: {e}")
        return {}

    # Keep only frequent addresses, most frequent first (ties keep insertion order)
    sorted_addresses = sorted(
        (
            (addr, count) for addr, count in address_counts.items()
            if count >= min_occurrences
        ),
        key=lambda item: item[1],
        reverse=True,
    )

    print(f"\n✅ SCAN COMPLETE")
    print(f"📈 Total rows processed: {total_rows_processed:,}")
    print(f"🏢 Addresses with ≥{min_occurrences} occurrences: {len(sorted_addresses):,}")
    print(f"📝 Total unique addresses scanned: {len(address_counts):,}")

    return dict(sorted_addresses)

# Execute the scan
filepath = "./sba_csv/combined_public_up_to_150k.csv"
# The threshold lives in a module-level name because the report text below
# (including the "nothing found" branch) refers to it outside the function.
min_occurrences = 30
frequent_addresses = scan_frequent_addresses(filepath, min_occurrences=min_occurrences)

if frequent_addresses:
    print(f"\n🏆 TOP 25 MOST FREQUENT ADDRESSES (≥{min_occurrences} occurrences):")
    print("=" * 80)

    for i, (address, count) in enumerate(list(frequent_addresses.items())[:25], 1):
        # Truncate very long addresses for display
        display_addr = address[:60] + "..." if len(address) > 60 else address
        print(f"{i:2d}. {display_addr:<63} ({count:,} businesses)")

    # Summary statistics
    counts = list(frequent_addresses.values())
    print(f"\n📊 SUMMARY STATISTICS:")
    print(f"   • Highest frequency: {max(counts):,} businesses at one address")
    print(f"   • Average frequency: {np.mean(counts):,.1f} businesses per address")
    print(f"   • Median frequency: {np.median(counts):,.0f} businesses per address")
    print(f"   • Addresses with 100+ businesses: {sum(1 for c in counts if c >= 100):,}")
    print(f"   • Addresses with 50+ businesses: {sum(1 for c in counts if c >= 50):,}")

    # Save results to CSV for further analysis
    results_df = pd.DataFrame(
        list(frequent_addresses.items()),
        columns=['Address', 'BusinessCount'],
    )
    # File name follows the threshold so the label always matches the contents
    output_file = f"frequent_addresses_{min_occurrences}plus.csv"
    results_df.to_csv(output_file, index=False)
    print(f"\n💾 Results saved to: {output_file}")

else:
    print(f"\n❌ No addresses found with ≥{min_occurrences} occurrences")

# Optional: Display top business incubators/complexes
print(f"\n🏢 POTENTIAL BUSINESS INCUBATORS/SHARED OFFICE SPACES:")
print("   (Addresses with 100+ businesses may indicate shared office buildings)")
print("-" * 70)

# frequent_addresses is already sorted by count, so the first ten are the densest
high_density = {addr: count for addr, count in frequent_addresses.items() if count >= 100}
if high_density:
    for address, count in list(high_density.items())[:10]:
        display_addr = address[:50] + "..." if len(address) > 50 else address
        print(f"   🏢 {display_addr:<53} ({count:,} businesses)")
else:
    print("   📋 No addresses found with 100+ businesses")

print(f"\n🔍 To analyze a specific address in detail, use:")
print(f"    get_records_for_address(filepath, 'ADDRESS_HERE')")

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import gc

def scan_frequent_addresses(filepath, min_occurrences=30, chunksize=100000):
    """
    Count how often each normalised (address, city) pair occurs in a CSV file.

    The file is streamed in chunks so it never has to fit in memory. Both
    columns are stripped, upper-cased and have runs of whitespace collapsed.
    Rows with an address of three characters or fewer, or with no city, are
    ignored.

    Args:
        filepath: Path to a latin-1 encoded CSV with 'BorrowerAddress' and
            'BorrowerCity' columns.
        min_occurrences: Minimum count a pair needs in order to be reported.
        chunksize: Number of rows read per chunk.

    Returns:
        dict mapping 'ADDRESS | CITY' -> count, ordered by count (descending).
        An empty dict is returned, after printing the error, if reading or
        processing the file fails.
    """
    print(f"🔍 Scanning for addresses appearing ≥{min_occurrences} times...")
    print(f"📁 Processing file: {filepath}")
    print(f"📦 Chunk size: {chunksize:,} rows")

    def _normalize(column):
        # Strip, upper-case and collapse whitespace; missing values become ''
        return (
            column.fillna('')
            .str.strip()
            .str.upper()
            .str.replace(r'\s+', ' ', regex=True)
        )

    # Running totals across all chunks
    address_counts = defaultdict(int)
    total_rows_processed = 0
    chunk_count = 0

    try:
        # The context manager closes the underlying file handle even when a
        # chunk fails part-way through the scan.
        with pd.read_csv(
            filepath,
            chunksize=chunksize,
            encoding='latin-1',
            usecols=['BorrowerAddress', 'BorrowerCity'],  # Only load needed columns
            dtype=str,                                    # keep numeric-looking values as text
            on_bad_lines='skip',
            low_memory=False
        ) as reader:
            for chunk in reader:
                chunk_count += 1
                total_rows_processed += len(chunk)

                addr_clean = _normalize(chunk['BorrowerAddress'])
                city_clean = _normalize(chunk['BorrowerCity'])

                # Decide validity on the real columns rather than by re-splitting
                # the joined key, which misjudges addresses containing ' | '.
                usable = (addr_clean.str.len() > 3) & (city_clean != '')
                addr_city_keys = addr_clean[usable] + ' | ' + city_clean[usable]

                # Merge this chunk's tallies into the running totals
                chunk_counts = addr_city_keys.value_counts()
                for addr_city_key, count in chunk_counts.items():
                    address_counts[addr_city_key] += count

                # Progress reporting
                if chunk_count % 25 == 0:
                    print(f"   📊 Processed {chunk_count} chunks ({total_rows_processed:,} total rows)")

                # Memory cleanup
                del chunk, addr_clean, city_clean, addr_city_keys, chunk_counts
                gc.collect()

    except Exception as e:
        # Best-effort scan: report the problem and hand back an empty result
        print(f"❌ Error during processing: {e}")
        return {}

    # Keep only frequent pairs, most frequent first (ties keep insertion order)
    sorted_addresses = sorted(
        (
            (addr_city_key, count) for addr_city_key, count in address_counts.items()
            if count >= min_occurrences
        ),
        key=lambda item: item[1],
        reverse=True,
    )

    print(f"\n✅ SCAN COMPLETE")
    print(f"📈 Total rows processed: {total_rows_processed:,}")
    print(f"🏢 Address-city combinations with ≥{min_occurrences} occurrences: {len(sorted_addresses):,}")
    print(f"📝 Total unique address-city combinations scanned: {len(address_counts):,}")

    return dict(sorted_addresses)

# Execute the scan
filepath = "./sba_csv/combined_public_up_to_150k.csv"
# The threshold lives in a module-level name because the report text below
# (including the "nothing found" branch) refers to it outside the function.
min_occurrences = 30
frequent_addresses = scan_frequent_addresses(filepath, min_occurrences=min_occurrences)

if frequent_addresses:
    print(f"\n🏆 TOP 25 MOST FREQUENT ADDRESSES (≥{min_occurrences} occurrences):")
    print("=" * 100)

    for i, (addr_city_key, count) in enumerate(list(frequent_addresses.items())[:25], 1):
        # Split address and city for display
        if ' | ' in addr_city_key:
            address, city = addr_city_key.split(' | ', 1)
            # Truncate very long addresses for display
            display_addr = address[:45] + "..." if len(address) > 45 else address
            display_city = city[:15] + "..." if len(city) > 15 else city
            print(f"{i:2d}. {display_addr:<48} | {display_city:<18} ({count:,} businesses)")

    # Summary statistics
    counts = list(frequent_addresses.values())
    print(f"\n📊 SUMMARY STATISTICS:")
    print(f"   • Highest frequency: {max(counts):,} businesses at one address-city")
    print(f"   • Average frequency: {np.mean(counts):,.1f} businesses per address-city")
    print(f"   • Median frequency: {np.median(counts):,.0f} businesses per address-city")
    print(f"   • Address-cities with 100+ businesses: {sum(1 for c in counts if c >= 100):,}")
    print(f"   • Address-cities with 50+ businesses: {sum(1 for c in counts if c >= 50):,}")

    # Save results to CSV for further analysis
    export_rows = []
    for addr_city_key, count in frequent_addresses.items():
        # partition() splits at the first separator only, matching the display
        # code above; a key without a separator is kept whole with an empty city.
        address, _, city = addr_city_key.partition(' | ')
        export_rows.append({'Address': address, 'City': city, 'BusinessCount': count})
    results_df = pd.DataFrame(export_rows)
    # File name follows the threshold so the label always matches the contents
    output_file = f"frequent_addresses_{min_occurrences}plus.csv"
    results_df.to_csv(output_file, index=False)
    print(f"\n💾 Results saved to: {output_file}")

else:
    print(f"\n❌ No addresses found with ≥{min_occurrences} occurrences")

# Optional: Display top business incubators/complexes
print(f"\n🏢 POTENTIAL BUSINESS INCUBATORS/SHARED OFFICE SPACES:")
print("   (Address-city combinations with 100+ businesses may indicate shared office buildings)")
print("-" * 90)

# frequent_addresses is already sorted by count, so the first ten are the densest
high_density = {addr_city: count for addr_city, count in frequent_addresses.items() if count >= 100}
if high_density:
    for addr_city_key, count in list(high_density.items())[:10]:
        if ' | ' in addr_city_key:
            address, city = addr_city_key.split(' | ', 1)
            display_addr = address[:40] + "..." if len(address) > 40 else address
            display_city = city[:15] + "..." if len(city) > 15 else city
            print(f"   🏢 {display_addr:<43} | {display_city:<18} ({count:,} businesses)")
else:
    print("   📋 No address-city combinations found with 100+ businesses")

print(f"\n🔍 To analyze a specific address in detail, use:")
print(f"    get_records_for_address(filepath, 'ADDRESS_HERE')")

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import gc

def scan_frequent_addresses(filepath, min_occurrences=30, chunksize=100000):
    """
    Count how often each normalised (address, city) pair occurs in a CSV file.

    Reads the file chunk by chunk so large datasets stay within memory. Both
    columns are stripped, upper-cased and have runs of whitespace collapsed.
    Rows whose address is three characters or fewer, or whose city is missing,
    are left out of the tally.

    Args:
        filepath: Path to a latin-1 encoded CSV with 'BorrowerAddress' and
            'BorrowerCity' columns.
        min_occurrences: Minimum count a pair needs in order to be reported.
        chunksize: Number of rows read per chunk.

    Returns:
        dict mapping 'ADDRESS | CITY' -> count, ordered by count (descending).
        An empty dict is returned, after printing the error, if reading or
        processing the file fails.
    """
    print(f"Scanning for addresses appearing >={min_occurrences} times...")
    print(f"Processing file: {filepath}")
    print(f"Chunk size: {chunksize:,} rows")

    def _normalize(column):
        # Strip, upper-case and collapse whitespace; missing values become ''
        return (
            column.fillna('')
            .str.strip()
            .str.upper()
            .str.replace(r'\s+', ' ', regex=True)
        )

    # Running totals across all chunks
    address_counts = defaultdict(int)
    total_rows_processed = 0
    chunk_count = 0

    try:
        # Using the reader as a context manager guarantees the file handle is
        # released even if a chunk raises while being processed.
        with pd.read_csv(
            filepath,
            chunksize=chunksize,
            encoding='latin-1',
            usecols=['BorrowerAddress', 'BorrowerCity'],  # Only load needed columns
            dtype=str,                                    # keep numeric-looking values as text
            on_bad_lines='skip',
            low_memory=False
        ) as reader:
            for chunk in reader:
                chunk_count += 1
                total_rows_processed += len(chunk)

                addr_clean = _normalize(chunk['BorrowerAddress'])
                city_clean = _normalize(chunk['BorrowerCity'])

                # Filter on the real columns; re-splitting the joined key would
                # misjudge any address that itself contains ' | '.
                usable = (addr_clean.str.len() > 3) & (city_clean != '')
                addr_city_keys = addr_clean[usable] + ' | ' + city_clean[usable]

                # Merge this chunk's tallies into the running totals
                chunk_counts = addr_city_keys.value_counts()
                for addr_city_key, count in chunk_counts.items():
                    address_counts[addr_city_key] += count

                # Progress reporting
                if chunk_count % 25 == 0:
                    print(f"   Processed {chunk_count} chunks ({total_rows_processed:,} total rows)")

                # Memory cleanup
                del chunk, addr_clean, city_clean, addr_city_keys, chunk_counts
                gc.collect()

    except Exception as e:
        # Best-effort scan: report the problem and hand back an empty result
        print(f"Error during processing: {e}")
        return {}

    # Keep only frequent pairs, most frequent first (ties keep insertion order)
    sorted_addresses = sorted(
        (
            (addr_city_key, count) for addr_city_key, count in address_counts.items()
            if count >= min_occurrences
        ),
        key=lambda item: item[1],
        reverse=True,
    )

    print(f"\nSCAN COMPLETE")
    print(f"Total rows processed: {total_rows_processed:,}")
    print(f"Address-city combinations with >={min_occurrences} occurrences: {len(sorted_addresses):,}")
    print(f"Total unique address-city combinations scanned: {len(address_counts):,}")

    return dict(sorted_addresses)

def examine_specific_address(filepath, target_address, target_city=None, chunksize=100000):
    """
    Extract all records for a specific address (optionally within one city).

    The CSV is streamed in chunks. Both the target values and the file's
    'BorrowerAddress' / 'BorrowerCity' columns are stripped, upper-cased and
    have runs of whitespace collapsed before being compared for equality.

    Args:
        filepath: Path to a latin-1 encoded CSV with a 'BorrowerAddress' column
            (and a 'BorrowerCity' column when target_city is given).
        target_address: Address to look for; case and spacing are ignored.
        target_city: Optional city that matching rows must also have.
        chunksize: Number of rows read per chunk.

    Returns:
        DataFrame of the matching rows with the file's original columns, or an
        empty DataFrame when nothing matches or an error occurs (the error is
        printed).
    """
    # str.replace() is a literal replacement, not a regex, so whitespace is
    # collapsed with split/join to mirror the normalisation applied to the CSV.
    target_address_clean = ' '.join(target_address.split()).upper() if target_address else ''
    target_city_clean = ' '.join(target_city.split()).upper() if target_city else None

    print(f"Searching for all records at: {target_address}")
    if target_city_clean:
        print(f"In city: {target_city}")

    def _normalize(column):
        # Same cleaning as the targets: strip, upper-case, collapse whitespace
        return (
            column.fillna('')
            .str.strip()
            .str.upper()
            .str.replace(r'\s+', ' ', regex=True)
        )

    matching_records = []
    chunk_count = 0

    try:
        # The context manager closes the file handle even if a chunk fails
        with pd.read_csv(
            filepath,
            chunksize=chunksize,
            encoding='latin-1',
            on_bad_lines='skip',
            low_memory=False
        ) as reader:
            for chunk in reader:
                chunk_count += 1

                # Rows whose cleaned address equals the cleaned target
                is_match = _normalize(chunk['BorrowerAddress']) == target_address_clean

                # Narrow by city only when one was requested and something matched
                if target_city_clean and is_match.any():
                    is_match &= _normalize(chunk['BorrowerCity']) == target_city_clean

                addr_matches = chunk[is_match]
                if len(addr_matches) > 0:
                    matching_records.append(addr_matches)
                    print(f"Found {len(addr_matches)} records in chunk {chunk_count}")

                # Clean up
                del chunk, is_match, addr_matches
                gc.collect()

    except Exception as e:
        # Best-effort lookup: report the problem and hand back an empty frame
        print(f"Error searching for address: {e}")
        return pd.DataFrame()

    if matching_records:
        result = pd.concat(matching_records, ignore_index=True)
        print(f"\nTotal records found: {len(result)}")
        return result
    else:
        print("No records found for this address")
        return pd.DataFrame()

def interactive_address_explorer(filepath, min_occurrences=30):
    """
    Scan for frequent address-city pairs, print a report and save it to CSV.

    Args:
        filepath: CSV file handed to scan_frequent_addresses().
        min_occurrences: Minimum count a pair needs in order to be reported.

    Returns:
        (frequent_addresses, address_list) where frequent_addresses is the dict
        returned by scan_frequent_addresses() and address_list holds
        (address, city, count) tuples for the (up to) 25 most frequent pairs.
        Returns (None, None) when the scan finds nothing.

    Side effects:
        Writes 'frequent_addresses_<min_occurrences>plus.csv' in the working
        directory (i.e. 'frequent_addresses_30plus.csv' for the default).
    """
    # First, scan for frequent addresses
    print("=" * 80)
    print("SCANNING FOR FREQUENT ADDRESSES")
    print("=" * 80)

    frequent_addresses = scan_frequent_addresses(filepath, min_occurrences)

    if not frequent_addresses:
        print(f"No addresses found with >={min_occurrences} occurrences")
        return None, None

    # Display results
    print(f"\nTOP 25 MOST FREQUENT ADDRESSES (>={min_occurrences} occurrences):")
    print("=" * 100)

    address_list = []
    for i, (addr_city_key, count) in enumerate(list(frequent_addresses.items())[:25], 1):
        # partition() cuts at the first separator; keys without one are skipped
        address, separator, city = addr_city_key.partition(' | ')
        if not separator:
            continue
        address_list.append((address, city, count))
        display_addr = address[:45] + "..." if len(address) > 45 else address
        display_city = city[:15] + "..." if len(city) > 15 else city
        print(f"{i:2d}. {display_addr:<48} | {display_city:<18} ({count:,} businesses)")

    # Summary statistics
    counts = list(frequent_addresses.values())
    print(f"\nSUMMARY STATISTICS:")
    print(f"   • Highest frequency: {max(counts):,} businesses at one address-city")
    print(f"   • Average frequency: {np.mean(counts):,.1f} businesses per address-city")
    print(f"   • Median frequency: {np.median(counts):,.0f} businesses per address-city")
    print(f"   • Address-cities with 100+ businesses: {sum(1 for c in counts if c >= 100):,}")
    print(f"   • Address-cities with 50+ businesses: {sum(1 for c in counts if c >= 50):,}")

    # Save results to CSV
    export_rows = []
    for addr_city_key, count in frequent_addresses.items():
        # Split exactly as the display loop does, so the exported city is never
        # truncated; a key without a separator is kept whole with an empty city.
        address, _, city = addr_city_key.partition(' | ')
        export_rows.append({'Address': address, 'City': city, 'BusinessCount': count})
    results_df = pd.DataFrame(export_rows)
    # Name the file after the threshold actually used
    output_file = f"frequent_addresses_{min_occurrences}plus.csv"
    results_df.to_csv(output_file, index=False)
    print(f"\nResults saved to: {output_file}")

    return frequent_addresses, address_list

def analyze_address_details(df, address, city=None):
    """
    Print summary statistics for the loan records found at one address.

    Each section is printed only when its column is present in df:
    'InitialApprovalAmount' (totals), 'ForgivenessAmount' (forgiveness),
    'BorrowerName' (borrower diversity) and 'DateApproved' (timeline).

    Args:
        df: DataFrame of loan records for the address.
        address: Address text used in the report heading.
        city: Optional city appended to the heading.

    Returns:
        df itself, or None when df is empty (after printing a notice).
    """
    if df.empty:
        print("No data to analyze")
        return

    location_desc = f"{address}" + (f", {city}" if city else "")
    print(f"\n" + "=" * 80)
    print(f"DETAILED ANALYSIS: {location_desc}")
    print("=" * 80)
    print(f"Total businesses/loans: {len(df)}")

    # Financial analysis
    if 'InitialApprovalAmount' in df.columns:
        approved = df['InitialApprovalAmount']
        print(f"\nFinancial Impact:")
        print(f"  Total PPP loans approved: ${approved.sum():,.2f}")
        print(f"  Average loan size: ${approved.mean():,.2f}")
        print(f"  Largest loan: ${approved.max():,.2f}")
        print(f"  Smallest loan: ${approved.min():,.2f}")

    # Forgiveness analysis
    if 'ForgivenessAmount' in df.columns:
        # A loan counts as forgiven when it has any recorded forgiveness amount
        forgiven_df = df.dropna(subset=['ForgivenessAmount'])
        if len(forgiven_df) > 0:
            total_forgiven = forgiven_df['ForgivenessAmount'].sum()
            forgiveness_rate = len(forgiven_df) / len(df) * 100
            print(f"\nForgiveness:")
            print(f"  Total forgiven: ${total_forgiven:,.2f}")
            print(f"  Forgiveness rate: {forgiveness_rate:.1f}% ({len(forgiven_df)}/{len(df)} loans)")

    # Business diversity
    if 'BorrowerName' in df.columns:
        unique_businesses = df['BorrowerName'].nunique()
        print(f"\nBusiness Diversity:")
        print(f"  Unique business names: {unique_businesses}")

        # nunique() ignores missing values, so it is 0 when every name is
        # missing; skip the per-business figures instead of dividing by zero.
        if unique_businesses:
            print(f"  Average loans per business: {len(df)/unique_businesses:.1f}")

            # Show top businesses
            business_counts = df['BorrowerName'].value_counts()
            print(f"  Top 10 borrowers at this address:")
            for i, (business, count) in enumerate(business_counts.head(10).items(), 1):
                business = str(business)  # names are not guaranteed to be strings
                business_name = business[:50] + "..." if len(business) > 50 else business
                print(f"    {i:2d}. {business_name:<53} ({count} loans)")

    # Timeline analysis
    if 'DateApproved' in df.columns:
        # Unparseable dates become NaT and are ignored by min()/max()
        dates = pd.to_datetime(df['DateApproved'], errors='coerce')
        print(f"\nTimeline:")
        print(f"  Date range: {dates.min().date()} to {dates.max().date()}")

        # Monthly distribution
        monthly = dates.dt.to_period('M').value_counts().sort_index()
        if len(monthly) > 0:
            print(f"  Peak month: {monthly.idxmax()} ({monthly.max()} loans)")

    return df

# Execute the interactive scanner
filepath = "./sba_csv/combined_public_up_to_150k.csv"

print("INTERACTIVE ADDRESS FREQUENCY SCANNER")
print("This tool will scan for frequently used addresses and allow detailed examination")
print("\nStep 1: Scanning for addresses with 30+ businesses...")

frequent_addresses, address_list = interactive_address_explorer(filepath, min_occurrences=30)

# Both values are None when the scan found nothing, so this also covers that case
if frequent_addresses and address_list:
    print(f"\n" + "=" * 80)
    print("INTERACTIVE EXAMINATION")
    print("=" * 80)
    print("To examine a specific address in detail, use the following functions:")
    print()
    print("# Example 1: Examine a specific address from the top results")
    print("# Replace with actual address and city from the list above")
    print("address_data = examine_specific_address(filepath, '1445 WOODMONT LN NW', 'ATLANTA')")
    print("analyze_address_details(address_data, '1445 WOODMONT LN NW', 'ATLANTA')")
    print()
    print("# Example 2: Quick examination of top result")
    # address_list is non-empty here, so its first entry is the most frequent pair
    top_addr, top_city, top_count = address_list[0]
    print(f"# Examine the top result: {top_addr[:30]}{'...' if len(top_addr) > 30 else ''} in {top_city}")
    # !r emits a valid Python string literal even when the address or city
    # contains quotes or backslashes, so the printed command can be pasted as-is.
    print(f"top_address_data = examine_specific_address(filepath, {top_addr!r}, {top_city!r})")
    print(f"analyze_address_details(top_address_data, {top_addr!r}, {top_city!r})")
    print()
    print("# Example 3: Show sample records")
    print("if not address_data.empty:")
    print("    display_cols = ['BorrowerName', 'InitialApprovalAmount', 'DateApproved', 'ForgivenessAmount']")
    print("    print(address_data[display_cols].head(10))")
    print()
    print("To run these commands, copy and execute them in the next cell.")

In [None]:
# Pull every loan record filed at the Woodmont address in Atlanta, then print
# the detailed breakdown for it. The literals are named once and reused.
woodmont_address, woodmont_city = '1445 WOODMONT LN NW', 'ATLANTA'
address_data = examine_specific_address(filepath, woodmont_address, woodmont_city)
analyze_address_details(address_data, woodmont_address, woodmont_city)