In [None]:
import pandas as pd
import numpy as np
import logging
import chardet
from datetime import datetime
import warnings
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Send log records to the notebook output and to a log file at the same time.
log_format = '%(asctime)s - %(levelname)s - %(message)s'
console_handler = logging.StreamHandler()
file_handler = logging.FileHandler('top_ppp_loans_analysis.log')
logging.basicConfig(
    level=logging.INFO,
    format=log_format,
    handlers=[console_handler, file_handler],
)
logger = logging.getLogger(__name__)

# Start-of-run banner: a 60-character rule above and below the title.
banner_rule = "=" * 60
for banner_line in (banner_rule, "STARTING TOP 100 PPP LOANS NATIONWIDE ANALYSIS", banner_rule):
    logger.info(banner_line)

print("Libraries imported and logging configured successfully!")

In [None]:
def detect_encoding(file_path, sample_size=10000):
    """Guess a file's text encoding from its leading bytes.

    Reads at most ``sample_size`` bytes from the start of ``file_path`` in
    binary mode, hands them to ``chardet.detect``, logs the guess together
    with its confidence, and returns the ``'encoding'`` entry of the result.
    """
    logger.info(f"Detecting encoding for {file_path}")

    with open(file_path, 'rb') as handle:
        head_bytes = handle.read(sample_size)
    guess = chardet.detect(head_bytes)

    detected = guess['encoding']
    confidence = guess['confidence']
    logger.info(f"Detected encoding: {detected} (confidence: {confidence:.2f})")
    return detected

def format_currency(amount):
    """Format a value as a dollar string such as ``$1,234.50``.

    Args:
        amount: Anything ``float()`` accepts (a number or a numeric string).

    Returns:
        The amount with a leading ``$``, thousands separators and two decimal
        places. If ``amount`` cannot be converted to a float, its ``str()``
        form is returned instead.
    """
    try:
        return f"${float(amount):,.2f}"
    # Only conversion failures fall back to str(); a bare ``except`` would
    # also swallow KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return str(amount)

def safe_convert_to_float(value):
    """Convert a raw cell value to a float, tolerating currency formatting.

    Args:
        value: A scalar such as a number, a string like ``"$1,234.50"``, or a
            missing value.

    Returns:
        The parsed float. Missing values (per ``pd.isna``) and values that
        cannot be parsed both yield ``0.0``; unparseable values are also
        logged as a warning.
    """
    if pd.isna(value):
        return 0.0

    # Strip surrounding whitespace, dollar signs and thousands separators.
    str_val = str(value).strip().replace('$', '').replace(',', '')

    try:
        return float(str_val)
    # Catch only parse failures; a bare ``except`` would also swallow
    # KeyboardInterrupt and SystemExit raised while a long apply() is running.
    except (TypeError, ValueError, OverflowError):
        logger.warning(f"Could not convert value to float: {value}")
        return 0.0

# Log a marker once the helper definitions in this cell have been executed.
logger.info("Helper functions defined successfully!")

In [None]:
import os

# Candidate locations for the PPP CSV, checked in this order.
possible_files = [
    "./sba_csv/public_150k_plus_240930.csv",
    "public_150k_plus_240930.csv",
    "sba_csv/public_150k_plus_240930.csv"
]

# First candidate that exists on disk, or None when none of them do.
input_file = next(
    (candidate for candidate in possible_files if os.path.exists(candidate)),
    None,
)

if input_file is not None:
    logger.info(f"Found PPP data file: {input_file}")

    # Report the file size in megabytes.
    file_size = os.path.getsize(input_file) / (1024 * 1024)
    logger.info(f"File size: {file_size:.2f} MB")
else:
    logger.error("PPP CSV file not found. Please check file location.")
    # List the CSV files that are present in the working directory.
    print("Available files in current directory:")
    csv_names = [entry for entry in os.listdir('.') if entry.endswith('.csv')]
    for csv_name in csv_names:
        print(f"  - {csv_name}")

In [None]:
# Initialised before the guard so that later cells can test
# `successful_encoding` even when no input file was found and the probing
# below is skipped (otherwise they fail with NameError).
successful_encoding = None

if input_file:
    common_encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']

    # Detect encoding; any failure simply falls back to the common encodings.
    try:
        encoding = detect_encoding(input_file)
    except Exception as e:
        logger.warning(f"Error detecting encoding: {e}, trying common encodings...")
        encoding = None
    else:
        if not encoding:
            logger.warning("Could not detect encoding, trying common encodings...")

    # detect_encoding only samples the start of the file and the probe below
    # reads just 10 rows, so a pure-ASCII guess can still break on later rows.
    # UTF-8 decodes every ASCII file identically, so probing with it is never
    # worse and avoids the most common full-read failure.
    if encoding and encoding.lower() == 'ascii':
        logger.info("Detected ascii; probing with utf-8 instead (ASCII-compatible superset)")
        encoding = 'utf-8'

    # Detected encoding first, then the fallbacks, without testing any
    # encoding name twice (comparison is case-insensitive).
    encodings_to_try = []
    seen_names = set()
    for name in ([encoding] if encoding else []) + common_encodings:
        if name.lower() not in seen_names:
            seen_names.add(name.lower())
            encodings_to_try.append(name)

    # Test reading with different encodings; the first one that parses wins.
    for enc in encodings_to_try:
        try:
            logger.info(f"Testing encoding: {enc}")
            test_chunk = pd.read_csv(input_file, encoding=enc, nrows=10)
        except Exception as e:
            logger.warning(f"Failed with encoding {enc}: {str(e)}")
            continue
        successful_encoding = enc
        logger.info(f"Successfully opened file with encoding: {enc}")
        logger.info(f"Sample columns: {list(test_chunk.columns)}")
        break

    if successful_encoding:
        print(f"File encoding determined: {successful_encoding}")
    else:
        logger.error("Could not open file with any encoding")

In [None]:
# Initialised before the guard so that later cells can safely test
# `df is not None` even when no usable encoding was found and the load below
# is skipped (otherwise they fail with NameError).
df = None

if successful_encoding:
    logger.info("Reading full dataset...")
    start_time = datetime.now()

    try:
        # Read the entire dataset
        df = pd.read_csv(input_file, encoding=successful_encoding)

        read_time = datetime.now() - start_time
        logger.info(f"Dataset loaded successfully in {read_time.total_seconds():.2f} seconds")
        logger.info(f"Dataset shape: {df.shape}")
        logger.info(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

        # Display basic info
        print("\nDATASET OVERVIEW:")
        print(f"Rows: {len(df):,}")
        print(f"Columns: {len(df.columns)}")

        # Show column names, numbered from 1
        print("\nCOLUMNS:")
        for i, col in enumerate(df.columns, 1):
            print(f"{i:2d}. {col}")

    except Exception as e:
        logger.error(f"Error reading full dataset: {str(e)}")
        # Leave an explicit "no data" marker for the cells that follow.
        df = None

In [None]:
if df is not None:
    logger.info("Processing loan amounts...")

    # Use the correct amount column
    correct_amount_col = 'InitialApprovalAmount'

    if correct_amount_col in df.columns:
        logger.info(f"Using amount column: {correct_amount_col}")

        # Clean and convert amounts on a copy so the loaded frame stays untouched.
        df_clean = df.copy()
        df_clean['loan_amount_clean'] = df_clean[correct_amount_col].apply(safe_convert_to_float)

        # Keep only strictly positive amounts; safe_convert_to_float maps
        # missing or unparseable values to 0.0, so those rows drop out here.
        df_valid = df_clean[df_clean['loan_amount_clean'] > 0].copy()
        valid_count = len(df_valid)

        logger.info(f"Original records: {len(df):,}")
        logger.info(f"Records with valid loan amounts: {valid_count:,}")
        logger.info(f"Records filtered out: {len(df) - valid_count:,}")
        if valid_count == 0:
            logger.warning("No records with a positive loan amount were found")

        # Basic statistics
        valid_amounts = df_valid['loan_amount_clean']
        total_loan_amount = valid_amounts.sum()
        avg_loan_amount = valid_amounts.mean()
        median_loan_amount = valid_amounts.median()

        logger.info(f"Total loan amount: {format_currency(total_loan_amount)}")
        logger.info(f"Average loan amount: {format_currency(avg_loan_amount)}")
        logger.info(f"Median loan amount: {format_currency(median_loan_amount)}")

        print("\nLOAN AMOUNT STATISTICS:")
        print(f"Total loans processed: {valid_count:,}")
        print(f"Total amount: {format_currency(total_loan_amount)}")
        print(f"Average: {format_currency(avg_loan_amount)}")
        print(f"Median: {format_currency(median_loan_amount)}")

        # Show loan distribution; each bucket is [min_amt, max_amt).
        ranges = [
            (0, 150000, "Under $150K"),
            (150000, 1000000, "$150K - $1M"),
            (1000000, 5000000, "$1M - $5M"),
            (5000000, 10000000, "$5M - $10M"),
            (10000000, float('inf'), "Over $10M")
        ]

        print("\nLOAN AMOUNT DISTRIBUTION:")
        for min_amt, max_amt, label in ranges:
            # Count via the boolean mask instead of building a filtered frame.
            in_bucket = (valid_amounts >= min_amt) & (valid_amounts < max_amt)
            count = int(in_bucket.sum())
            # Guard the share against an empty valid set (division by zero).
            percentage = (count / valid_count) * 100 if valid_count else 0.0
            print(f"{label}: {count:,} loans ({percentage:.1f}%)")

    else:
        logger.error(f"Column {correct_amount_col} not found!")

In [None]:
if df is not None and 'df_valid' in locals():
    logger.info("Identifying top 100 PPP loans nationwide...")

    # Sort by loan amount and get top 100; nlargest returns fewer rows when
    # fewer than 100 valid loans exist, so everything below uses the real count.
    top_100 = df_valid.nlargest(100, 'loan_amount_clean').copy()
    top_count = len(top_100)

    if top_count == 0:
        logger.warning("No valid loans available; nothing to rank")
    else:
        logger.info("Top 100 loans identified")
        if top_count < 100:
            logger.warning(f"Only {top_count} valid loans available; ranking all of them")

        # Label the last row accurately when the ranking is shorter than 100.
        lowest_label = (
            "100th highest loan amount"
            if top_count == 100
            else f"Lowest ranked loan amount (rank {top_count})"
        )
        logger.info(f"Highest loan amount: {format_currency(top_100.iloc[0]['loan_amount_clean'])}")
        logger.info(f"{lowest_label}: {format_currency(top_100.iloc[-1]['loan_amount_clean'])}")

        # Select display columns - make sure to include loan_amount_clean
        display_columns = ['loan_amount_clean', 'BorrowerName', 'BorrowerCity',
                          'BorrowerState', 'LoanStatus', 'DateApproved', 'ForgivenessAmount']

        # Only keep columns that exist in the dataframe
        display_columns = [col for col in display_columns if col in top_100.columns]

        logger.info(f"Display columns selected: {display_columns}")

        # Create display dataframe; ranks run from 1 to the actual row count.
        top_100_display = top_100[display_columns].copy()
        top_100_display['Rank'] = range(1, top_count + 1)

        # Reorder columns to put rank first
        cols = ['Rank'] + [col for col in top_100_display.columns if col != 'Rank']
        top_100_display = top_100_display[cols]

        # Format the loan amount column for display (only if it exists)
        if 'loan_amount_clean' in top_100_display.columns:
            top_100_display['Loan Amount (Formatted)'] = top_100_display['loan_amount_clean'].apply(format_currency)

        print("\n" + "="*120)
        print("TOP 100 PPP LOANS NATIONWIDE")
        print("="*120)

        # Display the results
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', None)
        pd.set_option('display.max_colwidth', 40)

        print(top_100_display.to_string(index=False))

In [None]:
if 'top_100' in locals() and len(top_100) > 0:
    logger.info("Saving results and generating summary...")

    # One timestamp for the whole cell so both output file names and the
    # report header agree with each other.
    run_time = datetime.now()
    file_stamp = run_time.strftime('%Y%m%d_%H%M%S')

    # Save top 100 to CSV
    output_file = f"top_100_ppp_loans_nationwide_{file_stamp}.csv"
    top_100.to_csv(output_file, index=False)
    logger.info(f"Top 100 loans saved to: {output_file}")

    # Generate summary statistics
    top_amounts = top_100['loan_amount_clean']
    all_amounts = df_valid['loan_amount_clean']
    top_100_total = top_amounts.sum()
    top_100_avg = top_amounts.mean()
    total_all_loans = all_amounts.sum()
    top_100_percentage = (top_100_total / total_all_loans) * 100

    # Recomputed from df_valid so this cell does not depend on variables left
    # behind by the statistics cell.
    overall_avg = all_amounts.mean()
    overall_median = all_amounts.median()

    # iloc[-1] is the last ranked row even when fewer than 100 loans exist.
    highest_amount = top_amounts.iloc[0]
    lowest_amount = top_amounts.iloc[-1]

    # Create summary report
    summary_report = f"""
PPP LOANS ANALYSIS SUMMARY REPORT
Generated: {run_time.strftime('%Y-%m-%d %H:%M:%S')}
{'='*60}

DATASET OVERVIEW:
- Total records processed: {len(df):,}
- Valid loan records: {len(df_valid):,}
- Data source: {input_file}

TOP 100 LOANS STATISTICS:
- Highest loan amount: {format_currency(highest_amount)}
- Lowest in top 100: {format_currency(lowest_amount)}
- Total of top 100: {format_currency(top_100_total)}
- Average of top 100: {format_currency(top_100_avg)}
- Top 100 as % of all loans: {top_100_percentage:.4f}%

OVERALL LOAN STATISTICS:
- Total loan amount (all): {format_currency(total_all_loans)}
- Average loan amount: {format_currency(overall_avg)}
- Median loan amount: {format_currency(overall_median)}

OUTPUT FILES:
- Top 100 loans CSV: {output_file}
- Analysis log: top_ppp_loans_analysis.log
"""

    print(summary_report)

    # Save summary report with an explicit encoding so the file does not
    # depend on the platform default.
    report_file = f"ppp_analysis_summary_{file_stamp}.txt"
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(summary_report)

    logger.info(f"Summary report saved to: {report_file}")
    logger.info("Analysis completed successfully!")
    logger.info("="*60)
else:
    logger.warning("No top-100 results available; nothing was saved")

print("\n" + "="*60)
print("ANALYSIS COMPLETE - Check log file for detailed progress")
print("="*60)