In [None]:
import pandas as pd
import logging
import chardet

# Configure root logging (INFO and above, timestamped records) and create
# the logger used by the functions in this file.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# ZIP codes treated as Bexar County when filtering loans. Values are
# 5-character strings so they compare directly against normalized ZIP text.
# NOTE(review): ZIP boundaries do not always follow county lines; confirm this
# list against an authoritative source before relying on it as complete.
bexar_zips = set(
    """
    78006 78009 78015 78023 78039 78056 78073 78101
    78108 78109 78112 78114 78121 78124 78154 78161
    78163 78201 78202 78203 78204 78205 78207 78208
    78209 78210 78211 78212 78213 78214 78215 78216
    78217 78218 78219 78220 78221 78222 78223 78224
    78225 78226 78227 78228 78229 78230 78231 78232
    78233 78234 78235 78236 78237 78238 78239 78240
    78242 78244 78245 78247 78248 78249 78250 78251
    78252 78253 78254 78255 78256 78257 78258 78259
    78260 78261 78263 78264 78266 78269 78270 78278
    78279 78280 78283 78284 78285 78288 78289 78291
    78292 78293 78294 78295 78296 78297 78298 78299
    """.split()
)

def detect_encoding(file_path, sample_size=10000):
    """Guess a file's text encoding from its leading bytes.

    Args:
        file_path: Path of the file to inspect; opened in binary mode.
        sample_size: Maximum number of bytes read from the start of the file
            and handed to ``chardet.detect``.

    Returns:
        The ``'encoding'`` entry of chardet's result (whatever chardet
        reports for the sample).
    """
    logger.info(f"Detecting encoding for {file_path}")

    # Only the head of the file is sampled, so the guess reflects those bytes
    # and not necessarily the rest of the file.
    with open(file_path, 'rb') as handle:
        head = handle.read(sample_size)

    guess = chardet.detect(head)
    encoding = guess['encoding']
    confidence = guess['confidence']

    logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
    return encoding

def _candidate_encodings(input_file):
    """Return the ordered, de-duplicated encodings to try for *input_file*.

    The encoding reported by ``detect_encoding`` (if any) comes first,
    followed by the common fallbacks.
    """
    fallbacks = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
    try:
        detected = detect_encoding(input_file)
    except Exception as e:
        # Detection is best-effort: any failure just means using the fallbacks.
        logger.warning(f"Error detecting encoding: {e}, trying common encodings...")
        detected = None
    else:
        if not detected:
            logger.warning("Could not detect encoding, trying common encodings...")

    candidates = []
    seen = set()
    for name in ([detected] if detected else []) + fallbacks:
        key = name.lower()
        if key not in seen:
            seen.add(key)
            candidates.append(name)
    return candidates


def _pick_column(columns, keyword):
    """Return the column whose name contains *keyword*, or None if none does.

    When several columns match, one whose name also contains "borrower" is
    preferred, otherwise the first match is used. Picking state and ZIP by
    the same rule keeps the pair describing the same entity instead of
    whichever matching columns happen to appear last in the file.
    """
    candidates = [col for col in columns if keyword in str(col).lower()]
    logger.info(f"Potential {keyword} columns: {candidates}")
    for col in candidates:
        if 'borrower' in str(col).lower():
            return col
    return candidates[0] if candidates else None


def _normalize_zip(zip_values):
    """Return *zip_values* as 5-character ZIP strings.

    Handles ZIP+4 text ('78201-1234'), surrounding whitespace, integers that
    lost leading zeros, and floats ('78201.0') produced when a numeric ZIP
    column contains missing values.
    """
    text = zip_values.astype(str).str.strip()
    base = text.str.split('-').str[0].str.split('.').str[0]
    return base.str.zfill(5)


def _collect_bexar_loans(input_file, encoding, chunk_size):
    """Stream *input_file* and collect the rows that are Bexar County loans.

    Returns:
        ``(frames, rows_read)`` where ``frames`` is a list of DataFrames
        holding the matching rows, or ``None`` when no state or ZIP column
        could be identified in the first chunk.

    Raises:
        Whatever ``pandas.read_csv`` raises while reading, including
        ``UnicodeDecodeError`` when *encoding* cannot decode some later part
        of the file.
    """
    frames = []
    rows_read = 0
    state_column = None
    zip_column = None

    reader = pd.read_csv(input_file, encoding=encoding, chunksize=chunk_size)
    try:
        for chunk_num, chunk in enumerate(reader, start=1):
            rows_read += len(chunk)
            logger.info(f"Processing chunk {chunk_num}, rows processed so far: {rows_read}")

            # Columns are the same for every chunk, so resolve them only once.
            if chunk_num == 1:
                columns = list(chunk.columns)
                logger.info(f"Column names: {columns}")
                state_column = _pick_column(columns, 'state')
                zip_column = _pick_column(columns, 'zip')
                if state_column is None or zip_column is None:
                    logger.error(f"Could not find state or zip columns. Available columns: {columns}")
                    return None, rows_read
                logger.info(f"Using state column: {state_column}, zip column: {zip_column}")

            is_tx = chunk[state_column].astype(str).str.strip().str.upper() == 'TX'
            tx_chunk = chunk[is_tx]
            logger.info(f"Chunk {chunk_num}: {len(tx_chunk)} TX loans out of {len(chunk)} total")
            if tx_chunk.empty:
                continue

            # Filter with a boolean mask so no helper column is added to the
            # rows that end up in the output.
            in_bexar = _normalize_zip(tx_chunk[zip_column]).isin(bexar_zips)
            bexar_chunk = tx_chunk[in_bexar]
            if bexar_chunk.empty:
                logger.info(f"Chunk {chunk_num}: No Bexar County loans found")
                continue

            logger.info(f"Chunk {chunk_num}: Found {len(bexar_chunk)} Bexar County loans")
            frames.append(bexar_chunk)
    finally:
        # Release the file handle even when a read fails part-way through.
        reader.close()

    return frames, rows_read


def process_ppp_file(input_file, output_file, chunk_size=10000):
    """Filter a loan CSV down to Texas rows with a Bexar County ZIP code.

    The file is read in chunks. Each candidate encoding is used for a full
    pass over the file, so a decode error that only appears deep in the file
    moves on to the next encoding instead of aborting the run. Matching rows
    are written to *output_file* with their original columns.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write the matching rows to.
        chunk_size: Number of rows read per chunk.

    Returns:
        True when at least one matching row was found and written. False
        when the file could not be read, the state/ZIP columns could not be
        identified, or no rows matched.
    """
    logger.info(f"Starting processing of {input_file}")
    logger.info(f"Using {len(bexar_zips)} Bexar County ZIP codes for filtering")

    for encoding in _candidate_encodings(input_file):
        logger.info(f"Trying encoding: {encoding}")
        try:
            bexar_loans, total_processed = _collect_bexar_loans(
                input_file, encoding, chunk_size
            )
        except (UnicodeError, LookupError) as e:
            # Wrong or unknown codec: discard partial results and try the next.
            logger.warning(f"Failed with encoding {encoding}: {e}")
            continue
        except Exception as e:
            logger.exception(f"Error processing file: {e}")
            return False
        logger.info(f"Successfully read file with encoding: {encoding}")
        break
    else:
        logger.error("Could not open file with any encoding")
        return False

    if bexar_loans is None:
        # State/ZIP columns were not found; the helper already logged details.
        return False

    if not bexar_loans:
        logger.warning("No Bexar County loans found in the entire file")
        return False

    # Combine and save results
    logger.info(f"Combining {len(bexar_loans)} chunks with Bexar County data")
    result = pd.concat(bexar_loans, ignore_index=True)
    result.to_csv(output_file, index=False)

    logger.info("SUMMARY:")
    logger.info(f"Total rows processed: {total_processed}")
    logger.info(f"Total Bexar County loans found: {len(result)}")
    logger.info(f"Results saved to: {output_file}")

    return True

# Script entry point: run the Bexar County filter on the configured input CSV
# and report the outcome on stdout.
if __name__ == "__main__":
    input_file = "./sba_csv/public_150k_plus_240930.csv"
    output_file = "bexar_county_ppp_150k_plus_loans.csv"

    success = process_ppp_file(input_file, output_file)
    print(
        "Processing completed successfully!"
        if success
        else "Processing failed or no results found."
    )

In [None]:
!ls -alt SBA*

In [None]:
!pwd