# In [None]:
# Import required libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np

# Data loading functions
def load_oos_data(file_path):
    """Load the OOS dataset and normalise its ZIP code columns.

    Reads the SPSS file at ``file_path`` with ``pandas.read_spss`` and
    rewrites the ``youth_zip`` and ``now_zip`` columns as zero-padded
    strings of at least five characters.

    Args:
        file_path: Path to the SPSS ``.sav`` file.

    Returns:
        The loaded DataFrame. Missing or blank ZIP values are stored as
        ``None`` instead of a padded placeholder string, so they are not
        tallied as a ZIP code by ``value_counts``.
    """
    def _format_zip(value):
        """Return one ZIP value as a zero-padded string, or None if missing."""
        if pd.isna(value):
            return None
        # A float ZIP (numeric SPSS variables are typically read as floats)
        # would stringify as '75001.0', which zfill cannot repair and which
        # never matches a ZCTA code. Drop the fractional part first.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        text = str(value).strip()
        return text.zfill(5) if text else None

    df = pd.read_spss(file_path)

    for zip_column in ('youth_zip', 'now_zip'):
        # astype(object) unwraps categorical columns so map() sees raw values
        df[zip_column] = df[zip_column].astype(object).map(_format_zip)

    return df

def get_zipcode_shapes(
    shapefile_path='/media/data/personality/cb_2016_us_zcta510_500k/'
                   'cb_2016_us_zcta510_500k.shp',
):
    """Load the US ZIP Code (ZCTA) shapefile.

    Args:
        shapefile_path: Location of the shapefile to read. Defaults to the
            path this project has always used, so existing calls with no
            arguments behave as before.

    Returns:
        The frame returned by ``geopandas.read_file``, with its
        ``ZCTA5CE10`` column converted to zero-padded strings of at least
        five characters.
    """
    zip_gdf = gpd.read_file(shapefile_path)
    # Force string codes so they can be joined against string ZIP columns
    zip_gdf['ZCTA5CE10'] = zip_gdf['ZCTA5CE10'].astype(str).str.zfill(5)
    return zip_gdf

def get_texas_zipcodes(gdf):
    """Filter a GeoDataFrame down to Texas ZIP codes.

    Args:
        gdf: Frame with a string ``ZCTA5CE10`` column.

    Returns:
        A copy of the rows whose ``ZCTA5CE10`` starts with a Texas prefix.
        Rows with a missing code are excluded.
    """
    # Within the 88xxx range only 885 (El Paso) is Texas: 880-884 belong to
    # New Mexico and 889 to Nevada, so a bare '88' prefix pulls in other
    # states. 733 is the Austin-only block that sits outside 75-79.
    texas_prefixes = ('733', '75', '76', '77', '78', '79', '885')
    # na=False keeps the mask strictly boolean when a code is missing
    texas_mask = gdf['ZCTA5CE10'].str.startswith(texas_prefixes, na=False)
    return gdf[texas_mask].copy()

# Analysis functions
def calculate_zip_samples(df):
    """Tally respondents per ZIP code for current and youth residence.

    Args:
        df: Frame with ``now_zip`` and ``youth_zip`` columns.

    Returns:
        A DataFrame with one row per distinct ZIP and the columns ``ZIP``,
        ``current_samples``, ``youth_samples`` and ``total_samples``. A ZIP
        that appears in only one of the two source columns gets 0 for the
        other count.
    """
    def tally(source_column, count_label):
        # Frequency table for one ZIP column, shaped as (ZIP, count_label)
        frequencies = df[source_column].value_counts()
        return frequencies.rename_axis('ZIP').reset_index(name=count_label)

    now_tally = tally('now_zip', 'current_samples')
    youth_tally = tally('youth_zip', 'youth_samples')

    # Outer join keeps ZIPs seen in either column; the gaps become zeros
    combined = pd.merge(now_tally, youth_tally, on='ZIP', how='outer')
    combined = combined.fillna(0)
    combined['total_samples'] = combined['current_samples'].add(combined['youth_samples'])

    return combined

def create_sample_maps(
    zip_gdf,
    zip_counts,
    output_dir='/media/gisense/koichi/personality/figs',
):
    """Create choropleth maps of sample sizes, one PNG file per map.

    Three maps (current, youth and total samples) are written for the
    mainland US and three more for Texas; the Texas files carry a
    ``_texas`` suffix.

    Args:
        zip_gdf: Frame of ZCTA shapes with a string ``ZCTA5CE10`` column.
        zip_counts: Frame with ``ZIP``, ``current_samples``,
            ``youth_samples`` and ``total_samples`` columns.
        output_dir: Directory the PNG files are written to. It is not
            created here. Defaults to the project's figure folder.

    Returns:
        The merged frame restricted to mainland ZCTAs. ZCTAs without any
        matching row in ``zip_counts`` keep NaN in the count columns.
    """
    # Merge shapefile with sample counts (left join: unmatched ZCTAs get NaN)
    map_data = zip_gdf.merge(zip_counts, left_on='ZCTA5CE10', right_on='ZIP', how='left')

    # Alaska is 995-999 and Hawaii is 967-968 (967 covers most of the state
    # outside Honolulu); 969 is Guam and other Pacific islands.
    non_mainland_prefixes = ('995', '996', '997', '998', '999', '967', '968', '969')
    mainland = map_data[~map_data['ZCTA5CE10'].str.startswith(non_mainland_prefixes)]

    # Filter for Texas
    texas = get_texas_zipcodes(mainland)

    def plot_map(data, column, title, filename, state='US'):
        """Draw one choropleth of ``column`` and save it as a PNG."""
        fig, ax = plt.subplots(figsize=(15, 10))
        try:
            # Plot all geometries with base color white
            data.plot(color='white', ax=ax)

            # NaN > 0 is False, so ZCTAs with no matching count are left on
            # the white base layer together with the true zeros.
            has_samples = data[column] > 0

            # Colour only the areas that actually have samples
            if has_samples.any():
                data[has_samples].plot(
                    column=column,
                    ax=ax,
                    legend=True,
                    legend_kwds={'label': title},
                    cmap='viridis'
                )

            state_label = 'Texas' if state == 'TX' else 'US'
            ax.set_title(f'{title} ({state_label})', fontsize=14)
            ax.axis('off')
            plt.tight_layout()

            suffix = '_texas' if state == 'TX' else ''
            fig.savefig(f'{output_dir}/{filename}{suffix}.png',
                        dpi=300, bbox_inches='tight')
        finally:
            # Release the figure even when plotting or saving fails
            plt.close(fig)

    map_specs = (
        ('current_samples', 'Current Residents', 'current_residence_map'),
        ('youth_samples', 'Youth Residents', 'youth_residence_map'),
        ('total_samples', 'Total Samples', 'total_samples_map'),
    )

    # US maps first, then the Texas maps
    for region, state in ((mainland, 'US'), (texas, 'TX')):
        for column, title, filename in map_specs:
            plot_map(region, column, title, filename, state)

    return mainland

# Main execution
if __name__ == "__main__":
    # Inputs: the survey responses and the ZCTA polygons
    df = load_oos_data('/media/data/personality/OOS Master Dataset Sept 2022/00_OOS_MASTER DATASET_2022_08_22.sav')
    zip_gdf = get_zipcode_shapes()

    # Per-ZIP respondent tallies
    zip_counts = calculate_zip_samples(df)

    # Write the US and Texas map images
    map_data = create_sample_maps(zip_gdf, zip_counts)

    # Report how many ZIPs were sampled and how the counts are distributed
    print(
        "Sample Size Summary:",
        f"Total ZIP codes with samples: {len(zip_counts)} out of {len(zip_gdf)}",
        "\nCurrent Residence Statistics:",
        zip_counts['current_samples'].describe(),
        "\nYouth Residence Statistics:",
        zip_counts['youth_samples'].describe(),
        sep="\n",
    )