This is the code for the pre-capped arrivals

In [None]:
import pandas as pd
import numpy as np
import openpyxl
import warnings
from datetime import datetime
import tkinter as tk
from tkinter import filedialog, messagebox
from pathlib import Path

def select_file(title, file_types, save=False):
    """Show a native open/save file dialog and return the chosen path.

    Args:
        title: Dialog window title.
        file_types: Sequence of ``(label, pattern)`` pairs passed to tkinter's
            ``filetypes`` option, e.g. ``('Excel files', '*.xlsx')``.
        save: When true, show a "save as" dialog; otherwise an "open" dialog.

    Returns:
        The selected path as a string, or ``None`` if nothing was selected.
    """
    root = tk.Tk()
    try:
        # Window setup lives inside the try so the root is destroyed even if
        # one of these calls fails.
        root.withdraw()
        root.attributes('-topmost', True)

        if save:
            # Tk appends ``defaultextension`` verbatim to extension-less names,
            # so it must be ".xlsx", not the glob pattern "*.xlsx".
            first_pattern = file_types[0][1] if file_types else ''
            if not isinstance(first_pattern, str):
                first_pattern = first_pattern[0] if first_pattern else ''
            globs = first_pattern.split()
            first_glob = globs[0] if globs else ''
            extension = first_glob[1:] if first_glob.startswith('*.') else ''
            if '*' in extension:
                # A wildcard pattern such as "*.*" names no usable extension.
                extension = ''

            file_path = filedialog.asksaveasfilename(
                title=title,
                filetypes=file_types,
                defaultextension=extension
            )
        else:
            file_path = filedialog.askopenfilename(
                title=title,
                filetypes=file_types
            )
    finally:
        root.destroy()

    return file_path if file_path else None

def optimize_dataframe(df):
    """Normalise column dtypes in place and return the same DataFrame.

    Columns that are absent are skipped. For the columns that exist:

    * the state-level numeric columns are converted with ``pd.to_numeric``;
      values that cannot be parsed become NaN,
    * ``travel_date`` is parsed with ``pd.to_datetime`` (mixed formats,
      unparseable values become NaT),
    * the listed descriptive columns are cast to ``category`` dtype.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    numeric_columns = ['state_percapita_income', 'state_unemployment',
                       'immigrant_population', 'import_from_slu']
    for col in numeric_columns:
        if col not in df.columns:
            continue
        try:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        except (TypeError, ValueError):
            # Best effort: keep the column as-is and continue. Only conversion
            # errors are caught so that KeyboardInterrupt/SystemExit propagate.
            print(f"Warning: Could not convert {col} to numeric")

    date_columns = ['travel_date']
    for col in date_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], format='mixed', errors='coerce')

    categorical_columns = ['sex', 'marital_status', 'us_state',
                           'employment_status', 'purpose_enc', 'accomd_type']
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype('category')

    return df

def _format_mean_std(series, prefix='', suffix='', decimals=1):
    """Return ``(text, n)``: 'mean ± std' text and the non-missing count of a Series."""
    numeric = pd.to_numeric(series, errors='coerce')
    n_valid = int(numeric.notna().sum())
    if n_valid == 0:
        return "N/A", 0
    mean, std = numeric.mean(), numeric.std()
    text = f"{prefix}{mean:.{decimals}f}{suffix} ± {prefix}{std:.{decimals}f}{suffix}"
    return text, n_valid


def _stat_row(category, characteristic, value, n_count):
    """Build one summary-statistic row (no percentage)."""
    return {
        'Category': category,
        'Characteristic': characteristic,
        'Value': value,
        'N': n_count
    }


def _header_row(category, characteristic):
    """Build the blank heading row that introduces a group of count rows."""
    return {
        'Category': category,
        'Characteristic': characteristic,
        'Value': '',
        'N': '',
        'Percentage': ''
    }


def _count_rows(category, counts, total_n):
    """Build one indented row per label in ``counts`` with its share of ``total_n``."""
    return [
        {
            'Category': category,
            'Characteristic': "   " + str(label),
            'Value': '',
            'N': int(count),
            'Percentage': f"{count / total_n * 100:.1f}%"
        }
        for label, count in counts.items()
    ]


def create_table1(df):
    """Create Table 1 (descriptive statistics with interactions).

    The table contains, in order: age; four state-level numeric summaries;
    top-5 frequency breakdowns of the categorical columns; top-5 gender x
    marital-status combinations; overall length of stay; and length of stay
    for the five most frequent purposes.

    Numeric rows report ``mean ± std`` over non-missing values and ``N`` as the
    non-missing count ("N/A" and 0 when a column is absent or entirely
    missing). Percentages use the total number of rows as the denominator.
    Rows with a missing sex or marital status are left out of the combination
    counts. Optional columns that are absent are skipped with a printed
    warning.

    Args:
        df: Input data. It is only read; no columns are added or converted.

    Returns:
        A new DataFrame with one row per table line.

    Raises:
        KeyError: If the ``age`` column is missing.
    """
    results = []
    total_n = len(df)

    print("Calculating age statistics...")
    value, n_count = _format_mean_std(df['age'])
    results.append(_stat_row('Demographics', 'Age', value, n_count))

    # (column, row label, formatting options) for the state-level summaries.
    numeric_specs = [
        ('state_percapita_income', 'State Per Capita Income',
         {'prefix': '$', 'decimals': 2}),
        ('state_unemployment', 'State Unemployment Rate', {'suffix': '%'}),
        ('immigrant_population', 'State Immigrant Population', {'decimals': 0}),
        ('import_from_slu', 'State Imports from St. Lucia',
         {'prefix': '$', 'decimals': 2}),
    ]
    for column, label, fmt in numeric_specs:
        if column in df.columns:
            value, n_count = _format_mean_std(df[column], **fmt)
        else:
            value, n_count = "N/A", 0
        results.append(_stat_row('Demographics', label, value, n_count))

    categorical_vars = {
        'Demographics': ['sex', 'marital_status', 'employment_status'],
        'Travel': ['purpose_enc', 'accomd_type', 'us_state']
    }

    print("Processing categorical variables...")
    for category, variables in categorical_vars.items():
        for var in variables:
            if var not in df.columns:
                print(f"Warning: Column {var} not found in DataFrame")
                continue

            top_counts = df[var].value_counts().head(5)
            results.append(
                _header_row(category, f"{var.title().replace('_', ' ')} (Top 5)")
            )
            results.extend(_count_rows(category, top_counts, total_n))

    print("Analyzing Gender-Marital Status combinations...")
    if 'sex' in df.columns and 'marital_status' in df.columns:
        # Only combine rows where both values are known; casting a missing
        # value to str would otherwise create bogus "nan"/"None" labels.
        both_known = df['sex'].notna() & df['marital_status'].notna()
        combos = (
            df.loc[both_known, 'sex'].astype(str)
            + ' - '
            + df.loc[both_known, 'marital_status'].astype(str)
        )
        results.append(
            _header_row('Demographics', 'Gender-Marital Status Combinations (Top 5)')
        )
        results.extend(
            _count_rows('Demographics', combos.value_counts().head(5), total_n)
        )
    else:
        print("Warning: Cannot create gender-marital combinations - required columns missing")

    print("Calculating length of stay statistics...")
    if 'los_trunc' in df.columns:
        length_of_stay = pd.to_numeric(df['los_trunc'], errors='coerce')

        value, n_count = _format_mean_std(length_of_stay)
        results.append(_stat_row('Travel', 'Length of Stay (days)', value, n_count))

        print("Analyzing length of stay by purpose of visit...")
        if 'purpose_enc' in df.columns:
            results.append(_header_row('Travel', 'Length of Stay by Purpose (Top 5)'))
            for purpose in df['purpose_enc'].value_counts().head(5).index:
                in_purpose = df['purpose_enc'] == purpose
                # N is the number of non-missing stays, matching the overall row.
                value, n_count = _format_mean_std(length_of_stay[in_purpose])
                results.append(_stat_row('Travel', f"   {purpose}", value, n_count))
        else:
            print("Warning: Cannot analyze length of stay by purpose - purpose_enc column missing")
    else:
        print("Warning: Cannot calculate length of stay - los_trunc column missing")

    return pd.DataFrame(results)

def show_message(title, message, error=False):
    """Display a modal message box on top of other windows.

    Args:
        title: Caption of the message box.
        message: Text shown in the box.
        error: When true an error box is shown, otherwise an info box.
    """
    hidden_root = tk.Tk()
    hidden_root.withdraw()
    hidden_root.attributes('-topmost', True)
    try:
        # Pick the dialog flavour, then show it; the hidden root window is
        # destroyed afterwards whether or not the dialog call succeeds.
        dialog = messagebox.showerror if error else messagebox.showinfo
        dialog(title, message)
    finally:
        hidden_root.destroy()

def main():
    """Run the interactive Table 1 workflow for the pre-capped arrivals data.

    Prompts for an input file, loads the required columns, normalises dtypes
    with ``optimize_dataframe``, builds the table with ``create_table1``,
    prompts for an output location and writes the table as ``.xlsx`` or
    ``.csv``. Any exception is printed and shown in an error dialog instead of
    propagating.
    """
    required_columns = [
        'sex', 'marital_status', 'us_state',
        'employment_status', 'purpose_enc', 'accomd_type',
        'age', 'travel_date', 'los_trunc', 'state_percapita_income',
        'state_unemployment', 'immigrant_population', 'import_from_slu'
    ]

    try:
        # File type definitions
        excel_types = [
            ('Excel files', '*.xlsx *.xls *.csv'),
            ('All files', '*.*')
        ]

        # Get input file
        input_file = select_file(
            title='Select Excel Data File',
            file_types=excel_types
        )

        if not input_file:
            print("No file selected. Exiting.")
            return

        # Read the data. The dialog also offers CSV files, which read_excel
        # cannot parse, so dispatch on the file extension.
        print("Reading data...")
        if Path(input_file).suffix.lower() == '.csv':
            df = pd.read_csv(input_file, usecols=required_columns)
        else:
            df = pd.read_excel(input_file, usecols=required_columns)

        # Optimize DataFrame
        print("Optimizing data structure...")
        df = optimize_dataframe(df)

        # Create Table 1
        print("Generating Table 1...")
        table1_df = create_table1(df)

        # Get output file
        save_types = [
            ('Excel files', '*.xlsx'),
            ('CSV files', '*.csv'),
            ('All files', '*.*')
        ]

        output_file = select_file(
            title='Save Table 1 As',
            file_types=save_types,
            save=True
        )

        if not output_file:
            print("No output location selected. Exiting.")
            return

        # Ensure proper file extension
        if not output_file.endswith(('.xlsx', '.csv')):
            output_file += '.xlsx'

        # Save based on extension
        print("Saving results...")
        if output_file.endswith('.xlsx'):
            # Local import: only the Excel branch needs it.
            from openpyxl.utils import get_column_letter

            with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
                table1_df.to_excel(writer, sheet_name='Table 1', index=False)

                # Auto-adjust column widths to the longest cell or header.
                worksheet = writer.sheets['Table 1']
                for position, col in enumerate(table1_df.columns, start=1):
                    max_length = max(
                        table1_df[col].astype(str).apply(len).max(),
                        len(col)
                    )
                    # get_column_letter stays correct past column Z,
                    # unlike chr(65 + idx).
                    letter = get_column_letter(position)
                    worksheet.column_dimensions[letter].width = max_length + 2
        else:
            table1_df.to_csv(output_file, index=False)

        print(f"Table 1 has been saved to: {output_file}")
        show_message("Success", f"Table 1 has been successfully created and saved to:\n{output_file}")

    except Exception as e:
        # Top-level boundary of an interactive tool: report instead of crashing.
        error_msg = f"An error occurred:\n{str(e)}"
        print(error_msg)
        show_message("Error", error_msg, error=True)

# Start the interactive workflow only when this code runs as the main module.
if __name__ == "__main__":
    main()

2025-05-15 15:38:30.145 python[8579:16816808] The class 'NSOpenPanel' overrides the method identifier.  This method is implemented by class 'NSWindow'


Reading data...
Optimizing data structure...
Generating Table 1...
Calculating age statistics...
Processing categorical variables...
Analyzing Gender-Marital Status combinations...
Calculating length of stay statistics...
Analyzing length of stay by purpose of visit...


2025-05-15 15:38:46.984 python[8579:16816808] The class 'NSSavePanel' overrides the method identifier.  This method is implemented by class 'NSWindow'


Saving results...
Table 1 has been saved to: /Users/janai/Library/CloudStorage/OneDrive-SharedLibraries-jlconsulting.llc/Projects - Documents/Research/Saint Lucia Tourism Piece/2.0 Descriptive statistics/final starting arrivals descriptive stats.xlsx


: 

This is the code for the post-capped arrivals

In [None]:
import pandas as pd
import numpy as np
import openpyxl
import warnings
from datetime import datetime
import tkinter as tk
from tkinter import filedialog, messagebox
from pathlib import Path

def select_file(title, file_types, save=False):
    """Show a native open/save file dialog and return the chosen path.

    Args:
        title: Dialog window title.
        file_types: Sequence of ``(label, pattern)`` pairs passed to tkinter's
            ``filetypes`` option, e.g. ``('Excel files', '*.xlsx')``.
        save: When true, show a "save as" dialog; otherwise an "open" dialog.

    Returns:
        The selected path as a string, or ``None`` if nothing was selected.
    """
    root = tk.Tk()
    try:
        # Window setup lives inside the try so the root is destroyed even if
        # one of these calls fails.
        root.withdraw()
        root.attributes('-topmost', True)

        if save:
            # Tk appends ``defaultextension`` verbatim to extension-less names,
            # so it must be ".xlsx", not the glob pattern "*.xlsx".
            first_pattern = file_types[0][1] if file_types else ''
            if not isinstance(first_pattern, str):
                first_pattern = first_pattern[0] if first_pattern else ''
            globs = first_pattern.split()
            first_glob = globs[0] if globs else ''
            extension = first_glob[1:] if first_glob.startswith('*.') else ''
            if '*' in extension:
                # A wildcard pattern such as "*.*" names no usable extension.
                extension = ''

            file_path = filedialog.asksaveasfilename(
                title=title,
                filetypes=file_types,
                defaultextension=extension
            )
        else:
            file_path = filedialog.askopenfilename(
                title=title,
                filetypes=file_types
            )
    finally:
        root.destroy()

    return file_path if file_path else None

def optimize_dataframe(df):
    """Normalise column dtypes in place and return the same DataFrame.

    Columns that are absent are skipped. For the columns that exist:

    * the state-level numeric columns are converted with ``pd.to_numeric``;
      values that cannot be parsed become NaN,
    * ``travel_date`` is parsed with ``pd.to_datetime`` (mixed formats,
      unparseable values become NaT),
    * the listed encoded descriptive columns are cast to ``category`` dtype.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    numeric_columns = ['state_percapita_income', 'state_unemployment',
                       'immigrant_population', 'import_from_slu']
    for col in numeric_columns:
        if col not in df.columns:
            continue
        try:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        except (TypeError, ValueError):
            # Best effort: keep the column as-is and continue. Only conversion
            # errors are caught so that KeyboardInterrupt/SystemExit propagate.
            print(f"Warning: Could not convert {col} to numeric")

    date_columns = ['travel_date']
    for col in date_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], format='mixed', errors='coerce')

    categorical_columns = ['sex_enc', 'marital_status_enc', 'us_state_enc',
                           'employment_status_enc', 'purpose_simple', 'accomd_type_enc']
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype('category')

    return df

def _format_mean_std(series, prefix='', suffix='', decimals=1):
    """Return ``(text, n)``: 'mean ± std' text and the non-missing count of a Series."""
    numeric = pd.to_numeric(series, errors='coerce')
    n_valid = int(numeric.notna().sum())
    if n_valid == 0:
        return "N/A", 0
    mean, std = numeric.mean(), numeric.std()
    text = f"{prefix}{mean:.{decimals}f}{suffix} ± {prefix}{std:.{decimals}f}{suffix}"
    return text, n_valid


def _stat_row(category, characteristic, value, n_count):
    """Build one summary-statistic row (no percentage)."""
    return {
        'Category': category,
        'Characteristic': characteristic,
        'Value': value,
        'N': n_count
    }


def _header_row(category, characteristic):
    """Build the blank heading row that introduces a group of count rows."""
    return {
        'Category': category,
        'Characteristic': characteristic,
        'Value': '',
        'N': '',
        'Percentage': ''
    }


def _count_rows(category, counts, total_n):
    """Build one indented row per label in ``counts`` with its share of ``total_n``."""
    return [
        {
            'Category': category,
            'Characteristic': "   " + str(label),
            'Value': '',
            'N': int(count),
            'Percentage': f"{count / total_n * 100:.1f}%"
        }
        for label, count in counts.items()
    ]


def create_table1(df):
    """Create Table 1 (descriptive statistics with interactions).

    The table contains, in order: age; four state-level numeric summaries;
    top-5 frequency breakdowns of the encoded categorical columns; top-5
    gender x marital-status combinations; overall (capped) length of stay; and
    length of stay for the five most frequent purposes.

    Numeric rows report ``mean ± std`` over non-missing values and ``N`` as the
    non-missing count ("N/A" and 0 when a column is absent or entirely
    missing). Percentages use the total number of rows as the denominator.
    Rows with a missing sex or marital status are left out of the combination
    counts. Optional columns that are absent are skipped with a printed
    warning.

    Args:
        df: Input data. It is only read; no columns are added or converted.

    Returns:
        A new DataFrame with one row per table line.

    Raises:
        KeyError: If the ``age`` column is missing.
    """
    results = []
    total_n = len(df)

    print("Calculating age statistics...")
    value, n_count = _format_mean_std(df['age'])
    results.append(_stat_row('Demographics', 'Age', value, n_count))

    # (column, row label, formatting options) for the state-level summaries.
    numeric_specs = [
        ('state_percapita_income', 'State Per Capita Income',
         {'prefix': '$', 'decimals': 2}),
        ('state_unemployment', 'State Unemployment Rate', {'suffix': '%'}),
        ('immigrant_population', 'State Immigrant Population', {'decimals': 0}),
        ('import_from_slu', 'State Imports from St. Lucia',
         {'prefix': '$', 'decimals': 2}),
    ]
    for column, label, fmt in numeric_specs:
        if column in df.columns:
            value, n_count = _format_mean_std(df[column], **fmt)
        else:
            value, n_count = "N/A", 0
        results.append(_stat_row('Demographics', label, value, n_count))

    categorical_vars = {
        'Demographics': ['sex_enc', 'marital_status_enc', 'employment_status_enc'],
        'Travel': ['purpose_simple', 'accomd_type_enc', 'us_state_enc']
    }

    print("Processing categorical variables...")
    for category, variables in categorical_vars.items():
        for var in variables:
            if var not in df.columns:
                print(f"Warning: Column {var} not found in DataFrame")
                continue

            top_counts = df[var].value_counts().head(5)
            results.append(
                _header_row(category, f"{var.title().replace('_', ' ')} (Top 5)")
            )
            results.extend(_count_rows(category, top_counts, total_n))

    print("Analyzing Gender-Marital Status combinations...")
    if 'sex_enc' in df.columns and 'marital_status_enc' in df.columns:
        # Only combine rows where both values are known; casting a missing
        # value to str would otherwise create bogus "nan"/"None" labels.
        both_known = df['sex_enc'].notna() & df['marital_status_enc'].notna()
        combos = (
            df.loc[both_known, 'sex_enc'].astype(str)
            + ' - '
            + df.loc[both_known, 'marital_status_enc'].astype(str)
        )
        results.append(
            _header_row('Demographics', 'Gender-Marital Status Combinations (Top 5)')
        )
        results.extend(
            _count_rows('Demographics', combos.value_counts().head(5), total_n)
        )
    else:
        print("Warning: Cannot create gender-marital combinations - required columns missing")

    print("Calculating length of stay statistics...")
    if 'los_capped' in df.columns:
        length_of_stay = pd.to_numeric(df['los_capped'], errors='coerce')

        value, n_count = _format_mean_std(length_of_stay)
        results.append(_stat_row('Travel', 'Length of Stay (days)', value, n_count))

        print("Analyzing length of stay by purpose of visit...")
        if 'purpose_simple' in df.columns:
            results.append(_header_row('Travel', 'Length of Stay by Purpose (Top 5)'))
            for purpose in df['purpose_simple'].value_counts().head(5).index:
                in_purpose = df['purpose_simple'] == purpose
                # N is the number of non-missing stays, matching the overall row.
                value, n_count = _format_mean_std(length_of_stay[in_purpose])
                results.append(_stat_row('Travel', f"   {purpose}", value, n_count))
        else:
            # The column checked in this cell is purpose_simple, so name it.
            print("Warning: Cannot analyze length of stay by purpose - purpose_simple column missing")
    else:
        print("Warning: Cannot calculate length of stay - los_capped column missing")

    return pd.DataFrame(results)

def show_message(title, message, error=False):
    """Display a modal message box on top of other windows.

    Args:
        title: Caption of the message box.
        message: Text shown in the box.
        error: When true an error box is shown, otherwise an info box.
    """
    hidden_root = tk.Tk()
    hidden_root.withdraw()
    hidden_root.attributes('-topmost', True)
    try:
        # Pick the dialog flavour, then show it; the hidden root window is
        # destroyed afterwards whether or not the dialog call succeeds.
        dialog = messagebox.showerror if error else messagebox.showinfo
        dialog(title, message)
    finally:
        hidden_root.destroy()

def main():
    """Run the interactive Table 1 workflow for the post-capped arrivals data.

    Prompts for an input file, loads the required columns, normalises dtypes
    with ``optimize_dataframe``, builds the table with ``create_table1``,
    prompts for an output location and writes the table as ``.xlsx`` or
    ``.csv``. Any exception is printed and shown in an error dialog instead of
    propagating.
    """
    required_columns = [
        'sex_enc', 'marital_status_enc', 'us_state_enc',
        'employment_status_enc', 'purpose_simple', 'accomd_type_enc',
        'age', 'los_capped', 'state_percapita_income',
        'state_unemployment', 'immigrant_population', 'import_from_slu'
    ]

    try:
        # File type definitions
        excel_types = [
            ('Excel files', '*.xlsx *.xls *.csv'),
            ('All files', '*.*')
        ]

        # Get input file
        input_file = select_file(
            title='Select Excel Data File',
            file_types=excel_types
        )

        if not input_file:
            print("No file selected. Exiting.")
            return

        # Read the data. The dialog also offers CSV files, which read_excel
        # cannot parse, so dispatch on the file extension.
        print("Reading data...")
        if Path(input_file).suffix.lower() == '.csv':
            df = pd.read_csv(input_file, usecols=required_columns)
        else:
            df = pd.read_excel(input_file, usecols=required_columns)

        # Optimize DataFrame
        print("Optimizing data structure...")
        df = optimize_dataframe(df)

        # Create Table 1
        print("Generating Table 1...")
        table1_df = create_table1(df)

        # Get output file
        save_types = [
            ('Excel files', '*.xlsx'),
            ('CSV files', '*.csv'),
            ('All files', '*.*')
        ]

        output_file = select_file(
            title='Save Table 1 As',
            file_types=save_types,
            save=True
        )

        if not output_file:
            print("No output location selected. Exiting.")
            return

        # Ensure proper file extension
        if not output_file.endswith(('.xlsx', '.csv')):
            output_file += '.xlsx'

        # Save based on extension
        print("Saving results...")
        if output_file.endswith('.xlsx'):
            # Local import: only the Excel branch needs it.
            from openpyxl.utils import get_column_letter

            with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
                table1_df.to_excel(writer, sheet_name='Table 1', index=False)

                # Auto-adjust column widths to the longest cell or header.
                worksheet = writer.sheets['Table 1']
                for position, col in enumerate(table1_df.columns, start=1):
                    max_length = max(
                        table1_df[col].astype(str).apply(len).max(),
                        len(col)
                    )
                    # get_column_letter stays correct past column Z,
                    # unlike chr(65 + idx).
                    letter = get_column_letter(position)
                    worksheet.column_dimensions[letter].width = max_length + 2
        else:
            table1_df.to_csv(output_file, index=False)

        print(f"Table 1 has been saved to: {output_file}")
        show_message("Success", f"Table 1 has been successfully created and saved to:\n{output_file}")

    except Exception as e:
        # Top-level boundary of an interactive tool: report instead of crashing.
        error_msg = f"An error occurred:\n{str(e)}"
        print(error_msg)
        show_message("Error", error_msg, error=True)

# Start the interactive workflow only when this code runs as the main module.
if __name__ == "__main__":
    main()

2025-05-15 15:55:27.704 python[9116:16836455] The class 'NSOpenPanel' overrides the method identifier.  This method is implemented by class 'NSWindow'


Reading data...
Optimizing data structure...
Generating Table 1...
Calculating age statistics...
Processing categorical variables...
Analyzing Gender-Marital Status combinations...
Calculating length of stay statistics...
Analyzing length of stay by purpose of visit...


2025-05-15 15:55:42.224 python[9116:16836455] The class 'NSSavePanel' overrides the method identifier.  This method is implemented by class 'NSWindow'


Saving results...
Table 1 has been saved to: /Users/janai/Library/CloudStorage/OneDrive-SharedLibraries-jlconsulting.llc/Projects - Documents/Research/Saint Lucia Tourism Piece/2.0 Descriptive statistics/final file descriptive statistics.xlsx
