In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import tkinter as tk
from tkinter import filedialog, messagebox
from pathlib import Path

def _default_extension(file_types):
    """Return the extension (e.g. ``'.xlsx'``) of the first concrete pattern.

    Only the first ``(label, pattern)`` entry is inspected. Its pattern may be
    a space-separated string (``'*.xlsx *.xls'``) or a sequence of patterns.
    Wildcard-only patterns such as ``'*.*'`` or ``'*'`` yield no extension.

    Returns:
        A string starting with ``'.'``, or ``''`` when no extension applies.
    """
    if not file_types:
        return ''
    patterns = file_types[0][1]
    if isinstance(patterns, str):
        patterns = patterns.split()
    for pattern in patterns:
        _, dot, extension = pattern.rpartition('.')
        if dot and extension and '*' not in extension:
            return '.' + extension
    return ''


def select_file(title, file_types, save=False):
    """Show a file dialog and return the path the user picked.

    A hidden, always-on-top Tk root window is created for the lifetime of the
    dialog and destroyed afterwards, even if the dialog raises.

    Args:
        title: Caption shown on the dialog window.
        file_types: Sequence of ``(label, pattern)`` tuples passed to the
            dialog as ``filetypes``, e.g. ``[('Excel files', '*.xlsx *.xls')]``.
        save: When true, show a "save as" dialog; otherwise an "open" dialog.

    Returns:
        The selected path, or ``None`` when nothing was selected.
    """
    root = tk.Tk()
    root.withdraw()
    root.attributes('-topmost', True)

    try:
        if save:
            options = {'title': title, 'filetypes': file_types}
            # Tk's defaultextension expects an extension such as '.xlsx',
            # not a glob pattern like '*.xlsx', so derive it from the pattern.
            extension = _default_extension(file_types)
            if extension:
                options['defaultextension'] = extension
            file_path = filedialog.asksaveasfilename(**options)
        else:
            file_path = filedialog.askopenfilename(
                title=title,
                filetypes=file_types
            )
    finally:
        root.destroy()

    # A cancelled dialog yields a falsy value; normalise it to None.
    return file_path if file_path else None

def optimize_dataframe(df):
    """Convert known columns to datetime and category dtypes.

    Columns that are absent from ``df`` are skipped. The frame is modified
    in place and the same object is returned.

    Args:
        df: DataFrame that may contain the date and categorical columns
            listed below.

    Returns:
        ``df`` itself, with the recognised columns converted.
    """
    datetime_fields = ('DOB', 'TRAVEL DATE', 'RETURN DATE')
    category_fields = (
        'GENDER', 'MARITAL STATUS', 'NATIONALITY',
        'PURPOSE OF VISIT', 'CARRIER TYPE', 'CARRIER NAME',
        'EMBARK PORT',
    )

    for name in datetime_fields:
        if name not in df.columns:
            continue
        # Unparseable values become NaT instead of raising.
        df[name] = pd.to_datetime(df[name], format='mixed', errors='coerce')

    for name in category_fields:
        if name not in df.columns:
            continue
        df[name] = df[name].astype('category')

    return df

def calculate_age_vectorized(dob_series, reference_date=None):
    """Return ages in fractional years for a Series of birth dates.

    The age is the whole number of days between each birth date and the
    reference date, divided by 365.25. Missing birth dates yield NaN.

    Args:
        dob_series: Datetime Series of dates of birth.
        reference_date: Date to measure ages against; anything accepted by
            ``pd.Timestamp``. Defaults to the current local time.

    Returns:
        A float Series of ages aligned with ``dob_series``.
    """
    if reference_date is None:
        reference = pd.Timestamp.now()
    else:
        reference = pd.Timestamp(reference_date)
    elapsed_days = (reference - dob_series).dt.days
    return elapsed_days / 365.25

def _table_row(category, characteristic, value='', n='', percentage=''):
    """Build one Table 1 row; cells that do not apply are empty strings."""
    return {
        'Category': category,
        'Characteristic': characteristic,
        'Value': value,
        'N': n,
        'Percentage': percentage,
    }


def _mean_sd(series):
    """Format a numeric Series as ``mean ± std`` with one decimal place."""
    return f"{series.mean():.1f} ± {series.std():.1f}"


def _top_value_rows(series, category, total_n, limit=5):
    """Return indented rows for the ``limit`` most frequent values of a Series."""
    counts = series.value_counts().head(limit)
    percentages = counts / total_n * 100
    # Pair counts and percentages by position rather than by label lookup.
    return [
        _table_row(
            category,
            "   " + str(label),
            n=int(count),
            percentage=f"{share:.1f}%",
        )
        for (label, count), share in zip(counts.items(), percentages)
    ]


def create_table1(df):
    """Create Table 1 with interactions.

    Summarises age, the five most frequent values of each categorical
    variable, gender/marital-status combinations, and length of stay
    (overall and for the five most frequent purposes of visit).

    Side effects: adds the columns ``AGE``, ``GENDER_MARITAL`` and
    ``LENGTH_OF_STAY`` to ``df``.

    Args:
        df: DataFrame containing ``DOB``, ``GENDER``, ``MARITAL STATUS``,
            ``NATIONALITY``, ``PURPOSE OF VISIT``, ``CARRIER TYPE``,
            ``CARRIER NAME``, ``EMBARK PORT``, ``TRAVEL DATE`` and
            ``RETURN DATE``. The date columns must already be datetimes.

    Returns:
        A DataFrame with columns ``Category``, ``Characteristic``, ``Value``,
        ``N`` and ``Percentage``. Percentages use the total row count as
        denominator; ``N`` for mean ± std rows counts non-missing values.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    results = []
    total_n = len(df)

    # Calculate age
    print("Calculating age statistics...")
    df['AGE'] = calculate_age_vectorized(df['DOB'])
    age = df['AGE']
    # N is the number of ages that actually entered the mean, not all rows.
    results.append(_table_row(
        'Demographics', 'Age', _mean_sd(age), int(age.notna().sum())
    ))

    # Process categorical variables
    categorical_vars = {
        'Demographics': ['GENDER', 'MARITAL STATUS', 'NATIONALITY'],
        'Travel': ['PURPOSE OF VISIT', 'CARRIER TYPE', 'CARRIER NAME', 'EMBARK PORT']
    }

    print("Processing categorical variables...")
    for category, variables in categorical_vars.items():
        for var in variables:
            results.append(_table_row(
                category, f"{var.title().replace('_', ' ')} (Top 5)"
            ))
            results.extend(_top_value_rows(df[var], category, total_n))

    # Create and analyze Gender-Marital Status combinations
    print("Analyzing Gender-Marital Status combinations...")
    # Categorical columns do not support '+', so concatenate as strings;
    # a missing value on either side leaves the combination missing.
    df['GENDER_MARITAL'] = (
        df['GENDER'].astype('string') + ' - ' + df['MARITAL STATUS'].astype('string')
    )
    results.append(_table_row(
        'Demographics', 'Gender-Marital Status Combinations (Top 5)'
    ))
    results.extend(_top_value_rows(df['GENDER_MARITAL'], 'Demographics', total_n))

    # Calculate length of stay and its interaction with purpose of visit
    print("Calculating length of stay statistics...")
    df['LENGTH_OF_STAY'] = (df['RETURN DATE'] - df['TRAVEL DATE']).dt.days
    stay = df['LENGTH_OF_STAY']

    # Overall length of stay
    results.append(_table_row(
        'Travel', 'Length of Stay (days)', _mean_sd(stay), int(stay.notna().sum())
    ))

    # Length of stay by top 5 purposes
    print("Analyzing length of stay by purpose of visit...")
    top_5_purposes = df['PURPOSE OF VISIT'].value_counts().head(5).index
    results.append(_table_row('Travel', 'Length of Stay by Purpose (Top 5)'))

    for purpose in top_5_purposes:
        purpose_stay = stay[df['PURPOSE OF VISIT'] == purpose]
        # count() ignores missing stays, matching the overall row's N.
        results.append(_table_row(
            'Travel',
            f"   {purpose}",
            _mean_sd(purpose_stay),
            int(purpose_stay.count()),
        ))

    return pd.DataFrame(results)

def show_message(title, message, error=False):
    """Display a modal message box on top of other windows.

    Args:
        title: Caption of the message box.
        message: Text shown in the message box.
        error: When true, show an error box; otherwise an information box.
    """
    hidden_root = tk.Tk()
    hidden_root.withdraw()
    hidden_root.attributes('-topmost', True)
    try:
        dialog = messagebox.showerror if error else messagebox.showinfo
        dialog(title, message)
    finally:
        # Always tear down the hidden root, even if the dialog raises.
        hidden_root.destroy()

def _autosize_columns(worksheet, table):
    """Set each worksheet column's width to fit its longest header or cell."""
    for position, col in enumerate(table.columns, start=1):
        cell_lengths = table[col].astype(str).str.len()
        longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0
        max_length = max(longest_cell, len(str(col)))
        # Ask the header cell for its letter so columns past 'Z' also work.
        letter = worksheet.cell(row=1, column=position).column_letter
        worksheet.column_dimensions[letter].width = max_length + 2


def main():
    """Run the interactive Table 1 workflow.

    Prompts for an Excel input file, reads the required columns, builds
    Table 1, prompts for an output location, and writes the table as Excel
    or CSV depending on the chosen extension. Progress is printed, and the
    final outcome (success or any error) is also shown in a message box.
    Cancelling either dialog ends the run without writing anything.
    """
    try:
        # File type definitions
        excel_types = [
            ('Excel files', '*.xlsx *.xls'),
            ('All files', '*.*')
        ]

        # Get input file
        input_file = select_file(
            title='Select Excel Data File',
            file_types=excel_types
        )

        if not input_file:
            print("No file selected. Exiting.")
            return

        # Read the data
        print("Reading data...")
        df = pd.read_excel(
            input_file,
            usecols=['DOB', 'GENDER', 'MARITAL STATUS', 'NATIONALITY',
                    'PURPOSE OF VISIT', 'CARRIER TYPE', 'CARRIER NAME',
                    'EMBARK PORT', 'TRAVEL DATE', 'RETURN DATE']
        )

        # Optimize DataFrame
        print("Optimizing data structure...")
        df = optimize_dataframe(df)

        # Create Table 1
        print("Generating Table 1...")
        table1_df = create_table1(df)

        # Get output file
        save_types = [
            ('Excel files', '*.xlsx'),
            ('CSV files', '*.csv'),
            ('All files', '*.*')
        ]

        output_file = select_file(
            title='Save Table 1 As',
            file_types=save_types,
            save=True
        )

        if not output_file:
            print("No output location selected. Exiting.")
            return

        # Ensure proper file extension. Compare case-insensitively so that
        # a name like "report.XLSX" is not turned into "report.XLSX.xlsx".
        if not output_file.lower().endswith(('.xlsx', '.csv')):
            output_file += '.xlsx'

        # Save based on extension
        print("Saving results...")
        if output_file.lower().endswith('.xlsx'):
            with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
                table1_df.to_excel(writer, sheet_name='Table 1', index=False)

                # Auto-adjust column widths
                _autosize_columns(writer.sheets['Table 1'], table1_df)
        else:
            table1_df.to_csv(output_file, index=False)

        print(f"Table 1 has been saved to: {output_file}")
        show_message("Success", f"Table 1 has been successfully created and saved to:\n{output_file}")

    except Exception as e:
        # Top-level boundary: report any failure to the console and the user.
        error_msg = f"An error occurred:\n{str(e)}"
        print(error_msg)
        show_message("Error", error_msg, error=True)

# Start the interactive workflow only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()