In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import tkinter as tk
from tkinter import filedialog, messagebox
import json
import re
from pathlib import Path

def _default_extension(file_types):
    """Return the first concrete extension (e.g. '.xlsx') of the first filter, or ''."""
    if not file_types:
        return ''
    patterns = file_types[0][1]
    # Tk accepts either one space-separated string or a sequence of globs.
    if isinstance(patterns, str):
        patterns = patterns.split()
    for pattern in patterns:
        extension = pattern.lstrip('*')
        # Skip wildcard-only patterns such as '*.*' or '*'.
        if extension.startswith('.') and '*' not in extension and '?' not in extension:
            return extension
    return ''


def select_file(title, file_types, save=False):
    """Show an open or save-as file dialog and return the chosen path.

    Args:
        title: Text for the dialog's title bar.
        file_types: Sequence of ``(label, pattern)`` tuples passed to the
            dialog as ``filetypes``. A pattern may contain several
            space-separated globs, e.g. ``'*.xlsx *.xls'``.
        save: If true, show a "save as" dialog; otherwise an "open" dialog.

    Returns:
        The selected path, or ``None`` when the user cancels the dialog.
    """
    # A hidden, always-on-top root keeps the dialog from opening behind
    # other windows and avoids showing an empty Tk window.
    root = tk.Tk()
    root.withdraw()
    root.attributes('-topmost', True)

    try:
        if save:
            options = {'title': title, 'filetypes': file_types}
            # The default extension must be a plain suffix such as '.xlsx';
            # passing the raw glob string ('*.xlsx *.xls') is not a valid one.
            extension = _default_extension(file_types)
            if extension:
                options['defaultextension'] = extension
            file_path = filedialog.asksaveasfilename(**options)
        else:
            file_path = filedialog.askopenfilename(
                title=title,
                filetypes=file_types
            )
    finally:
        # Always tear the hidden root down, even if the dialog raised.
        root.destroy()

    return file_path if file_path else None

def clean_destinations(destinations_series):
    """Map each raw destinations record to a place-of-stay label.

    Each element is expected to be a string holding a JSON (or Python-literal)
    list of objects, or a single such object. The ``Accommodation_Type`` of
    the first object decides the label:

    * contains ``HOTEL`` -> ``'Hotel'``
    * contains a private-home / rental keyword -> ``'Sharing'``
    * contains ``OTHER`` -> ``'Other'``
    * anything else, including missing or unparseable records -> ``'Sharing'``

    Args:
        destinations_series: pandas Series of raw destination strings.

    Returns:
        A Series of the same length and index containing the labels.
    """
    import ast  # local import: only the literal fallback below needs it

    fallback = 'Sharing'
    # 'CONDOMENIUM' is kept as originally written; the correct spelling is
    # accepted as well so either form in the data is recognised.
    sharing_keywords = ('PRIVATE HOME', 'ALTERNATIVE', 'COTTAGE',
                        'CONDOMENIUM', 'CONDOMINIUM', 'APARTMENT')

    def parse_record(raw):
        """Parse *raw* as JSON, then as a Python literal, then as quote-fixed JSON."""
        text = raw.strip()
        if not text.startswith('['):
            text = f'[{text}]'
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass
        try:
            # Handles single-quoted dict reprs, including values that contain
            # apostrophes or Python literals such as None/True/False.
            return ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass
        # Last resort: the original blanket quote rewrite (covers single-quoted
        # text that uses JSON literals such as null/true).
        return json.loads(text.replace("'", '"'))

    def extract_accommodation(raw):
        if not isinstance(raw, str):
            return fallback  # NaN / missing cell
        try:
            data = parse_record(raw)
        except json.JSONDecodeError:
            return fallback
        if not isinstance(data, list) or not data or not isinstance(data[0], dict):
            return fallback

        accom_type = data[0].get('Accommodation_Type', '')
        if not isinstance(accom_type, str):
            return fallback
        accom_type = accom_type.upper()

        # Order matters: hotel wins, then the sharing keywords, then 'OTHER'.
        if 'HOTEL' in accom_type:
            return 'Hotel'
        if any(keyword in accom_type for keyword in sharing_keywords):
            return 'Sharing'
        if 'OTHER' in accom_type:
            return 'Other'
        return fallback

    # Element-wise apply; each record needs its own parse.
    return destinations_series.apply(extract_accommodation)

def optimize_dataframe(df):
    """Convert known date columns to datetimes and known label columns to categories.

    Only columns that are actually present in ``df`` are touched. Date values
    that cannot be parsed become ``NaT``. The frame is modified in place and
    the same object is returned.
    """
    datetime_fields = ('DOB', 'TRAVEL DATE', 'RETURN DATE')
    category_fields = (
        'GENDER', 'MARITAL STATUS', 'NATIONALITY',
        'PURPOSE OF VISIT', 'CARRIER NAME', 'EMBARK COUNTRY',
        'OCCUPATION', 'COUNTRY OF BIRTH', 'COUNTRY OF RESIDENCE',
        'STATE PROVINCE',
    )

    # Dates first, parsed element by element so mixed formats are tolerated.
    present_dates = [name for name in datetime_fields if name in df.columns]
    for name in present_dates:
        df[name] = pd.to_datetime(df[name], format='mixed', errors='coerce')

    # Then the label columns, stored as pandas categoricals.
    present_categories = [name for name in category_fields if name in df.columns]
    for name in present_categories:
        df[name] = df[name].astype('category')

    return df

def calculate_age_vectorized(dob_series, reference_date=None):
    """Return fractional ages in years for a Series of birth dates.

    Age is the whole number of days between each birth date and the reference
    date, divided by 365.25.

    Args:
        dob_series: Series of datetime values (``NaT`` allowed).
        reference_date: Date the ages are measured at. Anything accepted by
            ``pd.Timestamp``; defaults to the current local time, which is
            the original behaviour. Pass a fixed date for reproducible output.

    Returns:
        A float Series aligned with ``dob_series``; missing dates give NaN.
    """
    if reference_date is None:
        reference = pd.Timestamp.now()
    else:
        reference = pd.Timestamp(reference_date)

    elapsed_days = (reference - dob_series).dt.days
    return elapsed_days / 365.25

def create_descriptive_stats(df):
    """Build a table of descriptive statistics for the arrivals data.

    The table contains, in order: an Age row (mean ± SD), a "Top 5" section
    for every categorical variable present in ``df``, and a Length of Stay
    row (mean ± SD).

    Side effects: adds the columns ``'AGE'`` and ``'LENGTH_OF_STAY'`` to
    ``df`` in place.

    Args:
        df: DataFrame with datetime columns ``'DOB'``, ``'TRAVEL DATE'`` and
            ``'RETURN DATE'``; the categorical columns are optional.

    Returns:
        A DataFrame with columns Category, Characteristic, Value, N and
        Percentage.

    Raises:
        KeyError: If ``'DOB'``, ``'TRAVEL DATE'`` or ``'RETURN DATE'`` is
            missing from ``df``.
    """
    results = []
    total_n = len(df)

    def numeric_summary(category, label, column):
        """Return a 'mean ± SD' row whose N counts only non-missing values."""
        stats = df[column].agg(['mean', 'std'])
        return {
            'Category': category,
            'Characteristic': label,
            'Value': f"{stats['mean']:.1f} ± {stats['std']:.1f}",
            'N': int(df[column].notna().sum()),
        }

    # Age: N is the number of rows with a usable DOB, matching how the
    # Length of Stay row counts its observations.
    df['AGE'] = calculate_age_vectorized(df['DOB'])
    results.append(numeric_summary('Demographics', 'Age', 'AGE'))

    # Categorical variables, grouped by report section.
    categorical_vars = {
        'Demographics': ['GENDER', 'MARITAL STATUS', 'NATIONALITY', 'OCCUPATION',
                        'COUNTRY OF BIRTH', 'COUNTRY OF RESIDENCE', 'STATE PROVINCE'],
        'Travel': ['PURPOSE OF VISIT', 'CARRIER NAME', 'EMBARK COUNTRY', 'Place of stay']
    }

    for category, variables in categorical_vars.items():
        for var in variables:
            if var not in df.columns:
                continue

            top_counts = df[var].value_counts().head(5)

            # Section header row.
            results.append({
                'Category': category,
                'Characteristic': f"{var.title().replace('_', ' ')} (Top 5)",
                'Value': '',
                'N': '',
                'Percentage': ''
            })

            # One indented row per level; percentages use all rows (including
            # missing values) as the denominator. Computed per count so no
            # label-based lookup into the index is needed.
            for val, count in top_counts.items():
                results.append({
                    'Category': category,
                    'Characteristic': "   " + str(val),
                    'Value': '',
                    'N': count,
                    'Percentage': f"{count / total_n * 100:.1f}%"
                })

    # Length of stay in whole days between travel and return dates.
    df['LENGTH_OF_STAY'] = (df['RETURN DATE'] - df['TRAVEL DATE']).dt.days
    results.append(numeric_summary('Travel', 'Length of Stay (days)', 'LENGTH_OF_STAY'))

    return pd.DataFrame(results)

def main():
    """Run the interactive workflow: load, clean, summarise and save the data.

    Steps: ask for an input workbook, read the required columns, convert
    dtypes, derive 'Place of stay', build the descriptive statistics, then
    ask where to save the processed data and the statistics. Cancelling the
    input dialog exits; cancelling a save dialog skips that output.

    Any exception is printed and shown in an error dialog rather than
    propagated.
    """
    try:
        # File type definitions
        excel_types = [
            ('Excel files', '*.xlsx *.xls'),
            ('All files', '*.*')
        ]
        # Outputs are offered as .xlsx only: the statistics writer below uses
        # the openpyxl engine, which rejects the legacy .xls extension, and
        # pandas 2.x no longer ships an .xls writer.
        save_types = [
            ('Excel workbook', '*.xlsx'),
            ('All files', '*.*')
        ]

        # Get input file
        input_file = select_file(
            title='Select Tourism Data Excel File',
            file_types=excel_types
        )

        if not input_file:
            print("No file selected. Exiting.")
            return

        # Read only the columns the analysis needs
        print("Reading data...")
        columns_to_keep = [
            'CARRIER NAME', 'EMBARK COUNTRY', 'TRAVEL DATE', 'RETURN DATE',
            'DOB', 'GENDER', 'NATIONALITY', 'MARITAL STATUS', 'OCCUPATION',
            'COUNTRY OF BIRTH', 'COUNTRY OF RESIDENCE', 'STATE PROVINCE',
            'PURPOSE OF VISIT', 'DESTINATIONS'
        ]

        df = pd.read_excel(input_file, usecols=columns_to_keep)

        # Convert date and categorical columns
        print("Processing data...")
        df = optimize_dataframe(df)

        # Derive the place-of-stay label from the raw destinations text
        print("Cleaning destinations data...")
        df['Place of stay'] = clean_destinations(df['DESTINATIONS'])

        # Create descriptive statistics (also adds AGE / LENGTH_OF_STAY to df)
        print("Generating descriptive statistics...")
        stats_df = create_descriptive_stats(df)

        # Save processed data
        print("Saving processed data...")
        processed_file = select_file(
            title='Save Processed Data As',
            file_types=save_types,
            save=True
        )

        if processed_file:
            df.to_excel(processed_file, index=False)
            print(f"Processed data saved to: {processed_file}")

        # Save descriptive statistics
        print("Saving descriptive statistics...")
        stats_file = select_file(
            title='Save Descriptive Statistics As',
            file_types=save_types,
            save=True
        )

        if stats_file:
            with pd.ExcelWriter(stats_file, engine='openpyxl') as writer:
                stats_df.to_excel(writer, sheet_name='Descriptive Statistics', index=False)

                # Auto-adjust column widths to the longest cell or header.
                # chr(65 + idx) yields 'A'..'Z', which is enough for the
                # five-column statistics table.
                worksheet = writer.sheets['Descriptive Statistics']
                for idx, col in enumerate(stats_df.columns):
                    max_length = max(
                        stats_df[col].astype(str).apply(len).max(),
                        len(col)
                    )
                    worksheet.column_dimensions[chr(65 + idx)].width = max_length + 2

            print(f"Descriptive statistics saved to: {stats_file}")

        print("Processing completed successfully!")

    except Exception as e:
        # Top-level boundary: report the failure to both console and user.
        print(f"An error occurred: {str(e)}")
        # Give the message box an explicit hidden parent; without one Tk
        # creates a default root that lingers as an empty window.
        dialog_root = tk.Tk()
        dialog_root.withdraw()
        dialog_root.attributes('-topmost', True)
        try:
            messagebox.showerror(
                "Error",
                f"An error occurred:\n{str(e)}",
                parent=dialog_root
            )
        finally:
            dialog_root.destroy()

# Run the workflow only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Reading data...
Processing data...
Cleaning destinations data...
Generating descriptive statistics...
Saving processed data...
Processed data saved to: C:/Users/janai/jlconsulting.llc/Projects - Documents/Research/Saint Lucia Tourism Piece/Clean data/Clean arrivals.xlsx
Saving descriptive statistics...
Descriptive statistics saved to: C:/Users/janai/jlconsulting.llc/Projects - Documents/Research/Saint Lucia Tourism Piece/Clean data/Descriptive stats.xlsx
Processing completed successfully!
