# Qualtrics Processing Pipeline

## Step 1: Load Qualtrics Excel File

### Universal Qualtrics Excel File Loader

In [None]:
# Step 1: Universal Qualtrics Excel File Loader with GUI Selector
# This code is designed to work with any Qualtrics export regardless of survey content

import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import re
import tkinter as tk             # <-- ADDED for GUI
from tkinter import filedialog   # <-- ADDED for GUI
warnings.filterwarnings('ignore', category=UserWarning)

def load_qualtrics_export(file_path=None, sheet_name=0):
    """
    Load a Qualtrics Excel export file with a GUI selector and robust error handling.

    Parameters:
    -----------
    file_path : str or Path, optional
        Path to the Excel file. If None, a GUI file selector will open.
    sheet_name : str or int, default 0
        Sheet name or index to load from the Excel file. An unknown name or an
        index past the last sheet falls back to the first sheet with a warning.

    Returns:
    --------
    dict : Contains 'raw_data' (DataFrame), 'file_info' (dict), and 'quality_check' (dict)

    Raises:
    -------
    FileNotFoundError
        If the file dialog is dismissed without a selection, or the path does not exist.
    Exception
        If the workbook cannot be opened or parsed (the underlying error is chained).
    """

    print("=== Step 1: Loading Qualtrics Export ===")

    # If no file_path is provided, open a GUI file selector
    if file_path is None:
        root = tk.Tk()
        root.withdraw()  # Hide the main tkinter window

        try:
            print("Opening file selector...")
            file_path = filedialog.askopenfilename(
                title="Select the Qualtrics Excel Export",
                filetypes=[("Excel Files", "*.xlsx *.xls"), ("All files", "*.*")]
            )
        finally:
            # Always tear the hidden root window down so no Tk instance lingers
            root.destroy()

        if not file_path:  # Handle case where user closes the dialog
            raise FileNotFoundError("No file selected. Please run the script again.")

    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    print(f"Loading: {file_path.name}")

    # Load the Excel file with error handling
    try:
        # Open the workbook once; the context manager releases the file handle
        with pd.ExcelFile(file_path) as excel_file:
            sheet_names = excel_file.sheet_names
            print(f"Available sheets: {sheet_names}")

            # Determine which sheet to load
            if isinstance(sheet_name, str) and sheet_name not in sheet_names:
                print(f"Warning: Sheet '{sheet_name}' not found. Using first sheet: '{sheet_names[0]}'")
                sheet_name = 0
            elif isinstance(sheet_name, int) and sheet_name >= len(sheet_names):
                print(f"Warning: Sheet index {sheet_name} out of range. Using first sheet: '{sheet_names[0]}'")
                sheet_name = 0

            actual_sheet = sheet_names[sheet_name] if isinstance(sheet_name, int) else sheet_name
            print(f"Loading sheet: '{actual_sheet}'")

            # Parse from the already-open workbook so the engine pandas picked for
            # this file type is reused (a hard-coded openpyxl cannot read .xls)
            raw_df = excel_file.parse(sheet_name=sheet_name)

    except Exception as e:
        # Same exception type as before, but keep the original traceback attached
        raise Exception(f"Failed to load Excel file: {str(e)}") from e

    # File information
    file_info = {
        'filename': file_path.name,
        'file_size_mb': file_path.stat().st_size / (1024 * 1024),
        'sheet_loaded': actual_sheet,
        'available_sheets': sheet_names,
        'raw_shape': raw_df.shape
    }

    print(f"File loaded successfully:")
    print(f"  Size: {file_info['file_size_mb']:.1f} MB")
    print(f"  Dimensions: {raw_df.shape[0]:,} rows × {raw_df.shape[1]:,} columns")

    # Quality checks to identify Qualtrics structure
    quality_check = analyze_qualtrics_structure(raw_df)

    return {
        'raw_data': raw_df,
        'file_info': file_info,
        'quality_check': quality_check
    }

def analyze_qualtrics_structure(df):
    """
    Inspect a freshly loaded DataFrame for the column layout of a Qualtrics export.

    Parameters:
    -----------
    df : pandas.DataFrame
        Raw loaded data

    Returns:
    --------
    dict : Format verdict, detected metadata/question columns, column count and a
        list of potential issues. When the format is confirmed, the keys returned
        by analyze_header_structure are merged in as well.
    """

    print("\n=== Analyzing Qualtrics Structure ===")

    column_names = df.columns.tolist()

    # Metadata columns Qualtrics writes regardless of survey content
    known_metadata = (
        'StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
        'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
        'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
        'ExternalReference', 'LocationLatitude', 'LocationLongitude',
        'DistributionChannel', 'UserLanguage'
    )
    metadata_present = [name for name in known_metadata if name in column_names]

    # The format is accepted when at least three of the four key identifiers exist
    key_identifiers = ('ResponseId', 'StartDate', 'EndDate', 'Status')
    identifier_hits = len([key for key in key_identifiers if key in column_names])
    looks_like_qualtrics = identifier_hits >= 3

    # Question columns: labels beginning with "Q" followed by digits (any case)
    matches_question = re.compile(r'^Q\d+', re.IGNORECASE).match
    question_like = [name for name in column_names if matches_question(str(name))]

    quality_check = {
        'is_qualtrics_format': looks_like_qualtrics,
        'header_row_index': None,
        'data_start_row': None,
        'standard_qualtrics_columns': metadata_present,
        'question_columns': question_like,
        'total_columns': len(df.columns),
        'potential_issues': []
    }

    if looks_like_qualtrics:
        print("✓ Confirmed Qualtrics export format")
    else:
        quality_check['potential_issues'].append("Does not appear to be standard Qualtrics export format")
        print("⚠ Warning: File may not be a standard Qualtrics export")

    print(f"Standard Qualtrics columns found: {len(metadata_present)}")
    print(f"Question columns identified: {len(question_like)}")

    # Header/data row analysis only makes sense for a confirmed export
    if quality_check['is_qualtrics_format']:
        quality_check.update(analyze_header_structure(df))

    # Generic sanity checks
    row_count = df.shape[0]
    if row_count < 2:
        quality_check['potential_issues'].append("Very few rows - may not contain response data")

    fully_empty_columns = df.isnull().all().sum()
    if fully_empty_columns > len(df.columns) * 0.5:
        quality_check['potential_issues'].append("Many completely empty columns detected")

    # Report findings
    if not quality_check['potential_issues']:
        print("✓ No structural issues detected")
    else:
        print("\n⚠ Potential Issues Detected:")
        for problem in quality_check['potential_issues']:
            print(f"  - {problem}")

    return quality_check

def analyze_header_structure(df):
    """
    Work out where the header row and the response rows sit in a raw export.
    The function assumes question text in row 0 and responses from row 1, then
    sanity-checks that assumption against the data.

    Parameters:
    -----------
    df : pandas.DataFrame
        Raw Qualtrics data

    Returns:
    --------
    dict : 'header_row_index', 'data_start_row', 'response_type_column' and
        'preview_responses_detected'; a 'potential_issues' list is added only
        when row 0 does not look like question text.
    """

    status_present = 'Status' in df.columns

    # Preview responses are looked for in the first ten Status values only
    preview_found = False
    if status_present:
        leading_statuses = df['Status'].head(10).dropna().unique()
        preview_found = any('preview' in str(status).lower() for status in leading_statuses)
        if preview_found:
            print("✓ Preview responses detected in Status column")

    header_info = {
        'header_row_index': 0,  # question text is assumed to be in row 0
        'data_start_row': 1,    # responses are assumed to start at row 1
        'response_type_column': 'Status' if status_present else None,
        'preview_responses_detected': preview_found
    }

    # Nothing to verify against when there are no rows
    if len(df) == 0:
        return header_info

    # Long text in the first three non-null cells of row 0 suggests question wording
    leading_cells = df.iloc[0].dropna().head(3).tolist()
    if leading_cells:
        avg_text_length = np.mean([len(str(cell)) for cell in leading_cells])
    else:
        avg_text_length = 0

    if avg_text_length > 50:
        print("✓ Row 0 appears to contain question text (header row)")
        return header_info

    print("⚠ Row 0 may not contain typical Qualtrics question text")
    header_info.setdefault('potential_issues', []).append("Row 0 structure atypical for Qualtrics")
    return header_info

# Example usage and testing
if __name__ == "__main__":
    # Step 1 driver: load an export, then print a summary and a short preview.
    # Any failure is reported as a message instead of propagating a traceback.
    try:
        # Load the Qualtrics file (no path given, so the GUI file selector opens)
        result = load_qualtrics_export()

        # `result` is kept at module level; the Step 2a driver passes it to
        # extract_data_structure.
        raw_data = result['raw_data']
        file_info = result['file_info']
        quality_check = result['quality_check']

        print(f"\n=== Loading Summary ===")
        print(f"File: {file_info['filename']}")
        print(f"Qualtrics format: {quality_check['is_qualtrics_format']}")
        # Counts every row of the loaded sheet, including a question-text row if present
        print(f"Total responses: {raw_data.shape[0]:,}")
        print(f"Total columns: {raw_data.shape[1]:,}")
        print(f"Question columns: {len(quality_check['question_columns'])}")
        print(f"Standard metadata columns: {len(quality_check['standard_qualtrics_columns'])}")

        # Show a sample of the data structure (first 8 column names)
        print(f"\n=== Data Structure Preview ===")
        print("First few column names:")
        for i, col in enumerate(raw_data.columns[:8]):
            print(f"  {i}: {col}")
        if len(raw_data.columns) > 8:
            print(f"  ... and {len(raw_data.columns) - 8} more columns")

        # Preview the first five cells of row 0, truncating long values to 60 characters.
        # NOTE(review): iloc[0] raises IndexError on a sheet with no rows; that is
        # caught by the handler below and reported as a Step 1 error.
        print(f"\nFirst row sample (likely question text):")
        sample_row = raw_data.iloc[0].head(5)
        for col, val in sample_row.items():
            val_preview = str(val)[:60] + "..." if len(str(val)) > 60 else str(val)
            print(f"  {col}: {val_preview}")

        print("\n✓ Step 1 Complete: File loaded and analyzed successfully")

    except Exception as e:
        # Top-level boundary: report the problem and let the notebook continue
        print(f"❌ Error in Step 1: {str(e)}")
        print("Please check your file path and ensure it's a valid Qualtrics Excel export.")

##  Step 2: Data Structure Separation

### Extract and Analyze Data Structure

In [None]:
# Step 2a: Extract and Analyze Data Structure
# Objective: Separate codebook information from response data and identify the true structure

import pandas as pd
import numpy as np

def extract_data_structure(result_from_step1):
    """
    Analyze and extract the data structure from a Qualtrics export.
    Handles different Qualtrics export formats robustly.

    Parameters:
    -----------
    result_from_step1 : dict
        Result dictionary from Step 1; only its 'raw_data' DataFrame is read here.

    Returns:
    --------
    dict : Contains 'codebook', 'response_data', 'structure_analysis',
        'metadata_columns' and 'question_columns'. In 'structure_analysis',
        'response_data_start_row' is the row of the raw data at which responses
        begin: 1 when row 0 was split off as codebook, 0 otherwise.
    """

    print("=== Step 2a: Extracting Data Structure ===")

    raw_df = result_from_step1['raw_data']

    # Initialize structure analysis
    structure_analysis = {
        'codebook_source': None,
        'response_data_start_row': None,
        'header_type': None,
        'total_rows_analyzed': len(raw_df),
        'metadata_columns': [],
        'question_columns': [],
        'response_types_found': []
    }

    # Identify metadata vs question columns
    metadata_patterns = [
        'StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
        'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
        'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
        'ExternalReference', 'LocationLatitude', 'LocationLongitude',
        'DistributionChannel', 'UserLanguage'
    ]

    all_columns = raw_df.columns.tolist()
    metadata_cols = [col for col in all_columns if col in metadata_patterns]
    question_cols = [col for col in all_columns if col not in metadata_patterns]

    structure_analysis['metadata_columns'] = metadata_cols
    structure_analysis['question_columns'] = question_cols

    print(f"Identified {len(metadata_cols)} metadata columns")
    print(f"Identified {len(question_cols)} question/data columns")

    # Determine header structure and codebook location
    codebook_info = analyze_codebook_structure(raw_df, structure_analysis)
    structure_analysis.update(codebook_info)

    # Separate codebook from response data
    codebook_source = structure_analysis['codebook_source']
    row_0_messages = {
        # Traditional Qualtrics format - row 0 has question text
        'row_0': "Extracted codebook from row 0 (question text)",
        # Hybrid format - some info in row 0, but not full questions
        'mixed': "Extracted partial codebook from row 0 (mixed format)",
    }

    if codebook_source in row_0_messages:
        # Row 0 is split off as the codebook; responses start on the next row
        codebook_df = raw_df.iloc[[0]].copy()
        response_data = raw_df.iloc[1:].copy()
        structure_analysis['response_data_start_row'] = 1
        print(row_0_messages[codebook_source])
    else:
        # No usable text row: the column names double as the codebook and every
        # row is kept as response data
        codebook_df = pd.DataFrame({'column_id': all_columns, 'question_text': all_columns})
        response_data = raw_df.copy()
        structure_analysis['response_data_start_row'] = 0
        if codebook_source == 'column_names':
            print("Using column names as codebook (no separate question text row)")
        else:
            print("Using fallback codebook structure")

    # Reset response data index
    response_data.reset_index(drop=True, inplace=True)

    # Analyze response types and quality
    response_analysis = analyze_response_types(response_data, structure_analysis)
    structure_analysis.update(response_analysis)

    return {
        'codebook': codebook_df,
        'response_data': response_data,
        'structure_analysis': structure_analysis,
        'metadata_columns': metadata_cols,
        'question_columns': question_cols
    }

def analyze_codebook_structure(df, structure_info):
    """
    Decide where the codebook (question wording) lives in this export by
    inspecting the text found in row 0.

    Parameters:
    -----------
    df : pandas.DataFrame
        Raw data from Qualtrics
    structure_info : dict
        Current structure analysis (accepted for symmetry; not read here)

    Returns:
    --------
    dict : 'codebook_source', 'header_type' and 'codebook_quality'
    """

    print("\n--- Analyzing Codebook Structure ---")

    def _verdict(source, header, quality):
        # Assemble the result in one place so every exit returns the same keys
        return {
            'codebook_source': source,
            'header_type': header,
            'codebook_quality': quality
        }

    if len(df) == 0:
        print("Warning: No data rows to analyze")
        return _verdict('unknown', 'unknown', 'unknown')

    # Only the populated cells of row 0 are informative
    first_row_cells = df.iloc[0].dropna()
    if len(first_row_cells) == 0:
        print("Row 0 is completely empty")
        return _verdict('column_names', 'empty_row_0', 'unknown')

    # Text-length statistics of row 0
    cell_lengths = [len(str(cell)) for cell in first_row_cells]
    avg_length = np.mean(cell_lengths)
    max_length = max(cell_lengths)

    # Count cells (among the first 10) that read like a survey question
    question_cues = ['which of', 'please', 'identify', 'provide', '?']
    question_patterns = sum(
        1
        for cell in first_row_cells.head(10)
        if any(cue in str(cell).lower() for cue in question_cues)
    )

    print(f"Row 0 analysis:")
    print(f"  Average text length: {avg_length:.1f} characters")
    print(f"  Maximum text length: {max_length} characters")
    print(f"  Question-like patterns found: {question_patterns}")

    # Decision ladder, from strongest to weakest evidence
    if avg_length > 30 and question_patterns >= 2:
        print("✓ Row 0 contains full question text")
        return _verdict('row_0', 'full_questions', 'high')

    if avg_length > 15 and max_length > 20:
        print("✓ Row 0 contains descriptive headers (not full questions)")
        return _verdict('mixed', 'descriptive_headers', 'medium')

    if avg_length < 10:
        print("✓ Row 0 contains short labels - using column names as codebook")
        return _verdict('column_names', 'short_labels', 'low')

    print("? Mixed format detected - treating as partial codebook")
    return _verdict('mixed', 'mixed_format', 'medium')

def analyze_response_types(df, structure_info):
    """
    Tally the kinds of responses present (genuine vs preview/test/spam).

    Parameters:
    -----------
    df : pandas.DataFrame
        Response data (after removing codebook row if applicable)
    structure_info : dict
        Current structure analysis (not read here)

    Returns:
    --------
    dict : Response counts per type, test/genuine totals, the status column
        used, plus the keys returned by analyze_completion_patterns when the
        frame is not empty
    """

    print("\n--- Analyzing Response Types ---")

    row_total = len(df)
    response_info = {
        'total_responses': row_total,
        'response_types_found': [],
        'test_responses': 0,
        'genuine_responses': 0,
        'response_type_column': None
    }

    if row_total == 0:
        print("No response data to analyze")
        return response_info

    # Any column whose name contains "status" qualifies; the first one wins
    status_columns = [col for col in df.columns if 'status' in col.lower()]

    if not status_columns:
        print("No Status column found - assuming all responses are genuine")
        response_info['genuine_responses'] = len(df)
    else:
        status_col = status_columns[0]
        response_info['response_type_column'] = status_col

        status_counts = df[status_col].value_counts(dropna=False)
        response_info['response_types_found'] = status_counts.to_dict()

        test_keywords = ['preview', 'test', 'spam']
        print(f"Response types found in '{status_col}':")
        for label, tally in status_counts.items():
            share = (tally / len(df)) * 100
            print(f"  {label}: {tally} responses ({share:.1f}%)")

            # Missing status values are reported but counted in neither bucket
            if pd.isna(label):
                continue

            lowered = str(label).lower()
            if any(keyword in lowered for keyword in test_keywords):
                response_info['test_responses'] += tally
            else:
                response_info['genuine_responses'] += tally

    # Additional data quality checks
    response_info.update(analyze_completion_patterns(df))

    return response_info

def analyze_completion_patterns(df):
    """
    Analyze completion patterns and data quality indicators.

    The first column whose name contains "progress" / "duration" (any case) is
    used. Values are coerced to numbers first, so entries stored as text (for
    example "100") are counted and non-numeric cells are ignored.

    Parameters:
    -----------
    df : pandas.DataFrame
        Response data

    Returns:
    --------
    dict : 'has_progress_data', 'has_duration_data', 'completion_rate_available'
        and 'data_quality_indicators'; 'completion_rate' (percent of all rows
        with progress equal to 100) and 'median_duration_seconds' are added
        when the respective column holds at least one numeric value.
    """

    completion_info = {
        'has_progress_data': False,
        'has_duration_data': False,
        'completion_rate_available': False,
        'data_quality_indicators': []
    }

    # Check for progress tracking (str() keeps non-string column labels from crashing)
    progress_columns = [col for col in df.columns if 'progress' in str(col).lower()]
    if progress_columns:
        completion_info['has_progress_data'] = True
        progress_values = pd.to_numeric(df[progress_columns[0]], errors='coerce')

        # Analyze completion rates; at least one numeric value implies len(df) > 0
        if progress_values.notna().any():
            completion_info['completion_rate_available'] = True
            complete_responses = int((progress_values == 100).sum())
            completion_rate = (complete_responses / len(df)) * 100
            completion_info['completion_rate'] = completion_rate
            print(f"Survey completion rate: {completion_rate:.1f}%")

    # Check for duration data
    duration_columns = [col for col in df.columns if 'duration' in str(col).lower()]
    if duration_columns:
        completion_info['has_duration_data'] = True
        duration_values = pd.to_numeric(df[duration_columns[0]], errors='coerce')

        if duration_values.notna().any():
            median_duration = duration_values.median()
            completion_info['median_duration_seconds'] = median_duration
            print(f"Median completion time: {median_duration/60:.1f} minutes")

    # Data quality flags (an unavailable rate counts as 100, i.e. never flagged)
    if completion_info.get('completion_rate', 100) < 70:
        completion_info['data_quality_indicators'].append('Low completion rate')

    return completion_info

# Example usage
if __name__ == "__main__":
    # Step 2a driver: split the Step 1 result into codebook and responses,
    # then print a short summary.
    # This assumes 'result' exists from Step 1
    try:
        structure_result = extract_data_structure(result)

        codebook = structure_result['codebook']
        response_data = structure_result['response_data']
        analysis = structure_result['structure_analysis']

        print(f"\n=== Step 2a Summary ===")
        print(f"Codebook source: {analysis['codebook_source']}")
        print(f"Codebook shape: {codebook.shape}")
        print(f"Response data shape: {response_data.shape}")
        # .get() with defaults: these keys are only present once response analysis ran
        print(f"Total genuine responses: {analysis.get('genuine_responses', 'Unknown')}")
        print(f"Total test responses: {analysis.get('test_responses', 0)}")

        if analysis.get('completion_rate_available'):
            print(f"Completion rate: {analysis.get('completion_rate', 0):.1f}%")

        print("\n✓ Step 2a Complete: Data structure extracted and analyzed")

    except NameError:
        # Intended for a missing 'result'.
        # NOTE(review): this also catches a NameError raised inside
        # extract_data_structure, which would then be reported with this message.
        print("❌ Please run Step 1 first to create the 'result' variable")
    except Exception as e:
        # Top-level boundary: report and continue
        print(f"❌ Error in Step 2a: {str(e)}")

### Clean and Filter Response Data

In [None]:
# Step 2b: Clean and Filter Response Data
# Objective: Remove test responses, apply initial cleaning, create quality flags

import pandas as pd
import numpy as np
import re

def clean_and_filter_responses(structure_result):
    """
    Run the Step 2b cleaning pipeline on the response data: drop test responses,
    add quality flags, apply the universal cleaning rules and standardize the
    column names. The input frame is copied first and is not modified.

    Parameters:
    -----------
    structure_result : dict
        Result from Step 2a; reads 'response_data', 'structure_analysis',
        'metadata_columns' and 'question_columns'

    Returns:
    --------
    dict : Contains 'cleaned_data', 'cleaning_log', 'summary_stats',
        'original_columns' and 'final_columns'
    """

    print("=== Step 2b: Cleaning and Filtering Response Data ===")

    working_copy = structure_result['response_data'].copy()
    structure_info = structure_result['structure_analysis']

    log_entries = []
    starting_rows = len(working_copy)

    print(f"Starting with {starting_rows:,} total responses")

    # Stage 1: drop test/preview responses
    stage_output = filter_test_responses(working_copy, structure_info, log_entries)

    # Stage 2: add quality flag columns (nothing is removed here)
    stage_output = create_quality_flags(stage_output, structure_info, log_entries)

    # Stage 3: universal cleaning rules
    stage_output = apply_universal_cleaning(stage_output, log_entries)

    # Stage 4: standardized column names
    stage_output = standardize_column_names(
        stage_output,
        structure_result['metadata_columns'],
        structure_result['question_columns'],
        log_entries,
    )

    # Summary statistics over the whole run
    run_summary = generate_cleaning_summary(starting_rows, stage_output, log_entries)

    return {
        'cleaned_data': stage_output,
        'cleaning_log': log_entries,
        'summary_stats': run_summary,
        'original_columns': working_copy.columns.tolist(),
        'final_columns': stage_output.columns.tolist()
    }

def filter_test_responses(df, analysis, cleaning_log):
    """
    Remove test responses (Survey Preview, Spam, etc.) while preserving genuine responses.

    A response counts as a test response when its status value, lower-cased,
    contains any of the test patterns as a plain substring. Missing status
    values never match and are therefore retained.

    Parameters:
    -----------
    df : pandas.DataFrame
        Response data
    analysis : dict
        Structure analysis from Step 2a; 'response_type_column' names the
        status column (absent or None means no filtering)
    cleaning_log : list
        Log of cleaning operations; one entry is appended

    Returns:
    --------
    pandas.DataFrame : Copy of the data with test responses removed (index is
        reset when filtering was applied)
    """

    print("\n--- Filtering Test Responses ---")

    initial_count = len(df)

    # Define test response patterns (case-insensitive)
    test_patterns = [
        'survey preview', 'preview', 'test', 'spam', 'survey test'
    ]

    status_col = analysis.get('response_type_column')
    if not status_col:
        # No status column - assume all are genuine
        genuine_data = df.copy()
        cleaning_log.append("No status column found - retained all responses as genuine")
        print("No status column found - assuming all responses are genuine")
        return genuine_data

    def _share(count):
        # Percentage of the incoming rows; an empty frame reports 0.0 instead of dividing by zero
        return (count / initial_count * 100) if initial_count else 0.0

    # Stringify each status once and test every pattern as a literal substring
    test_mask = pd.Series(
        [any(pattern in str(value).lower() for pattern in test_patterns) for value in df[status_col]],
        index=df.index,
        dtype=bool,
    )

    test_count = int(test_mask.sum())
    genuine_data = df[~test_mask].copy().reset_index(drop=True)

    cleaning_log.append(f"Removed {test_count} test responses based on Status column patterns")
    print(f"Removed {test_count} test responses ({_share(test_count):.1f}%)")
    print(f"Retained {len(genuine_data)} genuine responses ({_share(len(genuine_data)):.1f}%)")

    return genuine_data

def create_quality_flags(df, analysis, cleaning_log):
    """
    Create data quality flags without removing data - for transparent analysis.

    Flags added (each a boolean column named flag_*):
      - flag_incomplete: progress below 100 (only if analysis reports progress data)
      - flag_duration_too_short / flag_duration_too_long: duration under 60 s or
        over 7200 s (only if analysis reports duration data)
      - whatever create_response_pattern_flags returns

    Progress and duration are coerced to numbers before comparing, so values
    stored as text are handled and non-numeric cells are never flagged.

    Parameters:
    -----------
    df : pandas.DataFrame
        Genuine response data
    analysis : dict
        Structure analysis from Step 2a ('has_progress_data', 'has_duration_data')
    cleaning_log : list
        Log of cleaning operations; one summary entry is appended

    Returns:
    --------
    pandas.DataFrame : Copy of the data with quality flag columns added
    """

    print("\n--- Creating Data Quality Flags ---")

    flagged_data = df.copy()
    flags_created = 0
    total_rows = len(df)

    def _share(count):
        # Percentage of all rows; an empty frame reports 0.0 instead of dividing by zero
        return (count / total_rows * 100) if total_rows else 0.0

    # Flag 1: Survey completion status
    if analysis.get('has_progress_data'):
        progress_cols = [col for col in df.columns if 'progress' in str(col).lower()]
        if progress_cols:
            # Coerce like the duration branch below so text-stored progress compares cleanly
            progress_numeric = pd.to_numeric(df[progress_cols[0]], errors='coerce')
            flagged_data['flag_incomplete'] = progress_numeric < 100
            incomplete_count = flagged_data['flag_incomplete'].sum()
            print(f"Created flag_incomplete: {incomplete_count} responses ({_share(incomplete_count):.1f}%)")
            flags_created += 1

    # Flag 2: Duration outliers
    if analysis.get('has_duration_data'):
        duration_cols = [col for col in df.columns if 'duration' in str(col).lower()]
        if duration_cols:
            # Convert to numeric if needed
            duration_numeric = pd.to_numeric(df[duration_cols[0]], errors='coerce')

            # Flag very short responses (< 60 seconds)
            flagged_data['flag_duration_too_short'] = duration_numeric < 60
            short_count = flagged_data['flag_duration_too_short'].sum()

            # Flag very long responses (> 2 hours = 7200 seconds)
            flagged_data['flag_duration_too_long'] = duration_numeric > 7200
            long_count = flagged_data['flag_duration_too_long'].sum()

            print(f"Created flag_duration_too_short: {short_count} responses ({_share(short_count):.1f}%)")
            print(f"Created flag_duration_too_long: {long_count} responses ({_share(long_count):.1f}%)")
            flags_created += 2

    # Flag 3: Response pattern flags (straight-lining, etc.)
    pattern_flags = create_response_pattern_flags(df, cleaning_log)
    for flag_name, flag_data in pattern_flags.items():
        flagged_data[flag_name] = flag_data
        flag_count = flag_data.sum()
        print(f"Created {flag_name}: {flag_count} responses ({_share(flag_count):.1f}%)")
        flags_created += 1

    # Summary of flagging: a response is "flagged" if any flag_* column is True
    flag_columns = [col for col in flagged_data.columns if str(col).startswith('flag_')]
    total_flagged = flagged_data[flag_columns].any(axis=1).sum()
    clean_count = total_rows - total_flagged
    print(f"\nTotal flags created: {flags_created}")
    print(f"Responses with any flag: {total_flagged} ({_share(total_flagged):.1f}%)")
    print(f"Clean responses (no flags): {clean_count} ({_share(clean_count):.1f}%)")

    cleaning_log.append(f"Created {flags_created} quality flag types affecting {total_flagged} responses")

    return flagged_data

def create_response_pattern_flags(df, cleaning_log):
    """
    Create flags for suspicious response patterns.

    Currently one flag is produced, 'flag_potential_straightlining': among the
    first (up to) 10 scale-like columns, a response is flagged when it answered
    at least 3 of them and 80% or more of those answers are the same value.
    A column is scale-like when its name contains none of the metadata markers
    and its numeric values are all within 0..10 with at most 10 distinct values.
    The flag is only produced when at least 3 scale-like columns exist.

    Parameters:
    -----------
    df : pandas.DataFrame
        Response data
    cleaning_log : list
        Cleaning log (not written to here)

    Returns:
    --------
    dict : Dictionary of flag name -> boolean Series (indexed like df)
    """

    pattern_flags = {}

    # Name fragments that mark a column as metadata rather than a question
    metadata_markers = ['Date', 'Status', 'IP', 'Progress', 'Duration', 'Finished', 'Recorded',
                        'Response', 'Recipient', 'External', 'Location', 'Distribution', 'Language']

    # str() keeps non-string column labels from raising on the substring test
    question_cols = [col for col in df.columns
                     if not any(marker in str(col) for marker in metadata_markers)]

    # Collect scale-like columns, keeping the coerced numbers so cells are parsed only once
    scale_series = []
    for col in question_cols:
        numeric_col = pd.to_numeric(df[col], errors='coerce')
        observed = numeric_col.dropna()
        if len(observed) == 0:
            continue
        if observed.nunique() <= 10 and observed.min() >= 0 and observed.max() <= 10:
            scale_series.append(numeric_col)

    # Flag potential straight-lining if we have scale columns
    if len(scale_series) >= 3:
        # rows x k matrix of answers (k <= 10); unanswered / non-numeric cells are NaN
        answers = np.column_stack([series.astype(float).to_numpy() for series in scale_series[:10]])
        answered = (~np.isnan(answers)).sum(axis=1)

        # Frequency of each row's most common answer: compare every column against
        # all columns and keep the best match count (NaN never equals anything)
        top_count = np.zeros(len(answers), dtype=int)
        for j in range(answers.shape[1]):
            matches = (answers == answers[:, [j]]).sum(axis=1)
            top_count = np.maximum(top_count, matches)

        # Rows with no answers get share 0; they fail the "at least 3" test anyway
        share = top_count / np.maximum(answered, 1)
        straightlined = (answered >= 3) & (share >= 0.8)

        pattern_flags['flag_potential_straightlining'] = pd.Series(straightlined, index=df.index)

    return pattern_flags

def apply_universal_cleaning(df, cleaning_log):
    """
    Apply universal data cleaning rules that work across all surveys.

    Two rules are applied to every object-dtype column of a copy of ``df``:

    1. Textual "not available" markers ("n a", "n/a", "na", "none", "null",
       empty or whitespace-only strings; case-insensitive) become NaN.
    2. Yes/No spellings ("yes", "Yes.", "y", "no", "No.", "n";
       case-insensitive, surrounding whitespace ignored) become 'Yes' / 'No'.

    Only cells whose value actually changes are counted, so the numbers
    printed and written to ``cleaning_log`` reflect real modifications
    (cells that are already null, or already exactly 'Yes' / 'No', are
    left alone and not counted).

    Parameters:
    -----------
    df : pandas.DataFrame
        Flagged data (not modified; a copy is cleaned)
    cleaning_log : list
        Cleaning log; one entry is appended per rule that changed something

    Returns:
    --------
    pandas.DataFrame : Cleaned data
    """

    print("\n--- Applying Universal Cleaning Rules ---")

    cleaned_data = df.copy()

    # Rule 1: Standardize NA representations
    na_patterns = [
        r'^\s*n\s*a\s*$',   # "n a", "N A", " na ", etc.
        r'^\s*n/a\s*$',     # "n/a", "N/A", " n/a ", etc.
        r'^\s*na\s*$',      # "na", "NA", " na ", etc.
        r'^\s*none\s*$',    # "none", "None", " none ", etc.
        r'^\s*null\s*$',    # "null", "Null", etc.
        r'^\s*$',           # Empty strings and whitespace-only
    ]
    # Every NA pattern maps to the same replacement, so a single alternation
    # lets each column be scanned once instead of once per pattern.
    na_regex = '|'.join(f'(?:{pattern})' for pattern in na_patterns)

    # Rule 2: Standardize Yes/No responses
    yes_no_patterns = [
        (r'^\s*yes\s*\.?\s*$', 'Yes'),   # "yes", "Yes.", " yes ", etc.
        (r'^\s*no\s*\.?\s*$', 'No'),     # "no", "No.", " no ", etc.
        (r'^\s*y\s*$', 'Yes'),           # "y", "Y"
        (r'^\s*n\s*$', 'No'),            # "n", "N" (a bare "n" is always read as "No")
    ]

    object_cols = [col for col in cleaned_data.columns
                   if cleaned_data[col].dtype == 'object']

    na_changes = 0
    for col in object_cols:
        column = cleaned_data[col]
        # astype(str) keeps mixed-type object columns usable with .str;
        # notna() stops already-missing cells (e.g. None -> "None") from
        # being matched and counted as changes.
        as_text = column.astype(str)
        mask = column.notna() & as_text.str.match(na_regex, case=False, na=False)
        if mask.any():
            cleaned_data.loc[mask, col] = np.nan
            na_changes += int(mask.sum())

    if na_changes > 0:
        print(f"Standardized NA representations: +{na_changes} null values created")
        cleaning_log.append(f"Standardized {na_changes} NA representations to null values")

    yes_no_changes = 0
    for col in object_cols:
        column = cleaned_data[col]
        as_text = column.astype(str)
        present = column.notna()
        for pattern, replacement in yes_no_patterns:
            # Skip cells that already hold the canonical spelling so that
            # no-op replacements do not inflate the change count.
            mask = (
                present
                & as_text.str.match(pattern, case=False, na=False)
                & as_text.ne(replacement)
            )
            if mask.any():
                cleaned_data.loc[mask, col] = replacement
                yes_no_changes += int(mask.sum())

    if yes_no_changes > 0:
        print(f"Standardized Yes/No responses: {yes_no_changes} changes made")
        cleaning_log.append(f"Standardized {yes_no_changes} Yes/No response formats")

    print(f"Universal cleaning complete: {na_changes + yes_no_changes} total changes")

    return cleaned_data

def standardize_column_names(df, metadata_cols, question_cols, cleaning_log):
    """
    Standardize column names for consistent analysis.

    Each name is converted to lowercase with every run of non-word
    characters / whitespace collapsed to a single underscore and
    leading/trailing underscores removed. A name that becomes empty is
    replaced by ``col_<position>``. If two columns normalise to the same
    name, later ones receive a numeric suffix (``_2``, ``_3``, ...) so the
    returned frame never gains duplicate column labels from this step.

    Parameters:
    -----------
    df : pandas.DataFrame
        Cleaned data (not modified; a renamed copy is returned)
    metadata_cols : list
        List of metadata column names (accepted for interface
        compatibility; not used by this function)
    question_cols : list
        List of question column names (accepted for interface
        compatibility; not used by this function)
    cleaning_log : list
        Cleaning log; one summary entry is appended

    Returns:
    --------
    pandas.DataFrame : Data with standardized column names
    """

    print("\n--- Standardizing Column Names ---")

    final_data = df.copy()

    new_names = []
    used_names = set()

    for position, col in enumerate(df.columns):
        # str() lets non-string labels (e.g. integers) go through the regexes
        clean_name = re.sub(r'[^\w\s]', '_', str(col))  # Replace special chars with underscore
        clean_name = re.sub(r'\s+', '_', clean_name)     # Replace spaces with underscore
        clean_name = re.sub(r'_+', '_', clean_name)      # Replace multiple underscores with single
        clean_name = clean_name.strip('_').lower()       # Trim underscores and lowercase

        # Ensure name is not empty
        if not clean_name:
            clean_name = f"col_{position}"

        # Disambiguate collisions such as 'Q1 - Text' and 'Q1_Text'
        unique_name = clean_name
        suffix = 2
        while unique_name in used_names:
            unique_name = f"{clean_name}_{suffix}"
            suffix += 1

        used_names.add(unique_name)
        new_names.append(unique_name)

    # Assign positionally: a dict-based rename cannot tell apart columns
    # that already share a label.
    final_data.columns = new_names

    # Report changes
    renamed = [(old, new) for old, new in zip(df.columns, new_names) if old != new]
    changes_made = len(renamed)
    print(f"Standardized {changes_made} column names")

    # Show examples of changes
    if changes_made > 0:
        print("Sample column name changes:")
        for old, new in renamed[:5]:
            print(f"  '{old}' → '{new}'")
        if changes_made > 5:
            print(f"  ... and {changes_made - 5} more changes")

    cleaning_log.append(f"Standardized {changes_made} column names for consistency")

    return final_data

def generate_cleaning_summary(initial_count, final_data, cleaning_log):
    """
    Summarise the outcome of the cleaning step as a flat dictionary.

    Parameters:
    -----------
    initial_count : int
        Number of responses before cleaning
    final_data : pandas.DataFrame
        Cleaned data; columns whose name starts with 'flag_' are treated
        as quality flags
    cleaning_log : list
        Log of cleaning operations (only its length is used)

    Returns:
    --------
    dict : Counts of initial/final/removed responses, removal rate in
        percent, number of flag columns, number of rows with at least one
        flag set, number of rows with none, and number of log entries
    """

    remaining = len(final_data)
    removed = initial_count - remaining

    flag_names = [name for name in final_data.columns if name.startswith('flag_')]

    # A row counts as flagged when any of its flag columns is truthy
    if flag_names:
        flagged = final_data[flag_names].any(axis=1).sum()
    else:
        flagged = 0

    # Avoid dividing by zero when there were no responses to begin with
    if initial_count > 0:
        removal_rate = removed / initial_count * 100
    else:
        removal_rate = 0

    return {
        'initial_responses': initial_count,
        'final_responses': remaining,
        'responses_removed': removed,
        'removal_rate_pct': removal_rate,
        'quality_flags_created': len(flag_names),
        'flagged_responses': flagged,
        'clean_responses': remaining - flagged,
        'cleaning_operations': len(cleaning_log)
    }

# Example usage
# Runs Step 2b on the 'structure_result' produced by Step 2a and prints a
# short summary. 'cleaning_result' is left in scope for the next step.
if __name__ == "__main__":
    try:
        # This assumes 'structure_result' exists from Step 2a
        cleaning_result = clean_and_filter_responses(structure_result)

        cleaned_data = cleaning_result['cleaned_data']
        summary = cleaning_result['summary_stats']

        # Percentages are relative to the responses that remain after
        # cleaning; guard the denominator so that an empty result still
        # prints a summary instead of failing on a division by zero.
        final_responses = summary['final_responses']
        if final_responses > 0:
            flagged_pct = summary['flagged_responses'] / final_responses * 100
            clean_pct = summary['clean_responses'] / final_responses * 100
        else:
            flagged_pct = 0.0
            clean_pct = 0.0

        print(f"\n=== Step 2b Summary ===")
        print(f"Data cleaning completed:")
        print(f"  Initial responses: {summary['initial_responses']:,}")
        print(f"  Final responses: {summary['final_responses']:,}")
        print(f"  Responses removed: {summary['responses_removed']} ({summary['removal_rate_pct']:.1f}%)")
        print(f"  Quality flags created: {summary['quality_flags_created']}")
        print(f"  Flagged responses: {summary['flagged_responses']} ({flagged_pct:.1f}%)")
        print(f"  Clean responses: {summary['clean_responses']} ({clean_pct:.1f}%)")
        print(f"  Final data shape: {cleaned_data.shape[0]:,} × {cleaned_data.shape[1]:,}")

        print("\n✓ Step 2b Complete: Data cleaned and quality flags created")

    except NameError:
        # Raised when 'structure_result' (or the Step 2b function) is not defined yet
        print("❌ Please run Step 2a first to create the 'structure_result' variable")
    except Exception as e:
        print(f"❌ Error in Step 2b: {str(e)}")

## Step 3: Data Type Optimization and Validation

### Intelligent Data Type Detection

In [None]:
# Step 3a: Intelligent Data Type Detection
# Objective: Automatically detect and assign appropriate data types based on content analysis

import pandas as pd
import numpy as np
import re
from datetime import datetime

def detect_and_assign_data_types(cleaning_result):
    """
    Detect a suitable data type for every column and convert a copy of the data.

    Columns reported by ``get_columns_to_skip`` are left untouched and
    logged as skipped. Every other column is analysed with
    ``analyze_column_for_type``; when the recommendation is anything other
    than 'object', the conversion is attempted on the working copy via
    ``apply_type_conversion``.

    Parameters:
    -----------
    cleaning_result : dict
        Result from Step 2b; only its 'cleaned_data' DataFrame is read

    Returns:
    --------
    dict : Contains typed_data, type_analysis, conversion_log,
        validation_results, initial_dtypes and final_dtypes
    """

    print("=== Step 3a: Intelligent Data Type Detection ===")

    source = cleaning_result['cleaned_data'].copy()
    initial_dtypes = source.dtypes.to_dict()

    # Running tallies, assembled into 'type_analysis' once the loop is done
    attempted = 0
    succeeded = 0
    problematic = []
    conversion_log = []

    # Flag / ID style columns are excluded from conversion
    skip_columns = get_columns_to_skip(source)

    # Conversions are written here; 'source' stays as the reference copy
    typed_data = source.copy()

    for col in source.columns:
        if col in skip_columns:
            conversion_log.append(f"{col}: Skipped (administrative/flag column)")
            continue

        print(f"\nAnalyzing column: {col}")
        type_result = analyze_column_for_type(source[col], col)

        # 'object' means "leave as text": nothing to convert or count
        if type_result['recommended_type'] == 'object':
            continue

        attempted += 1
        if apply_type_conversion(typed_data, col, type_result, conversion_log):
            succeeded += 1
        else:
            problematic.append(col)

    final_dtypes = typed_data.dtypes.to_dict()

    type_analysis = {
        'columns_analyzed': len(source.columns),
        'conversions_attempted': attempted,
        'conversions_successful': succeeded,
        'columns_by_final_type': categorize_final_types(final_dtypes),
        'problematic_columns': problematic
    }

    # Validate type assignments
    validation_results = validate_type_assignments(typed_data, conversion_log)

    return {
        'typed_data': typed_data,
        'type_analysis': type_analysis,
        'conversion_log': conversion_log,
        'validation_results': validation_results,
        'initial_dtypes': initial_dtypes,
        'final_dtypes': final_dtypes
    }

def get_columns_to_skip(df):
    """
    List the columns that are excluded from type conversion.

    A column is excluded when its name matches (case-insensitively, from
    the start of the name) any of the skip patterns below.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data whose column names are inspected

    Returns:
    --------
    list : Column names to skip, in the order they appear in ``df``
    """

    skip_patterns = (
        r'^flag_',           # Quality flags
        r'.*id$',            # Names ending in "id"
        r'.*address$',       # Names ending in "address"
        r'^responseid$',     # Response IDs
        r'^ipaddress$',      # IP addresses
    )

    return [
        col for col in df.columns
        if any(re.match(pattern, col, re.IGNORECASE) for pattern in skip_patterns)
    ]

def analyze_column_for_type(series, col_name):
    """
    Recommend a data type for one column.

    The non-null values are passed through the detectors in a fixed
    order (datetime, numeric, boolean, categorical); the first detector
    that accepts the column decides the recommendation. If none accepts
    it, or the column has no non-null values, 'object' is recommended.

    Parameters:
    -----------
    series : pandas.Series
        Column data to analyze
    col_name : str
        Name of the column (forwarded to the detectors)

    Returns:
    --------
    dict : 'recommended_type', 'confidence', 'reason' and 'sample_values'
        (up to five distinct non-null values); numeric recommendations
        additionally carry 'contamination_rate'
    """

    observed = series.dropna()

    # Nothing to inspect: keep the column as-is
    if len(observed) == 0:
        return {
            'recommended_type': 'object',
            'confidence': 'high',
            'reason': 'all_null',
            'sample_values': []
        }

    sample_values = list(observed.unique()[:5])

    def _verdict(recommended_type, details):
        # Shared shape of every positive detector outcome
        return {
            'recommended_type': recommended_type,
            'confidence': details['confidence'],
            'reason': details['reason'],
            'sample_values': sample_values
        }

    # Date/Time Detection
    datetime_result = detect_datetime_type(observed, col_name)
    if datetime_result['is_datetime']:
        return _verdict('datetime', datetime_result)

    # Numeric Detection
    numeric_result = detect_numeric_type(observed, col_name)
    if numeric_result['is_numeric']:
        verdict = _verdict(numeric_result['numeric_subtype'], numeric_result)
        verdict['contamination_rate'] = numeric_result.get('contamination_rate', 0)
        return verdict

    # Boolean/Binary Detection (stored as a category)
    boolean_result = detect_boolean_type(observed, col_name)
    if boolean_result['is_boolean']:
        return _verdict('category', boolean_result)

    # Categorical Detection
    categorical_result = detect_categorical_type(observed, col_name)
    if categorical_result['is_categorical']:
        return _verdict('category', categorical_result)

    # Default to object (text)
    return {
        'recommended_type': 'object',
        'confidence': 'high',
        'reason': 'free_text_or_complex',
        'sample_values': sample_values
    }

def detect_datetime_type(series, col_name):
    """
    Detect if column contains date/time data.

    A column is only considered when its name contains 'date', 'time' or
    'timestamp' (case-insensitive) and at least 80% of its values parse
    with ``pd.to_datetime``. Columns that already hold numbers are never
    treated as datetimes: ``pd.to_datetime`` would read plain integers as
    offsets from the epoch, so a rating column called e.g.
    'times_per_week' would otherwise "parse" perfectly.

    Parameters:
    -----------
    series : pandas.Series
        Column values to test (the caller passes non-null values)
    col_name : str
        Column name, used for the name-based pre-filter

    Returns:
    --------
    dict : 'is_datetime' (bool), 'confidence' and 'reason'
    """

    parsing_failed = {'is_datetime': False, 'confidence': 'low', 'reason': 'datetime_parsing_failed'}

    # Check column name patterns first
    datetime_name_patterns = [r'date', r'time', r'timestamp']
    name_suggests_datetime = any(re.search(pattern, col_name, re.IGNORECASE)
                                for pattern in datetime_name_patterns)

    if not name_suggests_datetime:
        return {'is_datetime': False, 'confidence': 'low', 'reason': 'name_pattern_mismatch'}

    # Nothing to parse (also avoids dividing by zero below)
    if len(series) == 0:
        return parsing_failed

    # Numeric (and boolean) data is left for the numeric detector
    if pd.api.types.is_numeric_dtype(series):
        return {'is_datetime': False, 'confidence': 'low', 'reason': 'numeric_values_not_datetime'}

    # Try to parse as datetime; detection is best-effort, so any parsing
    # error simply means "not a datetime column"
    try:
        parsed = pd.to_datetime(series, errors='coerce')
    except Exception:
        return parsing_failed

    success_rate = parsed.notna().sum() / len(series)

    if success_rate >= 0.8:
        return {
            'is_datetime': True,
            'confidence': 'high' if success_rate >= 0.95 else 'medium',
            'reason': f'{success_rate:.0%}_successful_datetime_parsing'
        }

    return parsing_failed

def detect_numeric_type(series, col_name):
    """
    Detect if column contains numeric data, handling contamination.

    The share of non-null values that ``pd.to_numeric`` can convert
    decides the outcome:

    * >= 90%: numeric with high confidence; 'int64' when every converted
      value is a finite whole number, otherwise 'float64'
    * >= 70%: numeric with medium confidence, always 'float64'
    * below that: not numeric

    Parameters:
    -----------
    series : pandas.Series
        Column values to test
    col_name : str
        Column name (not used by the detection itself)

    Returns:
    --------
    dict : 'is_numeric', 'confidence' and 'reason'; numeric results also
        carry 'numeric_subtype' and 'contamination_rate'. Any unexpected
        error is reported as a non-numeric result with the error in 'reason'.
    """

    # Detection is best-effort: an unexpected failure is reported in the
    # result instead of being raised to the caller.
    try:
        numeric_series = pd.to_numeric(series, errors='coerce')
        numeric_count = numeric_series.notna().sum()
        total_count = series.notna().sum()

        if total_count == 0:
            return {'is_numeric': False, 'confidence': 'high', 'reason': 'no_data'}

        success_rate = numeric_count / total_count
        contamination_rate = 1 - success_rate

        # High success rate - clearly numeric
        if success_rate >= 0.9:
            # Vectorised whole-number test. Infinite values are not whole
            # numbers (int(inf) would raise), so they select 'float64'.
            values = numeric_series.dropna().astype('float64')
            is_integral = bool(np.isfinite(values).all() and (values == np.floor(values)).all())
            subtype = 'int64' if is_integral else 'float64'

            return {
                'is_numeric': True,
                'numeric_subtype': subtype,
                'confidence': 'high',
                'reason': f'{success_rate:.0%}_numeric_with_{contamination_rate:.0%}_contamination',
                'contamination_rate': contamination_rate
            }

        # Medium success rate - might be numeric with contamination
        elif success_rate >= 0.7:
            return {
                'is_numeric': True,
                'numeric_subtype': 'float64',  # Use float to handle mixed cases
                'confidence': 'medium',
                'reason': f'{success_rate:.0%}_numeric_contaminated',
                'contamination_rate': contamination_rate
            }

        # Low success rate - not primarily numeric
        else:
            return {
                'is_numeric': False,
                'confidence': 'high',
                'reason': f'only_{success_rate:.0%}_numeric'
            }

    except Exception as e:
        return {
            'is_numeric': False,
            'confidence': 'high',
            'reason': f'numeric_conversion_error: {str(e)}'
        }

def detect_boolean_type(series, col_name):
    """
    Detect binary/boolean columns (Yes/No, True/False, etc.).

    Values are compared after converting to lowercase, stripped strings.
    A column is boolean with high confidence when it holds exactly the
    two values of one known pair, and with medium confidence when it
    holds a single value out of yes/no/true/false/1/0.

    Parameters:
    -----------
    series : pandas.Series
        Column values to test
    col_name : str
        Column name (not used by the detection itself)

    Returns:
    --------
    dict : 'is_boolean' (bool), 'confidence' and 'reason'
    """

    unique_values = {str(val).lower().strip() for val in series.unique()}

    # Common boolean patterns
    boolean_patterns = [
        {'yes', 'no'},
        {'true', 'false'},
        {'1', '0'},
        {'y', 'n'},
        {'on', 'off'},
        {'enabled', 'disabled'}
    ]

    if len(unique_values) >= 2 and any(unique_values.issubset(pattern)
                                       for pattern in boolean_patterns):
        return {
            'is_boolean': True,
            'confidence': 'high',
            # sorted() keeps the logged reason identical from run to run;
            # the iteration order of a set of strings is not stable.
            'reason': f'binary_values_{sorted(unique_values)}'
        }

    # Single value that could be boolean (all Yes, all No, etc.)
    if len(unique_values) == 1:
        only_value = next(iter(unique_values))
        if only_value in ['yes', 'no', 'true', 'false', '1', '0']:
            return {
                'is_boolean': True,
                'confidence': 'medium',
                'reason': f'single_boolean_value_{only_value}'
            }

    return {'is_boolean': False, 'confidence': 'high', 'reason': 'not_binary_pattern'}

def detect_categorical_type(series, col_name):
    """
    Decide whether a column looks categorical from how often values repeat.

    The repetition ratio is the number of values divided by the number of
    distinct values. A column is categorical when it has

    * at most 2 distinct values, or
    * at most 10 distinct values repeated at least 2x on average, or
    * at most 20 distinct values repeated at least 5x on average.

    Parameters:
    -----------
    series : pandas.Series
        Column values to test
    col_name : str
        Column name (not used by the detection itself)

    Returns:
    --------
    dict : 'is_categorical' (bool), 'confidence' and 'reason'
    """

    n_distinct = len(series.unique())
    n_values = len(series)

    if n_values == 0:
        return {'is_categorical': False, 'confidence': 'high', 'reason': 'no_data'}

    ratio = n_values / n_distinct if n_distinct > 0 else 0

    if n_distinct <= 2:
        # Very few categories - likely categorical
        confidence = 'high'
        reason = f'{n_distinct}_unique_values_high_repetition'
    elif n_distinct <= 10 and ratio >= 2:
        # Moderate categories with good repetition
        confidence = 'high' if ratio >= 5 else 'medium'
        reason = f'{n_distinct}_categories_repetition_{ratio:.1f}x'
    elif n_distinct <= 20 and ratio >= 5:
        # More categories but very high repetition
        confidence = 'medium'
        reason = f'{n_distinct}_categories_high_repetition_{ratio:.1f}x'
    else:
        return {
            'is_categorical': False,
            'confidence': 'high',
            'reason': f'too_many_unique_values_{n_distinct}_or_low_repetition_{ratio:.1f}x'
        }

    return {
        'is_categorical': True,
        'confidence': confidence,
        'reason': reason
    }

def apply_type_conversion(df, col_name, type_result, conversion_log):
    """
    Apply the recommended type conversion to a column.

    The column is replaced in ``df`` only after the conversion has fully
    succeeded, so a failed conversion leaves the original values in place.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame to modify (in place)
    col_name : str
        Column to convert
    type_result : dict
        Type analysis results; 'recommended_type' and 'reason' are read
    conversion_log : list
        Log of conversions; one entry is appended per call

    Returns:
    --------
    bool : True if conversion successful (or nothing had to be converted),
        False if the conversion raised an error
    """

    recommended_type = type_result['recommended_type']

    try:
        if recommended_type == 'datetime':
            df[col_name] = pd.to_datetime(df[col_name], errors='coerce')
            conversion_log.append(f"{col_name}: Converted to datetime - {type_result['reason']}")
            print(f"  ✓ Converted to datetime")
            return True

        elif recommended_type in ['int64', 'float64']:
            original_nulls = df[col_name].isnull().sum()
            converted = pd.to_numeric(df[col_name], errors='coerce')
            # Values that could not be parsed are nulls that were not there before
            contamination_nulls = converted.isnull().sum() - original_nulls

            if recommended_type == 'int64':
                # Only convert to int if every value is a finite whole
                # number; infinities and fractions fall back to float.
                present = converted.dropna().astype('float64')
                is_integral = bool(np.isfinite(present).all()
                                   and (present == np.floor(present)).all())
                if is_integral:
                    converted = converted.astype('Int64')  # Nullable integer
                else:
                    recommended_type = 'float64'  # Fall back to float

            df[col_name] = converted

            conversion_log.append(f"{col_name}: Converted to {recommended_type} - {type_result['reason']} - {contamination_nulls} values became null")
            print(f"  ✓ Converted to {recommended_type} ({contamination_nulls} contaminated values → null)")
            return True

        elif recommended_type == 'category':
            df[col_name] = df[col_name].astype('category')
            conversion_log.append(f"{col_name}: Converted to category - {type_result['reason']}")
            print(f"  ✓ Converted to category")
            return True

        else:
            conversion_log.append(f"{col_name}: Kept as object - {type_result['reason']}")
            print(f"  → Kept as object ({type_result['reason']})")
            return True

    except Exception as e:
        conversion_log.append(f"{col_name}: Conversion to {recommended_type} FAILED - {str(e)}")
        print(f"  ✗ Conversion to {recommended_type} failed: {str(e)}")
        return False

def categorize_final_types(dtypes_dict):
    """
    Categorize the final data types for summary reporting.

    A column is 'administrative' when its name matches the same name
    patterns that ``get_columns_to_skip`` uses (flag_ prefix, names ending
    in "id" or "address"), so this category lines up with the columns the
    conversion step logs as "Skipped (administrative/flag column)". A plain
    substring test for "id" would also catch names such as "video_rating"
    or "consider_q1". All other columns are grouped by dtype.

    Parameters:
    -----------
    dtypes_dict : dict
        Mapping of column name -> dtype (anything whose ``str()`` is the
        dtype name)

    Returns:
    --------
    dict : Keys 'datetime', 'numeric', 'categorical', 'text' and
        'administrative', each holding a list of column names
    """

    administrative_patterns = [
        r'^flag_',           # Quality flags
        r'.*id$',            # ID columns
        r'.*address$',       # IP addresses
        r'^responseid$',     # Response IDs
        r'^ipaddress$'       # IP addresses
    ]

    type_categories = {
        'datetime': [],
        'numeric': [],
        'categorical': [],
        'text': [],
        'administrative': []
    }

    for col, dtype in dtypes_dict.items():
        dtype_str = str(dtype)

        is_administrative = any(re.match(pattern, str(col), re.IGNORECASE)
                                for pattern in administrative_patterns)

        if is_administrative:
            type_categories['administrative'].append(col)
        elif 'datetime' in dtype_str:
            type_categories['datetime'].append(col)
        elif dtype_str in ['int64', 'Int64', 'float64', 'Float64']:
            type_categories['numeric'].append(col)
        elif dtype_str == 'category':
            type_categories['categorical'].append(col)
        else:
            type_categories['text'].append(col)

    return type_categories

def validate_type_assignments(df, conversion_log):
    """
    Collect per-column facts about the assigned types and flag suspicious results.

    Datetime, numeric and categorical columns are each listed with a few
    descriptive values. A note is added to 'potential_issues' when a
    datetime column is more than 50% null, a numeric column is more than
    30% null, or a categorical column has more than 20 categories.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data after type conversion
    conversion_log : list
        Conversion log (accepted for interface compatibility; not read)

    Returns:
    --------
    dict : 'datetime_columns', 'numeric_columns', 'categorical_columns'
        (lists of dicts) and 'potential_issues' (list of strings)
    """

    datetime_columns = []
    numeric_columns = []
    categorical_columns = []
    potential_issues = []

    row_count = len(df)

    for col in df.columns:
        column = df[col]
        dtype = column.dtype

        if pd.api.types.is_datetime64_any_dtype(dtype):
            null_pct = column.isnull().sum() / row_count * 100
            datetime_columns.append({'column': col, 'null_percentage': null_pct})

            if null_pct > 50:
                potential_issues.append(f"{col}: High null rate ({null_pct:.1f}%) after datetime conversion")

        elif pd.api.types.is_numeric_dtype(dtype):
            null_pct = column.isnull().sum() / row_count * 100
            numeric_columns.append({
                'column': col,
                'null_percentage': null_pct,
                'min_value': column.min(),
                'max_value': column.max()
            })

            if null_pct > 30:
                potential_issues.append(f"{col}: High contamination ({null_pct:.1f}% null) after numeric conversion")

        elif isinstance(dtype, pd.CategoricalDtype):
            n_categories = len(column.cat.categories)
            categorical_columns.append({'column': col, 'n_categories': n_categories})

            if n_categories > 20:
                potential_issues.append(f"{col}: Many categories ({n_categories}) - might not be truly categorical")

    return {
        'datetime_columns': datetime_columns,
        'numeric_columns': numeric_columns,
        'categorical_columns': categorical_columns,
        'potential_issues': potential_issues
    }

# Example usage
# Runs Step 3a on the 'cleaning_result' produced by Step 2b and prints a
# short report. 'typing_result' is left in scope for the next step.
if __name__ == "__main__":
    try:
        # This assumes 'cleaning_result' exists from Step 2b
        typing_result = detect_and_assign_data_types(cleaning_result)

        typed_data = typing_result['typed_data']
        analysis = typing_result['type_analysis']
        validation = typing_result['validation_results']

        print("\n=== Step 3a Summary ===")
        print("Data type detection completed:")
        print(f"  Columns analyzed: {analysis['columns_analyzed']}")
        print(f"  Conversions attempted: {analysis['conversions_attempted']}")
        print(f"  Conversions successful: {analysis['conversions_successful']}")
        print(f"  Problematic columns: {len(analysis['problematic_columns'])}")

        # Only type groups that actually contain columns are listed
        print("\nFinal type distribution:")
        for type_category, columns in analysis['columns_by_final_type'].items():
            if not columns:
                continue
            print(f"  {type_category.title()}: {len(columns)} columns")

        # Show at most five issues, then a count of the remainder
        potential_issues = validation['potential_issues']
        if potential_issues:
            print("\nPotential issues detected:")
            for issue in potential_issues[:5]:
                print(f"  - {issue}")
            hidden_issue_count = len(potential_issues) - 5
            if hidden_issue_count > 0:
                print(f"  ... and {hidden_issue_count} more issues")

        n_rows, n_cols = typed_data.shape
        print(f"\nData shape: {n_rows:,} × {n_cols:,}")
        print("\n✓ Step 3a Complete: Data types detected and assigned")

    except NameError:
        # Raised when 'cleaning_result' (or the Step 3a function) is not defined yet
        print("❌ Please run Step 2b first to create the 'cleaning_result' variable")
    except Exception as e:
        print(f"❌ Error in Step 3a: {str(e)}")

### Data Type Validation and Optimization

In [None]:
# Step 3b: Data Type Validation and Optimization
# Objective: Validate type assignments, handle edge cases, and optimize final data structure

import pandas as pd
import numpy as np

def validate_and_optimize_data_types(typing_result):
    """
    Run the Step 3b post-processing passes over a copy of the typed data.

    The passes run in this order on the same working copy: contaminated
    numeric handling, categorical optimisation, summary-variable creation,
    final validation, and finally report generation.

    Parameters:
    -----------
    typing_result : dict
        Result from Step 3a; 'typed_data', 'type_analysis' and
        'validation_results' are read

    Returns:
    --------
    dict : Contains optimized_data, validation_report,
        contamination_handling, optimization_log and final_validation
    """

    print("=== Step 3b: Data Type Validation and Optimization ===")

    optimized = typing_result['typed_data'].copy()
    type_analysis = typing_result['type_analysis']
    validation_results = typing_result['validation_results']

    # Shared trackers that the individual passes fill in
    optimization_log = []
    contamination_handling = {}

    # Step 1: Handle highly contaminated numeric columns
    handle_contaminated_numerics(optimized, validation_results,
                                 optimization_log, contamination_handling)

    # Step 2: Optimize categorical columns
    optimize_categorical_columns(optimized, validation_results, optimization_log)

    # Step 3: Create summary variables for complex multi-part questions
    create_summary_variables(optimized, optimization_log)

    # Step 4: Final data validation
    final_validation = perform_final_validation(optimized, optimization_log)

    # Generate comprehensive validation report
    validation_report = generate_validation_report(
        optimized,
        type_analysis,
        validation_results,
        contamination_handling,
        optimization_log,
    )

    return {
        'optimized_data': optimized,
        'validation_report': validation_report,
        'contamination_handling': contamination_handling,
        'optimization_log': optimization_log,
        'final_validation': final_validation
    }

def handle_contaminated_numerics(df, validation_results, optimization_log, contamination_handling):
    """
    Handle numeric columns with high contamination rates by creating clean versions.

    For every numeric column whose recorded null percentage is above 20,
    a ``<column>_clean`` column is added to ``df``. Year-like columns (name
    contains 'year', or is one of 'q6', 'establishment', 'founded') are
    passed through ``clean_year_column``; all others are copied unchanged.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data with assigned types (modified in place: clean columns are added)
    validation_results : dict
        Validation results from Step 3a; its 'numeric_columns' list is read
    optimization_log : list
        Log of optimization steps; one entry is appended per handled column
    contamination_handling : dict
        Tracking contamination handling; one entry is added per handled column

    Returns:
    --------
    list : Names of contaminated columns handled
    """

    print("\n--- Handling Contaminated Numeric Columns ---")

    contaminated_columns = []
    year_like_names = ['q6', 'establishment', 'founded']

    for col_info in validation_results.get('numeric_columns', []):
        col_name = col_info['column']
        null_pct = col_info['null_percentage']

        # Only columns with more than 20% nulls after conversion are handled
        if not null_pct > 20:
            continue

        contaminated_columns.append(col_name)

        print(f"Processing contaminated column: {col_name} ({null_pct:.1f}% null)")

        # Create cleaned version with suffix
        clean_col_name = f"{col_name}_clean"
        original_valid = df[col_name].notna().sum()

        lowered_name = col_name.lower()
        if 'year' in lowered_name or lowered_name in year_like_names:
            # Handle year columns specially
            df[clean_col_name] = clean_year_column(df[col_name], col_name)
            cleaned_valid = df[clean_col_name].notna().sum()
            improvement = cleaned_valid - original_valid

            contamination_handling[col_name] = {
                'original_valid': original_valid,
                'cleaned_valid': cleaned_valid,
                'improvement': improvement,
                'clean_column_created': clean_col_name,
                'cleaning_method': 'year_range_validation'
            }

            # Range validation can only remove values, so the difference is
            # zero or negative; '+d' prints a proper sign instead of "+-3".
            print(f"  Created {clean_col_name}: {cleaned_valid} valid values ({int(improvement):+d} improvement)")
            optimization_log.append(f"Created cleaned year column {clean_col_name} with range validation")

        else:
            # Generic numeric cleaning: keep the converted values as they are
            df[clean_col_name] = df[col_name].copy()

            contamination_handling[col_name] = {
                'original_valid': original_valid,
                'cleaned_valid': original_valid,
                'improvement': 0,
                'clean_column_created': clean_col_name,
                'cleaning_method': 'preserved_as_is'
            }

            optimization_log.append(f"Preserved contaminated numeric column {col_name} as {clean_col_name}")

    if not contaminated_columns:
        print("No highly contaminated numeric columns found")

    return contaminated_columns

def clean_year_column(series, col_name, min_reasonable_year=1800, max_future_years=5):
    """
    Specialized cleaning for year columns with reasonable range validation.

    Values below ``min_reasonable_year`` or above the current year plus
    ``max_future_years`` are replaced by NaN in a copy of the series.

    Parameters:
    -----------
    series : pandas.Series
        Year column data (numeric); not modified
    col_name : str
        Column name for context (not used by the cleaning itself)
    min_reasonable_year : int, default 1800
        Earliest year that is kept
    max_future_years : int, default 5
        How many years past the current year are still kept

    Returns:
    --------
    pandas.Series : Cleaned year data
    """

    # The upper bound follows the clock so slightly future dates stay valid
    current_year = pd.Timestamp.now().year
    max_reasonable_year = current_year + max_future_years

    cleaned_series = series.copy()

    # Apply range validation
    out_of_range_mask = (cleaned_series < min_reasonable_year) | (cleaned_series > max_reasonable_year)
    out_of_range_count = out_of_range_mask.sum()

    if out_of_range_count > 0:
        print(f"    Removing {out_of_range_count} out-of-range years (not between {min_reasonable_year}-{max_reasonable_year})")
        cleaned_series.loc[out_of_range_mask] = np.nan

    return cleaned_series

def optimize_categorical_columns(df, validation_results, optimization_log):
    """
    Review categorical columns and act on the two extreme cases.

    Columns reported with more than 20 categories are only flagged for
    review. Columns reported with exactly one category get a companion
    ``<column>_constant`` column added to ``df`` holding that single value.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data with assigned types (modified in place for constant columns)
    validation_results : dict
        Validation results; its 'categorical_columns' entries are read
    optimization_log : list
        Optimization log (appended to in place)

    Returns:
    --------
    dict : Column name -> description of the action taken
    """

    print("\n--- Optimizing Categorical Columns ---")

    outcomes = {}
    categorical_entries = validation_results.get('categorical_columns', [])

    for entry in categorical_entries:
        col_name = entry['column']
        n_categories = entry['n_categories']

        if n_categories > 20:
            print(f"Column {col_name} has {n_categories} categories - considering optimization")

            # Nothing is converted here; the column is only marked for a human decision
            outcomes[col_name] = {
                'original_categories': n_categories,
                'action': 'flagged_for_review',
                'recommendation': 'Consider if this should be categorical or text'
            }
            optimization_log.append(f"{col_name}: Flagged - {n_categories} categories may be too many for categorical")
            continue

        if n_categories != 1:
            continue

        print(f"Column {col_name} has only 1 category - converting to constant")

        # The lone category becomes the value of a new broadcast column
        unique_val = df[col_name].cat.categories[0]
        df[col_name + '_constant'] = unique_val

        outcomes[col_name] = {
            'original_categories': 1,
            'action': 'converted_to_constant',
            'constant_value': unique_val
        }
        optimization_log.append(f"{col_name}: Converted to constant value '{unique_val}'")

    if not outcomes:
        print("All categorical columns are appropriately sized")

    return outcomes

def create_summary_variables(df, optimization_log):
    """
    Add per-respondent summary columns for multi-part question series.

    For every series returned by ``identify_question_series`` that has at
    least three component columns, two columns are added to ``df``:
    ``<series>_response_count`` (number of answered components) and
    ``<series>_completion_rate`` (answered components as a percentage).

    Parameters:
    -----------
    df : pandas.DataFrame
        Optimized data (modified in place)
    optimization_log : list
        Optimization log (appended to in place)

    Returns:
    --------
    dict : Series name -> details of the summary columns created
    """

    print("\n--- Creating Summary Variables ---")

    created = {}

    # Group columns into series before any summary column is added
    question_series = identify_question_series(df.columns)

    for series_name, columns in question_series.items():
        n_components = len(columns)
        if n_components < 3:
            # Too few parts for a summary to be worthwhile
            continue

        print(f"Creating summary for {series_name} series ({n_components} columns)")

        response_count_col = f"{series_name}_response_count"
        completion_rate_col = f"{series_name}_completion_rate"

        # One boolean answered/unanswered table feeds both summaries
        answered = df[columns].notna()
        df[response_count_col] = answered.sum(axis=1)
        df[completion_rate_col] = answered.mean(axis=1) * 100

        created[series_name] = {
            'component_columns': columns,
            'response_count_column': response_count_col,
            'completion_rate_column': completion_rate_col,
            'total_components': n_components
        }

        optimization_log.append(f"Created summary variables for {series_name}: {response_count_col}, {completion_rate_col}")

    if not created:
        print("No substantial question series identified for summary creation")

    return created

def identify_question_series(columns):
    """
    Identify related question series based on naming patterns.

    A column belongs to a series when its name starts with a lowercase
    ``q`` followed by digits (``q2_1``, ``q2_2_text`` -> series ``q2``).
    Labels that are not strings are ignored instead of raising.

    Parameters:
    -----------
    columns : iterable
        Column names (any iterable of labels, e.g. ``df.columns``)

    Returns:
    --------
    dict : Series name -> list of columns, limited to series with more than
        one member; column order follows the input order
    """

    import re

    # Only the leading "q<digits>" prefix decides which series a column joins
    prefix_pattern = re.compile(r'^(q\d+)')

    series_patterns = {}

    for col in columns:
        if not isinstance(col, str):
            # pandas allows non-string labels; they cannot follow the naming scheme
            continue
        match = prefix_pattern.match(col)
        if match:
            series_patterns.setdefault(match.group(1), []).append(col)

    # Only return series with multiple components
    return {k: v for k, v in series_patterns.items() if len(v) > 1}

def perform_final_validation(df, optimization_log):
    """
    Run the closing checks on the optimized data and summarize them.

    Records size, dtype counts and memory usage, lists columns with more
    than 50% missing values, and raises warnings when more than five such
    columns exist or memory usage exceeds 100 MB.

    Parameters:
    -----------
    df : pandas.DataFrame
        Final optimized data
    optimization_log : list
        Optimization log (one summary entry is appended)

    Returns:
    --------
    dict : Final validation results
    """

    print("\n--- Performing Final Validation ---")

    row_count = len(df)
    memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024

    # Percentage of missing values per column; only the high ones are kept
    null_share = ((col, df[col].isnull().sum() / row_count * 100) for col in df.columns)
    high_null = {col: pct for col, pct in null_share if pct > 50}

    warnings_found = []
    if len(high_null) > 5:
        warnings_found.append(f"Many columns ({len(high_null)}) have >50% missing data")
    if memory_mb > 100:
        warnings_found.append(f"High memory usage: {memory_mb:.1f} MB")

    final_validation = {
        'total_columns': len(df.columns),
        'total_rows': row_count,
        'data_types': df.dtypes.value_counts().to_dict(),
        'memory_usage_mb': memory_mb,
        'null_percentages': high_null,
        'validation_warnings': warnings_found
    }

    print("Final validation complete:")
    print(f"  Data shape: {df.shape[0]:,} × {df.shape[1]:,}")
    print(f"  Memory usage: {memory_mb:.1f} MB")

    if warnings_found:
        print(f"  Warnings: {len(warnings_found)}")

    optimization_log.append(f"Final validation: {df.shape[0]} rows × {df.shape[1]} columns, {memory_mb:.1f} MB")

    return final_validation

def generate_validation_report(df, type_analysis, validation_results, contamination_handling, optimization_log):
    """
    Assemble the validation report from the results of the earlier checks.

    Parameters:
    -----------
    df : pandas.DataFrame
        Data whose shape and memory footprint are reported
    type_analysis : dict
        Must contain 'columns_by_final_type'
    validation_results : dict
        Its 'potential_issues' list (default empty) is reported
    contamination_handling : dict
        Passed through unchanged as the contamination summary
    optimization_log : list
        Only its length is reported

    Returns:
    --------
    dict : Comprehensive validation report; 'ready_for_analysis' is True
        when fewer than five potential issues were recorded
    """

    overview = {
        'shape': df.shape,
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024
    }
    issues = validation_results.get('potential_issues', [])

    return {
        'data_overview': overview,
        'type_distribution': type_analysis['columns_by_final_type'],
        'contamination_summary': contamination_handling,
        'validation_issues': issues,
        'optimization_steps': len(optimization_log),
        'ready_for_analysis': len(issues) < 5
    }

# Example usage: run Step 3b on the Step 3a result and print a short summary
if __name__ == "__main__":
    try:
        # 'typing_result' must already exist (it is produced by Step 3a)
        optimization_result = validate_and_optimize_data_types(typing_result)

        # Keep the individual pieces available under their own names
        optimized_data = optimization_result['optimized_data']
        validation_report = optimization_result['validation_report']
        contamination_handling = optimization_result['contamination_handling']

        print("\n=== Step 3b Summary ===")
        print("Data type validation and optimization completed:")
        print(f"  Final data shape: {optimized_data.shape[0]:,} × {optimized_data.shape[1]:,}")
        print(f"  Memory usage: {validation_report['data_overview']['memory_usage_mb']:.1f} MB")
        print(f"  Contaminated columns handled: {len(contamination_handling)}")
        print(f"  Optimization steps performed: {validation_report['optimization_steps']}")

        if contamination_handling:
            print("\nContamination handling results:")
            for col, info in contamination_handling.items():
                improvement = info.get('improvement', 0)
                # Columns without recovered values were kept unchanged
                if improvement <= 0:
                    print(f"  {col}: Preserved with cleaning flags")
                else:
                    print(f"  {col}: +{improvement} valid values recovered")

        readiness = 'Yes' if validation_report['ready_for_analysis'] else 'No'
        print(f"  Ready for analysis: {readiness}")

        print("\n✓ Step 3b Complete: Data validated and optimized for analysis")

    except NameError:
        print("❌ Please run Step 3a first to create the 'typing_result' variable")
    except Exception as e:
        print(f"❌ Error in Step 3b: {e}")

## Step 4: Generate Analysis-Ready Outputs

### Generate Final Datasets and Documentation

In [None]:
# Step 4a: Generate Final Datasets and Documentation
# Objective: Create analysis-ready exports with comprehensive documentation

import pandas as pd
import numpy as np
from datetime import datetime
import json
import tkinter as tk
from tkinter import filedialog
from pathlib import Path
import os

def generate_final_datasets(optimization_result, structure_result, output_dir=None):
    """
    Generate final analysis-ready datasets with comprehensive documentation.
    Creates multiple output formats optimized for different use cases.

    Parameters:
    -----------
    optimization_result : dict
        Result from Step 3b containing optimized data and validation
    structure_result : dict
        Result from Step 2a containing original codebook and metadata
    output_dir : str or Path, optional
        Folder to write the files to. If None, a GUI folder selector opens;
        cancelling the dialog falls back to a local 'outputs' folder.

    Returns:
    --------
    dict : Contains paths to created files and summary information
        ('generated_files', 'export_summary', 'analysis_dataset', 'codebook',
        'quality_report', 'variable_summaries', 'metadata')

    Raises:
    -------
    KeyError
        If optimization_result lacks 'optimized_data', 'validation_report'
        or 'contamination_handling'
    """

    print("=== Step 4a: Generating Final Datasets and Documentation ===")

    # Check the inputs first so nothing is asked of the user or written to
    # disk when the Step 3b result is incomplete.
    for required_key in ('optimized_data', 'validation_report', 'contamination_handling'):
        if required_key not in optimization_result:
            raise KeyError(required_key)

    if output_dir is None:
        root = tk.Tk()
        try:
            root.withdraw()  # Hide the main tkinter window
            print("Opening directory selector for saving outputs...")
            output_dir_str = filedialog.askdirectory(title="Select a Folder to Save Output Files")
        finally:
            # Release the hidden root window once the dialog has closed
            root.destroy()

        if not output_dir_str:
            output_dir = Path('outputs')
            print(f"No directory selected. Saving files to a new folder: '{output_dir}'")
        else:
            output_dir = Path(output_dir_str)
            print(f"Files will be saved to: '{output_dir}'")
    else:
        output_dir = Path(output_dir)
        print(f"Files will be saved to: '{output_dir}'")

    # Create the output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)

    optimized_data = optimization_result['optimized_data']

    generated_files = {}
    export_summary = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'total_responses': len(optimized_data),
        'total_variables': len(optimized_data.columns),
        'files_created': []
    }

    print(f"\nCreating final datasets from {len(optimized_data):,} responses × {len(optimized_data.columns):,} variables")

    # Each writer records its file in generated_files / export_summary
    analysis_dataset = create_analysis_dataset(optimized_data, generated_files, export_summary, output_dir)
    comprehensive_codebook = create_comprehensive_codebook(optimized_data, structure_result, optimization_result, generated_files, export_summary, output_dir)
    quality_report = create_data_quality_report(optimization_result, generated_files, export_summary, output_dir)
    variable_summaries = create_variable_summaries(optimized_data, generated_files, export_summary, output_dir)
    metadata_file = create_metadata_file(optimization_result, structure_result, export_summary, generated_files, output_dir)

    return {
        'generated_files': generated_files,
        'export_summary': export_summary,
        'analysis_dataset': analysis_dataset,
        'codebook': comprehensive_codebook,
        'quality_report': quality_report,
        'variable_summaries': variable_summaries,
        'metadata': metadata_file
    }

def create_analysis_dataset(df, generated_files, export_summary, output_dir):
    """
    Write the main analysis dataset as CSV with columns grouped by role.

    Columns are written in the order: metadata, quality flags, original
    questions, derived variables, everything else. Every column is placed
    in exactly one group, so a name matching several rules (for example
    'q1_location' or 'flag_short_duration') is exported once. The
    'flag_' prefix takes precedence, then metadata keywords, then the
    question and derived-variable rules.

    Parameters:
    -----------
    df : pandas.DataFrame
        Optimized data to export (not modified)
    generated_files : dict
        Receives the output path under 'analysis_dataset'
    export_summary : dict
        Its 'files_created' list receives the output path
    output_dir : str or Path
        Folder in which 'qualtrics_analysis_ready.csv' is written

    Returns:
    --------
    dict : Filename, shape and number of columns per group
    """
    print("\n--- Creating Main Analysis Dataset ---")

    metadata_keywords = ('date', 'status', 'progress', 'duration', 'finished', 'recipient',
                         'location', 'distribution', 'language', 'ipaddress', 'responseid')
    summary_suffixes = ('_response_count', '_completion_rate')
    derived_markers = ('_clean', '_constant') + summary_suffixes

    # Column positions per group; positions (not labels) keep the selection
    # unambiguous even if a label occurs more than once.
    groups = {'metadata': [], 'flags': [], 'questions': [], 'derived': [], 'other': []}
    for position, col in enumerate(df.columns):
        name = str(col)
        lowered = name.lower()
        if name.startswith('flag_'):
            group = 'flags'
        elif any(keyword in lowered for keyword in metadata_keywords):
            group = 'metadata'
        elif (name.startswith('q')
              and not name.endswith(('_clean', '_constant'))
              and not any(suffix in name for suffix in summary_suffixes)):
            group = 'questions'
        elif any(marker in name for marker in derived_markers):
            group = 'derived'
        else:
            group = 'other'
        groups[group].append(position)

    ordered_positions = (groups['metadata'] + groups['flags'] + groups['questions']
                         + groups['derived'] + groups['other'])
    analysis_data = df.iloc[:, ordered_positions].copy()

    filepath = Path(output_dir) / 'qualtrics_analysis_ready.csv'
    analysis_data.to_csv(filepath, index=False)
    generated_files['analysis_dataset'] = str(filepath)
    export_summary['files_created'].append(str(filepath))

    analysis_info = {
        'filename': str(filepath),
        'shape': analysis_data.shape,
        'column_organization': {
            'metadata_columns': len(groups['metadata']),
            'quality_flags': len(groups['flags']),
            'original_questions': len(groups['questions']),
            'derived_variables': len(groups['derived']),
            'other_columns': len(groups['other'])
        }
    }
    print(f"Created {filepath.name}:")
    print(f"  Shape: {analysis_data.shape[0]:,} × {analysis_data.shape[1]:,}")
    return analysis_info

def create_comprehensive_codebook(df, structure_result, optimization_result, generated_files, export_summary, output_dir):
    """
    Write a CSV codebook with one row per column of the final dataset.

    Each row records the variable name, a label (the first-row value of
    the matching column in the original codebook when available, otherwise
    the column name, truncated to 100 characters), dtype, category,
    response counts, response rate and a short value description.

    Parameters:
    -----------
    df : pandas.DataFrame
        Final dataset to document
    structure_result : dict
        Its optional 'codebook' entry is used as the source of labels
    optimization_result : dict
        Accepted for interface compatibility; not read here
    generated_files : dict
        Receives the output path under 'codebook'
    export_summary : dict
        Its 'files_created' list receives the output path
    output_dir : str or Path
        Folder in which 'qualtrics_codebook_comprehensive.csv' is written

    Returns:
    --------
    dict : 'filename' and 'variables_documented'
    """
    print("\n--- Creating Comprehensive Codebook ---")
    original_codebook = structure_result.get('codebook', pd.DataFrame())
    has_codebook = len(original_codebook) > 0
    total_rows = len(df)
    codebook_data = []

    for col in df.columns:
        column = df[col]
        dtype = str(column.dtype)
        non_null_count = column.notna().sum()
        null_count = column.isnull().sum()
        # An empty dataset has no meaningful rate; report 0 instead of dividing by zero
        response_rate = (non_null_count / total_rows) * 100 if total_rows else 0.0

        question_text = col
        if has_codebook and col in original_codebook.columns:
            try:
                question_text = original_codebook[col].iloc[0]
            except (IndexError, KeyError, AttributeError):
                # Label lookup is best-effort: fall back to the column name
                question_text = col

        label = str(question_text)
        if len(label) > 100:
            label = label[:100] + '...'

        codebook_data.append({
            'variable_name': col,
            'variable_label': label,
            'variable_type': dtype,
            'variable_category': categorize_variable(col, dtype),
            'total_responses': total_rows,
            'valid_responses': non_null_count,
            'missing_responses': null_count,
            'response_rate_percent': round(response_rate, 1),
            'value_information': get_value_information(column, dtype)
        })

    codebook_df = pd.DataFrame(codebook_data)

    filepath = Path(output_dir) / 'qualtrics_codebook_comprehensive.csv'
    codebook_df.to_csv(filepath, index=False)
    generated_files['codebook'] = str(filepath)
    export_summary['files_created'].append(str(filepath))

    print(f"Created {filepath.name} with {len(codebook_df)} variable definitions")
    return {'filename': str(filepath), 'variables_documented': len(codebook_df)}

def create_data_quality_report(optimization_result, generated_files, export_summary, output_dir):
    """
    Write the data quality report as JSON.

    The report combines the validation report, contamination handling and
    optimization log found in ``optimization_result``. Values that JSON
    cannot encode natively are written via ``str``.

    Parameters:
    -----------
    optimization_result : dict
        Must contain 'validation_report', 'contamination_handling' and
        'optimization_log'
    generated_files : dict
        Receives the output path under 'quality_report'
    export_summary : dict
        Its 'files_created' list receives the output path
    output_dir : Path
        Folder in which 'qualtrics_data_quality_report.json' is written

    Returns:
    --------
    dict : {'filename': path of the written report}
    """
    print("\n--- Creating Data Quality Report ---")

    validation_report = optimization_result['validation_report']
    overview = validation_report['data_overview']

    report_metadata = {
        'created_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'data_shape': overview['shape'],
        'memory_usage_mb': overview['memory_usage_mb']
    }
    quality_report = {
        'report_metadata': report_metadata,
        'data_type_summary': validation_report['type_distribution'],
        'contamination_summary': optimization_result['contamination_handling'],
        'validation_issues': validation_report.get('validation_issues', []),
        'optimization_log': optimization_result['optimization_log'],
        'ready_for_analysis': validation_report['ready_for_analysis']
    }

    filepath = output_dir / 'qualtrics_data_quality_report.json'
    with open(filepath, 'w') as handle:
        json.dump(quality_report, handle, indent=2, default=str)

    written_path = str(filepath)
    generated_files['quality_report'] = written_path
    export_summary['files_created'].append(written_path)

    print(f"Created {filepath.name}")
    return {'filename': written_path}

def create_variable_summaries(df, generated_files, export_summary, output_dir):
    """
    Build summary tables for numeric and categorical columns and save them.

    Numeric columns get ``describe()`` statistics plus a 'response_rate'
    row; categorical columns get one row each with counts and the most
    frequent category. When at least one table exists, all tables are
    written as sheets of 'qualtrics_variable_summaries.xlsx' (openpyxl
    engine) and the path is recorded in the two tracking arguments.

    Parameters:
    -----------
    df : pandas.DataFrame
        Final dataset to summarize
    generated_files : dict
        Receives the output path under 'variable_summaries'
    export_summary : dict
        Its 'files_created' list receives the output path
    output_dir : Path
        Folder in which the workbook is written

    Returns:
    --------
    dict : Sheet name -> summary DataFrame (empty if nothing to summarize)
    """
    print("\n--- Creating Variable Summary Statistics ---")

    summaries = {}
    total_rows = len(df)

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        numeric_frame = df[numeric_cols]
        numeric_summary = numeric_frame.describe()
        # Share of non-missing values per column, as a percentage
        numeric_summary.loc['response_rate'] = ((numeric_frame.notna().sum() / total_rows) * 100).round(1)
        summaries['numeric_variables'] = numeric_summary

    def _summarize_categorical(col):
        # One summary row for a single categorical column
        column = df[col]
        value_counts = column.value_counts()
        has_counts = len(value_counts) > 0
        return {
            'variable': col,
            'total_responses': total_rows,
            'valid_responses': column.notna().sum(),
            'categories': len(column.cat.categories),
            'top_category': value_counts.index[0] if has_counts else 'None',
            'top_category_count': value_counts.iloc[0] if has_counts else 0
        }

    categorical_cols = [col for col in df.columns if isinstance(df[col].dtype, pd.CategoricalDtype)]
    if len(categorical_cols) > 0:
        categorical_rows = [_summarize_categorical(col) for col in categorical_cols]
        summaries['categorical_variables'] = pd.DataFrame(categorical_rows)

    if not summaries:
        return summaries

    filepath = output_dir / 'qualtrics_variable_summaries.xlsx'
    with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
        for sheet_name, summary_df in summaries.items():
            summary_df.to_excel(writer, sheet_name=sheet_name)

    generated_files['variable_summaries'] = str(filepath)
    export_summary['files_created'].append(str(filepath))
    print(f"Created {filepath.name} with {len(summaries)} summary sheets")

    return summaries

def create_metadata_file(optimization_result, structure_result, export_summary, generated_files, output_dir):
    """
    Write a JSON file describing the processing pipeline and its outputs.

    The file is written before its own path is added to
    ``generated_files``, so the 'files_generated' section on disk lists
    only the files recorded up to that point.

    Parameters:
    -----------
    optimization_result : dict
        Must contain 'optimized_data' and 'validation_report'; the
        optional 'contamination_handling' entry is counted
    structure_result : dict
        Its optional 'structure_analysis' -> 'test_responses' is reported
    export_summary : dict
        Supplies 'timestamp'; its 'files_created' list receives the path
    generated_files : dict
        Embedded in the metadata and updated with 'processing_metadata'
    output_dir : Path
        Folder in which 'qualtrics_processing_metadata.json' is written

    Returns:
    --------
    dict : The metadata that was written (sharing ``generated_files``)
    """
    print("\n--- Creating Processing Metadata ---")

    final_data = optimization_result['optimized_data']
    column_names = final_data.columns

    pipeline_steps = {
        'step_1': 'Data loading',
        'step_2a': 'Structure separation',
        'step_2b': 'Cleaning',
        'step_3a': 'Type detection',
        'step_3b': 'Validation',
        'step_4a': 'Output generation'
    }

    # Column-name based counts of what the pipeline added
    flag_columns = [col for col in column_names if col.startswith('flag_')]
    derived_markers = ['_count', '_rate', '_clean', '_constant']
    derived_columns = [col for col in column_names if any(suffix in col for suffix in derived_markers)]

    transformations = {
        'test_responses_removed': structure_result.get('structure_analysis', {}).get('test_responses', 0),
        'quality_flags_created': len(flag_columns),
        'contaminated_columns_handled': len(optimization_result.get('contamination_handling', {})),
        'summary_variables_created': len(derived_columns)
    }

    characteristics = {
        'total_responses': len(final_data),
        'total_variables': len(column_names),
        'ready_for_analysis': optimization_result['validation_report']['ready_for_analysis']
    }

    metadata = {
        'processing_pipeline': pipeline_steps,
        'data_transformations': transformations,
        'final_dataset_characteristics': characteristics,
        'files_generated': generated_files,
        'processing_timestamp': export_summary['timestamp']
    }

    filepath = output_dir / 'qualtrics_processing_metadata.json'
    with open(filepath, 'w') as handle:
        json.dump(metadata, handle, indent=2, default=str)

    generated_files['processing_metadata'] = str(filepath)
    export_summary['files_created'].append(str(filepath))
    print(f"Created {filepath.name} with complete processing documentation")

    return metadata

def categorize_variable(col_name, dtype):
    """
    Assign a codebook category to a column based on its name.

    Rules are checked in order and the first match wins: 'flag_' prefix,
    temporal keywords, administrative keywords, technical keywords,
    question columns (leading 'q' without a derived suffix), then the
    derived suffixes. The keyword checks are case-insensitive substring
    matches, so they remain heuristics.

    The IP rule only matches 'ipaddress', 'ip_address' or a column named
    exactly 'ip'; a bare 'ip' substring would also hit names such as
    'description'. Derived suffixes must end the name, so 'q4_country' is
    not mistaken for a '_count' metric.

    Parameters:
    -----------
    col_name : str
        Column name to classify
    dtype : str
        Column dtype as text (accepted for interface compatibility; unused)

    Returns:
    --------
    str : Category label
    """
    name = str(col_name)
    lowered = name.lower()
    derived_suffixes = ('_clean', '_constant', '_count', '_rate')
    technical_keywords = ('recipient', 'location', 'ipaddress', 'ip_address', 'distribution', 'language')

    if name.startswith('flag_'):
        return 'Quality Flag'
    if any(keyword in lowered for keyword in ('date', 'time')):
        return 'Metadata - Temporal'
    if any(keyword in lowered for keyword in ('status', 'progress', 'duration', 'finished')):
        return 'Metadata - Administrative'
    if lowered == 'ip' or any(keyword in lowered for keyword in technical_keywords):
        return 'Metadata - Technical'
    if name.startswith('q') and not name.endswith(derived_suffixes):
        return 'Survey Question'
    if name.endswith('_clean'):
        return 'Derived - Cleaned Variable'
    if name.endswith('_constant'):
        return 'Derived - Constant Value'
    if name.endswith(('_count', '_rate')):
        return 'Derived - Summary Metric'
    return 'Other'

def get_value_information(series, dtype):
    """
    Describe the values of a column in one short sentence.

    The description depends on the dtype text: a date range for datetime
    columns, range and median for int/float columns, category counts for
    categorical columns, and a unique count with up to three sample
    values otherwise. Any error while describing is reported in the
    returned text rather than raised.

    Parameters:
    -----------
    series : pandas.Series
        Column to describe
    dtype : str
        The column's dtype as text

    Returns:
    --------
    str : Human-readable value description
    """
    if series.notna().sum() == 0:
        return "All missing values"

    try:
        if 'datetime' in dtype:
            min_date, max_date = series.min(), series.max()
            return f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}"

        dtype_lower = dtype.lower()
        if 'int' in dtype_lower or 'float' in dtype_lower:
            return f"Range: {series.min()} to {series.max()}, Median: {series.median()}"

        if 'category' in dtype:
            # At most five counts are shown; the wording says whether more exist
            counts = series.value_counts().head(5).to_dict()
            if len(series.cat.categories) > 5:
                return f"Top 5 categories: {counts}"
            return f"Categories: {counts}"

        return f"{series.nunique()} unique values, Sample: {list(series.dropna().unique()[:3])}"
    except Exception as e:
        return f"Value analysis error: {str(e)}"

# Example usage: run Step 4a on the results of the earlier steps
if __name__ == "__main__":
    try:
        # 'optimization_result' and 'structure_result' must already exist;
        # they are produced by running Steps 1, 2a, 2b, 3a and 3b first.
        final_result = generate_final_datasets(optimization_result, structure_result)

        generated_files = final_result['generated_files']
        export_summary = final_result['export_summary']

        print("\n=== Step 4a Summary ===")
        print("Final dataset generation completed:")
        print(f"  Processing timestamp: {export_summary['timestamp']}")
        print(f"  Total responses processed: {export_summary['total_responses']:,}")
        print(f"  Total variables created: {export_summary['total_variables']:,}")
        print(f"  Files generated: {len(export_summary['files_created'])}\n")

        # Only the file names are listed, not the full paths
        print("Files created in your selected directory:")
        for filepath in generated_files.values():
            print(f"  - {Path(filepath).name}")

        print("\n✓ Step 4a Complete: All analysis-ready files generated")
        print("The Qualtrics processing pipeline is now complete!")

    except NameError:
        print("❌ Please run the previous steps (1 through 3b) first to create the required input variables for this step.")
    except Exception as e:
        print(f"❌ Error in Step 4a: {e}")