# Customer Data Aggregation for Route Optimization

This notebook extends the customer clustering functionality to include data aggregation capabilities. It allows you to:
1. Load customer data from CSV files
2. Select specific variables to aggregate
3. Apply various aggregation methods (sum, mean, count, etc.)
4. Export the aggregated data to CSV or pickle format

---

## Cell 1: Imports and Initial Setup

In [1]:
import os
import sys
import json
from pathlib import Path
from datetime import datetime

import pandas as pd
import numpy as np
import chardet
import re
from tqdm import tqdm
import logging
import warnings

# Silence warnings for the rest of the session. A bare 'ignore' filter is not
# selective: it hides every warning category, not only non-critical ones.
warnings.filterwarnings('ignore')

# Configure the root logger: INFO level, with each record rendered as
# "timestamp - logger name - level - message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Named logger for this notebook.
logger = logging.getLogger('route_optimization')

# Visible confirmation in the cell output that the setup cell ran.
print("✅ Imports complete")

✅ Imports complete


## Cell 2: Set up project paths and folders

In [2]:
def setup_project():
    """Locate the input folder and prepare the output folder.

    Both folders live under ``<parent of cwd>/02 Data/01_processed_data``.

    Returns:
        tuple: ``(input_path, output_path)`` as ``Path`` objects.

    Raises:
        SystemExit: with code 1 when the input folder does not exist.
    """
    processed_dir = Path.cwd().parent / '02 Data' / '01_processed_data'
    input_path = processed_dir / '01_clean_data'
    # NOTE(review): the recorded cell output shows '02_agregated_data' here —
    # confirm which folder name is the intended one.
    output_path = processed_dir / '04_agregated_data'

    if input_path.exists():
        # The output folder is created on demand, including missing parents
        output_path.mkdir(parents=True, exist_ok=True)
        print(f"Project setup complete. \n Input path: {input_path} \n Output path: {output_path}")
        return input_path, output_path

    # Nothing to read from: report and stop
    print(f"Error: Input directory '{input_path}' does not exist.")
    print("Please create this directory or modify the path.")
    sys.exit(1)

def load_api_key(file_path="api_keys.json"):
    """Load the HERE API key from a JSON file.

    Args:
        file_path: Path to a JSON file whose top-level object holds a
            "HERE_API_KEY" entry.

    Returns:
        The value stored under "HERE_API_KEY", or None when the file cannot
        be read or parsed, its top level is not a JSON object, or the entry
        is missing or empty. A warning is printed in each of those cases.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which
        # json.load would otherwise reject; it also avoids depending on the
        # platform's default text encoding.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            api_keys = json.load(f)
    except (OSError, ValueError, TypeError) as e:
        # OSError: missing/unreadable file; ValueError: bad JSON or bad bytes;
        # TypeError: file_path is not a usable path value
        print(f"⚠️ Error loading API key: {e}")
        return None

    if not isinstance(api_keys, dict):
        print("⚠️ Error loading API key: expected a JSON object at the top level")
        return None

    api_key = api_keys.get("HERE_API_KEY")
    if not api_key:
        print("⚠️ No HERE API key found in the JSON file")
        return None
    return api_key

## Cell 3: Data Loading Functions

In [3]:
def _prompt_file_choice(available_files):
    """Print a numbered menu of files and return the one the user selects."""
    print("Available files:")
    for i, f in enumerate(available_files, start=1):
        print(f"{i}: {f.name}")

    while True:
        try:
            choice = int(input(f"Choose file number (1-{len(available_files)}): ").strip()) - 1
        except ValueError:
            print("Please enter a valid number.")
            continue
        if 0 <= choice < len(available_files):
            return available_files[choice]
        print(f"Please enter a number between 1 and {len(available_files)}")


def _detect_encoding(file_path):
    """Run chardet over the whole file, report the guess, and return the encoding name."""
    print(f"Detecting encoding for {file_path.name}...")
    with open(file_path, 'rb') as file:
        result = chardet.detect(file.read())
    encoding = result['encoding']
    # Guard the format spec below against a missing/None confidence
    confidence = result.get('confidence') or 0.0
    print(f"Detected encoding: {encoding} (confidence: {confidence:.1%})")
    return encoding


def _prompt_delimiter(file_path, encoding):
    """Preview the file with each candidate delimiter and return the one the user picks."""
    print("\nAnalyzing potential delimiters:\n")
    delimiters = [',', ';', r'\t', '|']  # Raw string for tab to avoid escape issues
    delimiter_options = {}
    for i, delim in enumerate(delimiters, start=1):
        try:
            preview_df = pd.read_csv(file_path, engine='python', encoding=encoding, sep=delim, nrows=3)
            col_count = len(preview_df.columns)
            delimiter_options[i] = (delim, col_count)
            print(f"{i}: Delimiter '{delim}'\n   Found {col_count} columns")
            print(f"   Preview with option {i}:")
            display(preview_df.head(3))
            print("-" * 80 + "\n")
        except Exception as e:
            print(f"{i}: Error with delimiter '{delim}': {e}")

    if not delimiter_options:
        print("No valid delimiters found. Please check the file format.")
        sys.exit(1)

    # Suggest the delimiter that splits the preview into the most columns
    suggested = max(delimiter_options, key=lambda k: delimiter_options[k][1])
    print(f"Suggested option: {suggested} ('{delimiter_options[suggested][0]}') with {delimiter_options[suggested][1]} columns")

    # Option numbers may be sparse when a candidate failed, so list them explicitly
    valid_numbers = ", ".join(str(k) for k in delimiter_options)
    while True:
        reply = input(f"\nChoose delimiter option (1-{len(delimiters)}) [default: {suggested}]: ").strip()
        if not reply:
            return delimiter_options[suggested][0]
        try:
            picked = int(reply)
        except ValueError:
            print("Please enter a valid number or press Enter for default.")
            continue
        if picked in delimiter_options:
            return delimiter_options[picked][0]
        print(f"Please enter one of the available options ({valid_numbers}) or press Enter for default.")


def load_data(input_path):
    """Interactively choose a CSV file from ``input_path`` and load it.

    The user picks a file from a numbered list, the encoding is guessed with
    chardet, and a preview is shown for each candidate delimiter before the
    user confirms one. A short overview of the loaded frame is printed.

    Args:
        input_path: ``Path`` of the directory to search for ``*.csv`` files.

    Returns:
        tuple: ``(df, file_path)`` — the loaded DataFrame and the chosen file.

    Raises:
        SystemExit: with code 1 when no CSV file is found, no candidate
            delimiter can parse the file, or the final load fails.
    """
    # Sorted so that the menu numbering is the same on every run
    available_files = sorted(input_path.glob("*.csv"))
    if not available_files:
        print(f"No CSV files found in {input_path}")
        sys.exit(1)

    file_path = _prompt_file_choice(available_files)
    encoding = _detect_encoding(file_path)
    chosen_delim = _prompt_delimiter(file_path, encoding)
    print(f"Using delimiter: '{chosen_delim}'")

    # Load the full CSV with chosen delimiter and encoding
    try:
        df = pd.read_csv(file_path, encoding=encoding, sep=chosen_delim)
    except Exception as e:
        print(f"❌ Failed to load CSV: {e}")
        sys.exit(1)
    print(f"\n✅ Loaded {df.shape[0]} rows × {df.shape[1]} columns from {file_path.name}")

    # Display data overview; labels are stringified because join() needs str
    column_names = [str(name) for name in df.columns]
    hidden_count = len(column_names) - 5
    print("\nData Overview:")
    if hidden_count > 0:
        print(f"Column names: {', '.join(column_names[:5])}, ... (and {hidden_count} more columns)")
    else:
        print(f"Column names: {', '.join(column_names)}")
    print(f"\nData types (first 5 columns):\n{df.dtypes.iloc[:5]}")
    print(f"... (and {hidden_count} more columns)" if hidden_count > 0 else "")
    print("\nSample data:")
    display(df.head(3))
    print("-" * 80)

    return df, file_path

## Cell 4: Initialize the Project

In [4]:
# Resolve the input/output folders, then interactively pick and load one CSV.
# `df` (the loaded table) and `file_path` (the chosen file) are used by later cells.
input_path, output_path = setup_project()
df, file_path = load_data(input_path)

Project setup complete. 
 Input path: C:\Users\User\Dropbox\Personal\CareerFoundry\06 Sourcing data\Notebook folder\02 Data\01_processed_data\01_clean_data 
 Output path: C:\Users\User\Dropbox\Personal\CareerFoundry\06 Sourcing data\Notebook folder\02 Data\01_processed_data\02_agregated_data
Available files:
1: work_time_and_km_clean.csv


Choose file number (1-1):  1


Detecting encoding for work_time_and_km_clean.csv...
Detected encoding: ascii (confidence: 100.0%)

Analyzing potential delimiters:

1: Delimiter ','
   Found 10 columns
   Preview with option 1:


Unnamed: 0,Date,Year,Month,Day,Route,Route_id,Start_time,end_time,time,distance
0,2025-03-17,2025,3,17,102,2,2025-05-16 09:00:00,2025-05-16 19:25:00,10.42,115.0
1,2025-03-18,2025,3,18,202,2,2025-05-16 09:34:00,2025-05-16 19:02:00,9.47,53.19
2,2025-03-19,2025,3,19,302,2,2025-05-16 08:22:00,2025-05-16 17:36:00,9.23,49.32


--------------------------------------------------------------------------------

2: Delimiter ';'
   Found 1 columns
   Preview with option 2:


Unnamed: 0,"Date,Year,Month,Day,Route,Route_id,Start_time,end_time,time,distance"
0,"2025-03-17,2025,3,17,102,2,2025-05-16 09:00:00..."
1,"2025-03-18,2025,3,18,202,2,2025-05-16 09:34:00..."
2,"2025-03-19,2025,3,19,302,2,2025-05-16 08:22:00..."


--------------------------------------------------------------------------------

3: Delimiter '\t'
   Found 1 columns
   Preview with option 3:


Unnamed: 0,"Date,Year,Month,Day,Route,Route_id,Start_time,end_time,time,distance"
0,"2025-03-17,2025,3,17,102,2,2025-05-16 09:00:00..."
1,"2025-03-18,2025,3,18,202,2,2025-05-16 09:34:00..."
2,"2025-03-19,2025,3,19,302,2,2025-05-16 08:22:00..."


--------------------------------------------------------------------------------

4: Delimiter '|'
   Found 1 columns
   Preview with option 4:


Unnamed: 0,"Date,Year,Month,Day,Route,Route_id,Start_time,end_time,time,distance"
0,"2025-03-17,2025,3,17,102,2,2025-05-16 09:00:00..."
1,"2025-03-18,2025,3,18,202,2,2025-05-16 09:34:00..."
2,"2025-03-19,2025,3,19,302,2,2025-05-16 08:22:00..."


--------------------------------------------------------------------------------

Suggested option: 1 (',') with 10 columns



Choose delimiter option (1-4) [default: 1]:  


Using delimiter: ','

✅ Loaded 20 rows × 10 columns from work_time_and_km_clean.csv

Data Overview:
Column names: Date, Year, Month, Day, Route, ... (and 5 more columns)

Data types (first 5 columns):
Date     object
Year      int64
Month     int64
Day       int64
Route     int64
dtype: object
... (and 5 more columns)

Sample data:


Unnamed: 0,Date,Year,Month,Day,Route,Route_id,Start_time,end_time,time,distance
0,2025-03-17,2025,3,17,102,2,2025-05-16 09:00:00,2025-05-16 19:25:00,10.42,115.0
1,2025-03-18,2025,3,18,202,2,2025-05-16 09:34:00,2025-05-16 19:02:00,9.47,53.19
2,2025-03-19,2025,3,19,302,2,2025-05-16 08:22:00,2025-05-16 17:36:00,9.23,49.32


--------------------------------------------------------------------------------


## Cell 5: Data Aggregation - Introduction

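This section lets you aggregate the data by one or more grouping variables. Aggregation is useful for:
- Summarizing data by groups (e.g., customers by region)
- Computing statistics (sums, averages, etc.) for each group
- Preparing data for further analysis or visualization

Note: the group-level results are written back to the original table as additional columns (every row receives the value computed for its group), so the number of rows does not change.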

## Cell 6: Adding New Variables (Optional)

In [5]:
# Binary arithmetic shared by the "column with constant" and "column with column" modes
_ARITHMETIC_OPS = {
    '+': lambda left, right: left + right,
    '-': lambda left, right: left - right,
    '*': lambda left, right: left * right,
    '/': lambda left, right: left / right,
}

# Comparisons offered by the conditional mode
_COMPARISON_OPS = {
    '>': lambda left, right: left > right,
    '<': lambda left, right: left < right,
    '==': lambda left, right: left == right,
    '>=': lambda left, right: left >= right,
    '<=': lambda left, right: left <= right,
}


def _ask_column(df, prompt):
    """Prompt for a 1-based column number; return its label, or None when out of range.

    Raises ValueError when the reply is not an integer.
    """
    idx = int(input(prompt).strip()) - 1
    if 0 <= idx < len(df.columns):
        return df.columns[idx]
    return None


def _add_scalar_variable(df, new_var_name):
    """Operation type 1: create ``new_var_name`` from one column and a constant (in place)."""
    col_name = _ask_column(df, "Choose column number: ")
    if col_name is None:
        print("Invalid column number.")
        return

    operation = input("Enter operation (+, -, *, / followed by a number, e.g., '*2'): ").strip()
    # Slicing (not indexing) so that an empty reply is reported instead of raising IndexError
    try:
        value = float(operation[1:])
    except ValueError:
        print("Invalid number in operation")
        return

    apply_op = _ARITHMETIC_OPS.get(operation[:1])
    if apply_op is None:
        print("Invalid operation. Please use +, -, *, or /")
        return

    df[new_var_name] = apply_op(df[col_name], value)
    print(f"✅ Created new column '{new_var_name}'")
    print(df[[col_name, new_var_name]].head())


def _add_two_column_variable(df, new_var_name):
    """Operation type 2: create ``new_var_name`` by combining two columns (in place)."""
    col1_name = _ask_column(df, "Choose first column number: ")
    col2_name = _ask_column(df, "Choose second column number: ")
    if col1_name is None or col2_name is None:
        print("Invalid column number(s).")
        return

    operation = input("Enter operation between columns (+, -, *, /): ").strip()
    apply_op = _ARITHMETIC_OPS.get(operation)
    if apply_op is None:
        print("Invalid operation. Please use +, -, *, or /")
        return

    df[new_var_name] = apply_op(df[col1_name], df[col2_name])
    print(f"✅ Created new column '{new_var_name}'")
    print(df[[col1_name, col2_name, new_var_name]].head())


def _add_conditional_variable(df, new_var_name):
    """Operation type 3: create ``new_var_name`` from a threshold test on one column (in place)."""
    col_name = _ask_column(df, "Choose column number for condition: ")
    if col_name is None:
        print("Invalid column number.")
        return

    condition_type = input("Choose condition type (>, <, ==, >=, <=): ").strip()
    threshold = input("Enter threshold value: ").strip()
    try:
        threshold_val = float(threshold)
    except ValueError:
        print("Invalid threshold value")
        return

    true_raw = input("Value if condition is true: ").strip()
    false_raw = input("Value if condition is false: ").strip()
    # Use numbers only when BOTH replies are numeric; otherwise keep both as
    # typed so the new column does not mix e.g. '1.0' with 'Low'.
    try:
        true_value, false_value = float(true_raw), float(false_raw)
    except ValueError:
        true_value, false_value = true_raw, false_raw

    compare = _COMPARISON_OPS.get(condition_type)
    if compare is None:
        print("Invalid condition type.")
        return

    df[new_var_name] = np.where(compare(df[col_name], threshold_val), true_value, false_value)
    print(f"✅ Created new column '{new_var_name}'")
    print(df[[col_name, new_var_name]].head())


def add_new_variables(df):
    """Interactively add calculated columns to a copy of ``df``.

    The user may repeatedly create a column from (1) a column and a constant,
    (2) two columns, or (3) a threshold condition, until they enter 'done'.
    Invalid replies are reported and the loop continues.

    Args:
        df: DataFrame to extend; it is not modified.

    Returns:
        ``df`` itself when the user declines the step, otherwise a copy of
        ``df`` carrying the newly created columns.
    """
    add_vars = input("Would you like to add new calculated variables before aggregation? (yes/no): ").strip().lower()

    if add_vars not in ['yes', 'y']:
        print("Skipping the addition of new variables.")
        return df

    print("\n=== Adding New Variables ===")
    print("This feature allows you to create new columns based on existing data.")

    # Work on a copy so the caller's frame stays untouched
    df_modified = df.copy()

    handlers = {
        1: _add_scalar_variable,
        2: _add_two_column_variable,
        3: _add_conditional_variable,
    }

    while True:
        print("\nAvailable columns:")
        for i, col in enumerate(df_modified.columns, 1):
            print(f"{i}: {col}")

        new_var_name = input("\nEnter name for the new variable (or 'done' to finish): ").strip()

        if new_var_name.lower() == 'done':
            break

        if not new_var_name:
            print("⚠️ Variable name cannot be empty.")
            continue

        if new_var_name in df_modified.columns:
            print(f"⚠️ Column '{new_var_name}' already exists. Please choose a different name.")
            continue

        print("\nChoose operation type:")
        print("1: Simple arithmetic on one column (e.g., multiply a column by a value)")
        print("2: Operation between two columns (e.g., sum of two columns)")
        print("3: Apply a condition (e.g., if column X > 10 then 'High' else 'Low')")

        try:
            op_type = int(input("Enter operation type (1-3): ").strip())
            handler = handlers.get(op_type)
            if handler is None:
                print("Invalid operation type. Please enter a number between 1 and 3.")
            else:
                handler(df_modified, new_var_name)
        except ValueError:
            # Non-numeric reply to one of the "choose a number" prompts
            print("Please enter a valid number.")
        except TypeError as e:
            # e.g. arithmetic or comparison requested on a text column
            print(f"⚠️ Could not compute '{new_var_name}': {e}")

    print(f"\n✅ Added {len(df_modified.columns) - len(df.columns)} new variables to the dataset.")
    return df_modified

# Run the optional step. `df` is rebound to the returned frame: the same object
# when the step is skipped, otherwise a copy that includes the new columns.
df = add_new_variables(df)

Would you like to add new calculated variables before aggregation? (yes/no):  no


Skipping the addition of new variables.


## Cell 7: Variable Selection for Aggregation

In [6]:
# Menu order defines the number the user types for each aggregation method
_AGGREGATION_MENU = (
    ('sum', 'Sum of values'),
    ('mean', 'Average of values'),
    ('median', 'Median of values'),
    ('min', 'Minimum value'),
    ('max', 'Maximum value'),
    ('count', 'Count of values'),
    ('std', 'Standard deviation'),
)


def _parse_menu_numbers(raw):
    """Turn a reply such as '1, 3,4' into zero-based indices; raises ValueError on non-integers."""
    return [int(token.strip()) - 1 for token in raw.split(',')]


def _pick_unique(options, indices):
    """Return the options at the in-range indices, in first-seen order and without repeats."""
    picked = []
    for idx in indices:
        if 0 <= idx < len(options) and options[idx] not in picked:
            picked.append(options[idx])
    return picked


def select_aggregation_variables(df):
    """Interactively choose groupby columns, value columns and aggregation methods.

    Out-of-range numbers are ignored and repeated numbers are collapsed, so
    each column and each method is selected at most once.

    Args:
        df: DataFrame whose columns are offered for selection.

    Returns:
        tuple: ``(df, groupby_cols, agg_methods)`` where ``groupby_cols`` is a
        list of column labels and ``agg_methods`` maps each value column to a
        list of method names. Both are None when the user cancels, enters
        invalid input, selects nothing valid, or ``df`` has no numeric column.
    """
    print("\n=== Data Aggregation Setup ===")

    # Step 1: grouping columns (any column may be used)
    print("\nStep 1: Select columns to group by (e.g., region, customer_type)")
    print("Available columns:")
    for i, col in enumerate(df.columns, 1):
        print(f"{i}: {col}")

    groupby_reply = input("\nEnter column numbers to group by (comma-separated) or 'cancel' to skip: ").strip()

    if groupby_reply.lower() == 'cancel':
        print("Aggregation cancelled.")
        return df, None, None

    try:
        groupby_cols = _pick_unique(df.columns, _parse_menu_numbers(groupby_reply))
    except ValueError:
        print("⚠️ Invalid input. Skipping aggregation.")
        return df, None, None

    if not groupby_cols:
        print("⚠️ No valid groupby columns selected. Skipping aggregation.")
        return df, None, None

    # Labels are stringified because join() only accepts str
    print(f"Selected groupby columns: {', '.join(map(str, groupby_cols))}")

    # Step 2: value columns (numeric only)
    print("\nStep 2: Select columns to aggregate and their aggregation methods")
    print("Available columns:")
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()

    if not numeric_cols:
        print("⚠️ No numeric columns found for aggregation.")
        return df, None, None

    for i, col in enumerate(numeric_cols, 1):
        print(f"{i}: {col}")

    agg_reply = input("\nEnter column numbers to aggregate (comma-separated): ").strip()
    try:
        agg_cols = _pick_unique(numeric_cols, _parse_menu_numbers(agg_reply))
    except ValueError:
        print("⚠️ Invalid input. Skipping aggregation.")
        return df, None, None

    if not agg_cols:
        print("⚠️ No valid aggregation columns selected. Skipping aggregation.")
        return df, None, None

    print(f"Selected aggregation columns: {', '.join(map(str, agg_cols))}")

    # Step 3: one or more methods per value column
    print("\nAvailable aggregation methods:")
    for number, (method_name, description) in enumerate(_AGGREGATION_MENU, 1):
        print(f"{number}: {method_name} - {description}")
    method_names = [method_name for method_name, _ in _AGGREGATION_MENU]

    agg_methods = {}
    for col in agg_cols:
        # Re-ask until at least one valid method number is given for this column
        while True:
            method_reply = input(f"Choose aggregation method(s) for '{col}' (comma-separated numbers): ").strip()
            try:
                methods = _pick_unique(method_names, _parse_menu_numbers(method_reply))
            except ValueError:
                print("Please enter valid method numbers.")
                continue

            if methods:
                agg_methods[col] = methods
                print(f"For '{col}', will calculate: {', '.join(methods)}")
                break
            print("Please select at least one valid method.")

    return df, groupby_cols, agg_methods

## Cell 8: Perform Aggregation

In [7]:
def _ask_target_column(df, source_col, method):
    """Ask where one aggregate should be stored; return the column name, or None to skip it."""
    suggested_name = f"{source_col}_{method}"
    print(f"\nWhere should the result of {suggested_name} be stored?")
    print("1: Create a new column")
    print("2: Use an existing column")

    choice = input("Enter choice (1 or 2): ").strip()
    # Re-ask on anything else so a typo cannot silently select "existing column"
    while choice not in ("1", "2"):
        print("Please enter 1 or 2.")
        choice = input("Enter choice (1 or 2): ").strip()

    if choice == "1":
        col_name = input(f"Enter new column name [default: {suggested_name}]: ").strip() or suggested_name
        if col_name in df.columns:
            overwrite = input(f"Column '{col_name}' already exists. Overwrite? (yes/no): ").strip().lower()
            if overwrite not in ['yes', 'y']:
                print("Skipping this aggregation.")
                return None
        return col_name

    # choice == "2": pick one of the existing columns by number
    print("\nAvailable columns:")
    for i, existing_col in enumerate(df.columns, 1):
        print(f"{i}: {existing_col}")

    while True:
        try:
            idx = int(input("\nChoose column number: ").strip()) - 1
        except ValueError:
            print("Please enter a valid number.")
            continue
        if 0 <= idx < len(df.columns):
            return df.columns[idx]
        print(f"Please enter a number between 1 and {len(df.columns)}")


def perform_aggregation(df, groupby_cols, agg_methods):
    """Compute group-level aggregates and store them as columns alongside the original rows.

    For every (column, method) pair the user is asked for a target column.
    Each row then receives the aggregate of the group it belongs to, so the
    result keeps the input's rows, order and index.

    Args:
        df: Source DataFrame; it is not modified.
        groupby_cols: List of column labels to group by, or None.
        agg_methods: Mapping of value column to a list of aggregation method
            names ('sum', 'mean', 'median', 'min', 'max', 'count', 'std'),
            or None.

    Returns:
        A copy of ``df`` with the aggregated columns added or overwritten.
        ``df`` itself is returned when either argument is None or when an
        error occurs (the error is printed).
    """
    if groupby_cols is None or agg_methods is None:
        print("Skipping aggregation due to incomplete setup.")
        return df

    print("\n=== Performing Aggregation ===")

    try:
        df_result = df.copy()

        # Collect every storage decision first, then compute
        target_columns = {}
        for col, methods in agg_methods.items():
            for method in methods:
                col_name = _ask_target_column(df_result, col, method)
                if col_name is not None:
                    target_columns[(col, method)] = col_name

        # Group the untouched input so that overwriting a source column in
        # df_result cannot affect aggregates computed afterwards.
        grouped = df.groupby(groupby_cols)

        for (source_col, method), target_col in target_columns.items():
            print(f"Calculating {source_col}_{method} and storing in {target_col}...")
            # transform() broadcasts the group value to each member row and
            # keeps the original index, so plain assignment aligns row by row
            df_result[target_col] = grouped[source_col].transform(method)

        print(f"\n✅ Aggregation complete. Added {len(target_columns)} aggregated values to the original dataset.")
        print("\nUpdated data preview:")
        print(df_result.head())

        return df_result

    except Exception as e:
        # Best effort: report the problem and hand back the input unchanged
        print(f"❌ Error during aggregation: {e}")
        return df

## Cell 9 – Run Multiple Aggregation Cycles and Export Aggregated Data
This section orchestrates repeated aggregation passes and writes the final dataset to disk.

---

In [8]:
def run_aggregation_cycles(df, input_path, output_path):
    """Run one or more interactive aggregation cycles.

    Each cycle selects variables with ``select_aggregation_variables`` and
    applies them with ``perform_aggregation``. A cycle only counts as
    performed when it actually changed the data, so a failed or fully skipped
    aggregation does not trigger an export of unchanged data.

    Args:
        df: Starting DataFrame; it is not modified.
        input_path: Unused here; kept so existing calls keep working.
        output_path: Unused here; kept so existing calls keep working.

    Returns:
        tuple: ``(result_df, aggregation_performed)``. When no cycle changed
        the data, ``result_df`` is the original ``df`` and the flag is False.
    """
    current_df = df.copy()
    aggregation_performed = False

    while True:
        _, current_groupby_cols, current_agg_methods = select_aggregation_variables(current_df)

        if current_groupby_cols is None or current_agg_methods is None:
            # Cancelled or invalid selection: stop and keep earlier results, if any
            print("Skipping this aggregation cycle.")
            break

        updated_df = perform_aggregation(current_df, current_groupby_cols, current_agg_methods)

        # perform_aggregation hands back its input when it fails; also treat
        # an identical result (every aggregate skipped) as "nothing happened"
        if updated_df is current_df or updated_df.equals(current_df):
            print("⚠️ This cycle did not change the data.")
        else:
            current_df = updated_df
            aggregation_performed = True

        print("\n=== Aggregation Cycle Complete ===")
        another_cycle = input("\nDo you want to run another aggregation cycle with different groupby variables? (yes/no): ").strip().lower()

        if another_cycle not in ['yes', 'y']:
            print("Completing aggregation process.")
            break

        print("\n=== Starting New Aggregation Cycle ===")

    if not aggregation_performed:
        # Nothing changed: return the caller's original frame
        return df, False
    return current_df, True

def export_data(df, output_path):
    """Interactively export ``df`` to a CSV or pickle file in ``output_path``.

    The user is asked for a file name (without extension) and a format. CSV
    export uses a comma separator without the row index unless custom
    settings are chosen. An unrecognised format choice falls back to CSV.
    Errors while writing are printed, not raised.

    Args:
        df: DataFrame to export; nothing is written when it is None or empty.
        output_path: Destination directory (``Path`` or ``str``).

    Returns:
        None
    """
    if df is None or df.empty:
        print("No data to export.")
        return

    print("\n=== Data Export ===")

    default_filename = "aggregated_data"
    filename = input(f"Enter output filename (without extension) [default: {default_filename}]: ").strip() or default_filename

    print("\nChoose export format:")
    print("1: CSV (.csv) - Readable text format, compatible with Excel and other tools")
    print("2: Pickle (.pkl) - Preserves data types, faster for loading back into Python")

    export_format = input("Enter format choice (1 or 2): ").strip()

    try:
        # Accept plain strings as well as Path objects
        output_dir = Path(output_path)

        if export_format == "2":
            filepath = output_dir / f"{filename}.pkl"
            df.to_pickle(filepath)
            print(f"✅ Data exported to Pickle: {filepath}")
            return

        # Every remaining path writes a CSV; these are the default settings
        csv_kwargs = {"index": False}

        if export_format == "1":
            print("\nCSV Export Options:")
            # Menu text now matches what the default actually does (index=False)
            print("1: Default settings (comma separator, no index)")
            print("2: Custom settings")

            csv_option = input("Choose CSV export option (1 or 2): ").strip()

            if csv_option == "2":
                sep = input("Enter separator character [default: ,]: ").strip() or ","
                if sep == r"\t":
                    # strip() removes a literal tab, so accept the typed escape instead
                    sep = "\t"
                include_index = input("Include row indices? (yes/no) [default: no]: ").strip().lower() in ["yes", "y"]
                include_header = input("Include column headers? (yes/no) [default: yes]: ").strip().lower() not in ["no", "n"]
                csv_kwargs = {"sep": sep, "index": include_index, "header": include_header}
        else:
            print("Invalid format choice. Defaulting to CSV.")

        filepath = output_dir / f"{filename}.csv"
        df.to_csv(filepath, **csv_kwargs)
        print(f"✅ Data exported to CSV: {filepath}")

    except Exception as e:
        print(f"❌ Error during export: {e}")

# Interactive aggregation loop: yields the resulting frame plus a flag telling
# whether aggregation took place
df_aggregated, aggregation_was_performed = run_aggregation_cycles(
    df, input_path, output_path
)

# Export only when the flag reports that aggregation happened
if not aggregation_was_performed:
    print("\nNo data aggregation was performed. Skipping the export step.")
else:
    export_data(df_aggregated, output_path)


=== Data Aggregation Setup ===

Step 1: Select columns to group by (e.g., region, customer_type)
Available columns:
1: Date
2: Year
3: Month
4: Day
5: Route
6: Route_id
7: Start_time
8: end_time
9: time
10: distance



Enter column numbers to group by (comma-separated) or 'cancel' to skip:  6


Selected groupby columns: Route_id

Step 2: Select columns to aggregate and their aggregation methods
Available columns:
1: Year
2: Month
3: Day
4: Route
5: Route_id
6: time
7: distance



Enter column numbers to aggregate (comma-separated):  6,7


Selected aggregation columns: time, distance

Available aggregation methods:
1: sum - Sum of values
2: mean - Average of values
3: median - Median of values
4: min - Minimum value
5: max - Maximum value
6: count - Count of values
7: std - Standard deviation


Choose aggregation method(s) for 'time' (comma-separated numbers):  1


For 'time', will calculate: sum


Choose aggregation method(s) for 'distance' (comma-separated numbers):  1


For 'distance', will calculate: sum

=== Performing Aggregation ===

Where should the result of time_sum be stored?
1: Create a new column
2: Use an existing column


Enter choice (1 or 2):  1
Enter new column name [default: time_sum]:  



Where should the result of distance_sum be stored?
1: Create a new column
2: Use an existing column


Enter choice (1 or 2):  1
Enter new column name [default: distance_sum]:  


Calculating time_sum and storing in time_sum...
Calculating distance_sum and storing in distance_sum...

✅ Aggregation complete. Added 2 aggregated values to the original dataset.

Updated data preview:
         Date  Year  Month  Day  Route  Route_id           Start_time  \
0  2025-03-17  2025      3   17    102         2  2025-05-16 09:00:00   
1  2025-03-18  2025      3   18    202         2  2025-05-16 09:34:00   
2  2025-03-19  2025      3   19    302         2  2025-05-16 08:22:00   
3  2025-03-20  2025      3   20    402         2  2025-05-16 08:32:00   
4  2025-03-21  2025      3   21    502         2  2025-05-16 07:30:00   

              end_time   time  distance  time_sum  distance_sum  
0  2025-05-16 19:25:00  10.42    115.00     47.77        361.73  
1  2025-05-16 19:02:00   9.47     53.19     47.77        361.73  
2  2025-05-16 17:36:00   9.23     49.32     47.77        361.73  
3  2025-05-16 18:01:00   9.48     43.14     47.77        361.73  
4  2025-05-16 16:40:00   9.1


Do you want to run another aggregation cycle with different groupby variables? (yes/no):  no


Completing aggregation process.

=== Data Export ===


Enter output filename (without extension) [default: aggregated_data]:  work_time_and_km_clean_aggregated



Choose export format:
1: CSV (.csv) - Readable text format, compatible with Excel and other tools
2: Pickle (.pkl) - Preserves data types, faster for loading back into Python


Enter format choice (1 or 2):  1



CSV Export Options:
1: Default settings (comma separator, include index)
2: Custom settings


Choose CSV export option (1 or 2):  1


✅ Data exported to CSV: C:\Users\User\Dropbox\Personal\CareerFoundry\06 Sourcing data\Notebook folder\02 Data\01_processed_data\02_agregated_data\work_time_and_km_clean_aggregated.csv


## Cell 10: Summary and Conclusion

In [9]:
# Recap of the run: input file, frame sizes, and how many columns were added
print("\n=== Processing Summary ===")
print(f"Input file: {file_path.name}")

if not aggregation_was_performed:
    # Nothing was aggregated: report the loaded data only
    print(f"Data loaded: {len(df.index)} rows × {len(df.columns)} columns")
    print("No aggregation was performed.")
else:
    # Compare the frame before and after aggregation
    print(f"Original data: {len(df.index)} rows × {len(df.columns)} columns")
    print(f"Final data after aggregation: {len(df_aggregated.index)} rows × {len(df_aggregated.columns)} columns")
    print(f"Number of new columns added: {len(df_aggregated.columns) - len(df.columns)}")

print("\n✅ Processing complete.")


=== Processing Summary ===
Input file: work_time_and_km_clean.csv
Original data: 20 rows × 10 columns
Final data after aggregation: 20 rows × 12 columns
Number of new columns added: 2

✅ Processing complete.
