# Customer Clustering Using K-Means for Route Optimization

This section performs customer clustering based on geographical coordinates using the K-Means algorithm.
The goal is to group nearby customers together to support optimized delivery routes in Estonia.

---

## Cell 1: Imports and Initial Setup

In [92]:
import os
import sys
import pandas as pd
import numpy as np
import chardet
from pathlib import Path
import json
import time
import re
import math
import gc
import warnings
import logging
import random
import requests
from datetime import datetime
import time
from tqdm import tqdm
import threading

# Keep notebook output readable by hiding non-critical warnings
warnings.filterwarnings(action='ignore')

# Timestamped INFO-level logging shared by the route-optimization cells
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('route_optimization')

print("✅ Imports complete")

✅ Imports complete


## Cell 2: Set up project paths and folders

In [94]:
def setup_project():
    """Locate the processed-data folder used for both input and output.

    Both paths point at ``<parent of the current working directory>/02 Data/
    Processed_data``. The folder must already exist as the input location;
    otherwise an error is printed and the process exits with status 1.

    Returns:
        tuple: ``(input_path, output_path)`` as ``pathlib.Path`` objects.
    """
    data_root = Path.cwd().parent  # data lives next to the working directory
    input_path = data_root.joinpath('02 Data', 'Processed_data')
    output_path = data_root.joinpath('02 Data', 'Processed_data')

    if input_path.exists():
        # Make sure the output folder is present before anything writes to it
        output_path.mkdir(parents=True, exist_ok=True)
        print(f"Project setup complete. \n Input path: {input_path} \n Output path: {output_path}")
        return input_path, output_path

    # Nothing to read from: tell the user how to fix it and stop
    print(f"Error: Input directory '{input_path}' does not exist.")
    print("Please create this directory or modify the path.")
    sys.exit(1)

def load_api_key(file_path="api_keys.json"):
    """Load the HERE API key from a JSON file.

    Args:
        file_path: Path to a JSON file whose top-level object holds a
            ``"HERE_API_KEY"`` entry.

    Returns:
        The value stored under ``"HERE_API_KEY"``, or ``None`` when the file
        cannot be read or parsed, is not a JSON object, or holds no non-empty
        key. A warning is printed in every ``None`` case; nothing is raised.
    """
    try:
        # utf-8-sig reads plain UTF-8 and also drops a leading byte-order
        # mark, which json.load would otherwise reject as invalid JSON.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            api_keys = json.load(f)
    except Exception as e:
        # Deliberately best-effort: a missing/broken key file must not stop
        # the run, the caller switches to fallback methods on None.
        print(f"⚠️ Error loading API key: {e}")
        return None

    # Valid JSON that is a list/string/number has no keys to look up
    if not isinstance(api_keys, dict):
        print("⚠️ Error loading API key: expected a JSON object at the top level")
        return None

    api_key = api_keys.get("HERE_API_KEY")
    if not api_key:
        print("⚠️ No HERE API key found in the JSON file")
        return None
    return api_key

# Quick smoke test of the helpers defined in this cell
if __name__ == "__main__":
    print("Testing project setup functions...")
    # Comment this section out to only define the functions without running them
    api_key = load_api_key()
    if not api_key:
        print("⚠️ No API key found. Will use fallback methods.")
    else:
        print("✅ API key loaded successfully")

Testing project setup functions...
✅ API key loaded successfully


## Cell 3: Data Loading Functions

In [96]:
def _preview_delimiters(file_path, encoding, show):
    """Parse the first rows of *file_path* with each candidate delimiter.

    Args:
        file_path: CSV file to probe.
        encoding: Text encoding passed to ``pd.read_csv``.
        show: Callable used to render each preview DataFrame.

    Returns:
        dict: 1-based option number -> ``(delimiter, column_count)`` for every
        candidate that could be parsed. Failed candidates are reported and
        left out, so the numbers are not necessarily contiguous.
    """
    # A real tab character rather than the two-character string r'\t':
    # pandas treats multi-character separators as regular expressions, which
    # ignore quoting and cannot be handled by the fast C parser.
    candidates = [',', ';', '\t', '|']
    options = {}
    for number, delim in enumerate(candidates, start=1):
        # Only the parse itself is guarded, so a rendering problem is never
        # misreported as a delimiter problem.
        try:
            preview_df = pd.read_csv(file_path, engine='python', encoding=encoding, sep=delim, nrows=3)
        except Exception as e:
            print(f"{number}: Error with delimiter {delim!r}: {e}")
            continue
        col_count = len(preview_df.columns)
        options[number] = (delim, col_count)
        # {delim!r} keeps the tab visible as '\t' in the printed output
        print(f"{number}: Delimiter {delim!r}\n   Found {col_count} columns")
        print(f"   Preview with option {number}:")
        show(preview_df.head(3))
        print("-" * 80 + "\n")
    return options


def load_data(input_path):
    """Interactively choose, parse and load a customer CSV file.

    The user picks one of the ``*.csv`` files in *input_path*, the encoding is
    detected with chardet, each candidate delimiter is previewed, and the user
    confirms the delimiter before the full file is loaded.

    Args:
        input_path: ``pathlib.Path`` of the directory holding the CSV files.

    Returns:
        tuple: ``(df, file_path)`` - the loaded DataFrame and the chosen file.

    Raises:
        SystemExit: With status 1 when the directory has no CSV files, no
            candidate delimiter can parse the file, or the full load fails.
    """
    # `display` only exists when running under IPython/Jupyter; fall back to
    # print so the function also works in a plain Python interpreter.
    try:
        show = display
    except NameError:
        show = print

    # Sorted so the option numbers are stable between runs (glob does not
    # guarantee any particular order).
    available_files = sorted(input_path.glob("*.csv"))
    if not available_files:
        print(f"No CSV files found in {input_path}")
        sys.exit(1)

    print("Available files:")
    for i, f in enumerate(available_files, start=1):
        print(f"{i}: {f.name}")

    # Prompt until a valid 1-based file number is entered
    file_count = len(available_files)
    while True:
        try:
            choice = int(input(f"Choose file number (1-{file_count}): ").strip()) - 1
        except ValueError:
            print("Please enter a valid number.")
            continue
        if 0 <= choice < file_count:
            break
        print(f"Please enter a number between 1 and {file_count}")

    file_path = available_files[choice]

    # Detect file encoding from the raw bytes of the whole file
    print(f"Detecting encoding for {file_path.name}...")
    with open(file_path, 'rb') as file:
        result = chardet.detect(file.read())
    encoding = result['encoding']
    confidence = result['confidence']
    print(f"Detected encoding: {encoding} (confidence: {confidence:.1%})")

    # Analyze delimiter options
    print("\nAnalyzing potential delimiters:\n")
    delimiter_options = _preview_delimiters(file_path, encoding, show)
    if not delimiter_options:
        print("No valid delimiters found. Please check the file format.")
        sys.exit(1)

    # Suggest the delimiter that splits the preview into the most columns
    suggested = max(delimiter_options, key=lambda k: delimiter_options[k][1])
    suggested_delim, suggested_cols = delimiter_options[suggested]
    print(f"Suggested option: {suggested} ({suggested_delim!r}) with {suggested_cols} columns")

    # List the option numbers that actually parsed: they may have gaps when a
    # candidate failed, so a "1-N" range would be misleading.
    valid_numbers = ", ".join(str(number) for number in delimiter_options)
    while True:
        answer = input(f"\nChoose delimiter option ({valid_numbers}) [default: {suggested}]: ").strip()
        if not answer:
            delim_choice = suggested
            break
        try:
            delim_choice = int(answer)
        except ValueError:
            print("Please enter a valid number or press Enter for default.")
            continue
        if delim_choice in delimiter_options:
            break
        print(f"Please enter one of {valid_numbers} or press Enter for default.")

    chosen_delim, _ = delimiter_options[delim_choice]
    print(f"Using delimiter: {chosen_delim!r}")

    # Load the full CSV with chosen delimiter and encoding
    try:
        df = pd.read_csv(file_path, encoding=encoding, sep=chosen_delim)
    except Exception as e:
        print(f"❌ Failed to load CSV: {e}")
        sys.exit(1)
    print(f"\n✅ Loaded {df.shape[0]} rows × {df.shape[1]} columns from {file_path.name}")

    # Display data overview (only the first five columns are spelled out)
    column_names = [str(name) for name in df.columns]
    extra_columns = len(column_names) - 5
    print("\nData Overview:")
    if extra_columns > 0:
        print(f"Column names: {', '.join(column_names[:5])}, ... (and {extra_columns} more columns)")
    else:
        print(f"Column names: {', '.join(column_names)}")
    print(f"\nData types (first 5 columns):\n{df.dtypes[:5]}")
    print(f"... (and {extra_columns} more columns)" if extra_columns > 0 else "")
    print("\nSample data:")
    show(df.head(3))
    print("-" * 80)

    return df, file_path