# Geocoding Workflow 🚀
This notebook:  
1. Loads raw address data  
2. Detects the correct delimiter and previews the file  
3. Geocodes all addresses with the HERE API (with caching & checkpointing)  
4. Lets you interactively fix low-confidence rows  
5. Optionally adds depot locations  
6. Exports the consolidated data set and an interactive Folium map  

> **Prerequisites**  
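> * `api_keys.json` in the notebook's working directory, containing `{"HERE_API_KEY": "YOUR_KEY"}`  
> * Input files placed in `../02 Data/01_processed_data` (relative to the working directory); the folder is created automatically and also receives the outputs  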

---

In [8]:
#!/usr/bin/env python3
"""
Geocoding Script: Load address data, geocode addresses using HERE API,
allow interactive correction of problematic addresses, and export results.
Includes support for depot locations for route planning.
"""

# stdlib
import json
import logging
import os
import sys
import time
from functools import lru_cache
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# third-party
import chardet
import folium
import pandas as pd
import requests
from folium.plugins import MarkerCluster
from tqdm import tqdm

# === Configuration ===
# HERE geocoding endpoint queried by geocode_address().
GEOCODE_URL = "https://geocode.search.hereapi.com/v1/geocode"
# Folder the script lists input files from and writes checkpoints, exports
# and the map into; created on first run if it does not exist yet.
DATA_FOLDER = Path.cwd().parent / '02 Data' / '01_processed_data'
os.makedirs(DATA_FOLDER, exist_ok=True)

# Load API key from api_keys.json (looked up in the current working directory)
API_KEYS_FILE = Path.cwd() / "api_keys.json"
try:
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(API_KEYS_FILE, "r", encoding="utf-8") as f:
        api_keys = json.load(f)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"api_keys.json not found at {API_KEYS_FILE}. Please create it with your HERE_API_KEY."
    ) from err
except json.JSONDecodeError as err:
    raise ValueError(f"Invalid JSON format in {API_KEYS_FILE}.") from err

# A file holding a JSON array/string/number parses fine but has no keys;
# treat that the same way as a missing key instead of failing on `.get`.
HERE_API_KEY = api_keys.get("HERE_API_KEY") if isinstance(api_keys, dict) else None
if not HERE_API_KEY:
    raise ValueError("HERE_API_KEY not found in api_keys.json.")

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('geocoding')

# === Utility Functions ===
def select_input_file(input_path: Path) -> Path:
    """Return the input file, taken from the command line or chosen interactively.

    Args:
        input_path: Folder whose regular files are offered in the menu.

    Returns:
        The path given as the first command-line argument when it points to an
        existing file; otherwise the file the user picked from the menu.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist, is empty, or
            contains no regular files to choose from.
        SystemExit: If the user enters 'q' at the prompt.
    """
    if not input_path.exists() or not any(input_path.iterdir()):
        raise FileNotFoundError(f"Input folder '{input_path}' is empty or does not exist.")

    # An existing file passed on the command line wins over the menu. Requiring
    # is_file() keeps a directory argument from being returned as "the file".
    if len(sys.argv) > 1 and Path(sys.argv[1]).is_file():
        return Path(sys.argv[1])

    # iterdir() yields entries in arbitrary order; sort so that the menu
    # numbers are stable between runs.
    files = sorted(
        (entry for entry in input_path.iterdir() if entry.is_file()),
        key=lambda entry: entry.name.lower(),
    )
    if not files:
        # Only sub-directories present: the menu would be empty and no
        # number could ever be accepted.
        raise FileNotFoundError(f"Input folder '{input_path}' contains no files.")

    print("Available files:")
    for i, f in enumerate(files, 1):
        print(f"{i}. {f.name}")
    while True:
        choice = input("\nEnter file number (or 'q' to quit): ").strip()
        if choice.lower() == 'q':
            sys.exit("Operation cancelled.")
        # isdecimal() only accepts characters int() can parse.
        if choice.isdecimal() and 1 <= int(choice) <= len(files):
            return files[int(choice) - 1]
        print("Invalid selection.")

def load_data(file_path: Path) -> pd.DataFrame:
    """Load a CSV or pickle file into a DataFrame.

    For CSV files the encoding is detected and the user picks the delimiter
    from the previews; the detected encoding is tried first, followed by a
    fixed list of fallbacks.

    Args:
        file_path: Path to a ``.csv`` or ``.pkl`` file (suffix is matched
            case-insensitively).

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If the suffix is unsupported, or if a CSV could not be
            read with any of the candidate encodings.
    """
    suffix = file_path.suffix.lower()

    if suffix == '.csv':
        encoding, previews = preview_csv_with_delimiters(file_path)
        delim = select_delimiter(previews)
        # dict.fromkeys() drops exact duplicates while keeping the order, so a
        # detected 'utf-8' that already failed is not tried a second time.
        candidates = dict.fromkeys([encoding, 'utf-8', 'latin1', 'ISO-8859-1', 'cp1252'])
        for enc in candidates:
            try:
                df = pd.read_csv(file_path, sep=delim, encoding=enc)
            except Exception as e:
                # Best effort: any read failure moves on to the next encoding.
                logger.error(f"Failed with encoding '{enc}': {e}")
                continue
            print(f"\n✅ Loaded '{file_path.name}' with encoding '{enc}' and delimiter '{delim}'")
            return df
        raise ValueError(f"Could not load '{file_path.name}' with any encoding.")

    if suffix == '.pkl':
        # NOTE(security): unpickling can execute arbitrary code. Only open
        # pickle files that this workflow (or another trusted source) wrote.
        return pd.read_pickle(file_path)

    raise ValueError(f"Unsupported file format: {file_path.suffix}")

def preview_csv_with_delimiters(csv_path: Path, delimiters: Optional[List[str]] = None) -> Tuple[str, Dict]:
    """Detect the file encoding and print a three-row preview per delimiter.

    Args:
        csv_path: CSV file to inspect.
        delimiters: Candidate separators to try. Defaults to comma,
            semicolon, tab and pipe.

    Returns:
        A tuple ``(encoding, previews)`` where ``previews`` maps each
        delimiter to its preview DataFrame, or to ``None`` when reading with
        that delimiter failed.
    """
    # Use None as the default instead of a shared mutable list.
    if delimiters is None:
        delimiters = [',', ';', '\t', '|']

    with open(csv_path, 'rb') as f:
        # Detection runs on the first 100 kB only to keep it fast.
        raw = f.read(100_000)
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    if encoding.lower() == 'ascii':
        # Only a sample was inspected; UTF-8 decodes every ASCII file
        # identically and does not break if non-ASCII bytes appear later.
        encoding = 'utf-8'
    print(f"\nDetected encoding: {encoding}")

    previews = {}
    for delim in delimiters:
        try:
            df = pd.read_csv(csv_path, sep=delim, encoding=encoding, nrows=3)
        except Exception as e:
            # A failing delimiter is reported and recorded, not fatal.
            print(f"\n❌ Delimiter '{delim}' failed: {e}")
            previews[delim] = None
        else:
            print(f"\n✅ Delimiter '{delim}':\n{df.to_string(index=False)}")
            previews[delim] = df
    return encoding, previews

def select_delimiter(previews: Dict) -> str:
    """Ask the user which delimiter to use, based on the printed previews.

    Args:
        previews: Mapping of delimiter -> preview (``None`` marks a delimiter
            whose preview failed and therefore cannot be chosen).

    Returns:
        The chosen delimiter.

    Raises:
        ValueError: If no delimiter has a usable preview; without this check
            the prompt could never accept an answer.
    """
    delimiters = list(previews.keys())
    if all(previews[delim] is None for delim in delimiters):
        raise ValueError("No delimiter produced a readable preview.")

    print("\nSelect delimiter:")
    for i, delim in enumerate(delimiters, 1):
        # repr() shows ',' exactly as before but makes a tab visible as '\t'.
        print(f"{i}. {delim!r}")
    while True:
        choice = input("Enter number: ").strip()
        # isdecimal() only accepts characters int() can parse.
        if choice.isdecimal() and 1 <= int(choice) <= len(delimiters):
            delim = delimiters[int(choice) - 1]
            if previews[delim] is not None:
                return delim
        print("Invalid choice.")

@lru_cache(maxsize=1000)
def _geocode_cached(
        address: str,
        api_key: str,
        country: str
) -> Tuple[Optional[float], Optional[float], Optional[str], Optional[float]]:
    """Run one HERE lookup and memoise its outcome.

    Both matches and "no results" answers are cached. Request errors are
    raised rather than returned, and lru_cache does not store raised
    exceptions, so a transient failure can be retried later.
    """
    # Append the country unless the address already mentions it.
    query = f"{address}, {country}" if country.lower() not in address.lower() else address
    response = requests.get(GEOCODE_URL, params={"q": query, "apiKey": api_key}, timeout=10)
    response.raise_for_status()
    items = response.json().get("items")
    if not items:
        logger.info(f"No results for address: {address}")
        return None, None, None, None
    match = items[0]  # only the first item of the response is used
    pos = match.get("position", {})
    return (
        pos.get("lat"),
        pos.get("lng"),
        match.get("address", {}).get("label"),
        match.get("scoring", {}).get("queryScore", 0),
    )


def geocode_address(
        address: str,
        api_key: str = HERE_API_KEY,
        country: str = "Estonia"
) -> Tuple[Optional[float], Optional[float], Optional[str], Optional[float]]:
    """Geocode an address using the HERE API.

    Args:
        address: Free-text address to look up.
        api_key: HERE API key sent with the request.
        country: Appended to the query when ``address`` does not already
            contain it (case-insensitive).

    Returns:
        ``(latitude, longitude, formatted_label, query_score)`` of the first
        result, or ``(None, None, None, None)`` when the address is blank,
        nothing matched, or the request failed. ``query_score`` is 0 when
        the response carries no score.
    """
    if not address.strip():
        logger.warning("Empty address provided.")
        return None, None, None, None
    try:
        return _geocode_cached(address, api_key, country)
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        # The exception text can contain the request URL, which carries the
        # API key as a query parameter; keep the key out of the logs.
        detail = str(e).replace(api_key, "***") if api_key else str(e)
        logger.error(f"Geocoding failed for '{address}': {detail}")
        return None, None, None, None


# The function used to be wrapped by lru_cache directly; keep its
# cache-management helpers reachable under the same name.
geocode_address.cache_clear = _geocode_cached.cache_clear
geocode_address.cache_info = _geocode_cached.cache_info

def geocode_dataframe(df: pd.DataFrame, address_col: str, file_label: str, batch_size: int = 100,
                      country: str = "Estonia") -> pd.DataFrame:
    """Geocode every row of ``df`` that has no latitude yet.

    The frame is modified in place: the columns ``latitude``, ``longitude``,
    ``formatted_address`` and ``geocode_confidence`` are created when missing
    and filled for each address that could be geocoded.

    Args:
        df: Data to geocode.
        address_col: Name of the column holding the address text.
        file_label: Input file name; its stem names the checkpoint file.
        batch_size: Write a checkpoint pickle after this many processed rows.
        country: Country passed to ``geocode_address`` for each lookup.

    Returns:
        The same DataFrame object, for convenience.

    Raises:
        ValueError: If ``address_col`` is not a column of ``df``.

    Note:
        The checkpoint is only written here; nothing in this function reads
        it back.
    """
    if address_col not in df.columns:
        raise ValueError(f"Column '{address_col}' not found.")
    result_cols = ["latitude", "longitude", "formatted_address", "geocode_confidence"]
    for col in result_cols:
        if col not in df.columns:
            df[col] = None

    to_geocode = df.index[df["latitude"].isna()]
    # Test for emptiness, not truthiness of the labels: Index.any() is False
    # when the only pending row carries the label 0.
    if to_geocode.empty:
        print("✅ All addresses geocoded.")
        return df

    print(f"🔄 Geocoding {len(to_geocode)} addresses...")
    checkpoint_path = DATA_FOLDER / f"{Path(file_label).stem}_checkpoint.pkl"
    # Count processed rows instead of relying on the index label, which need
    # not be an integer and need not be contiguous.
    for processed, idx in enumerate(tqdm(to_geocode, desc="Geocoding"), 1):
        raw = df.loc[idx, address_col]
        # Missing values would otherwise be sent to the API as the text "nan".
        addr = "" if pd.isna(raw) else str(raw).strip()
        if addr and addr != "-":
            lat, lon, fmt, conf = geocode_address(addr, country=country)
            # Compare with None: a coordinate of exactly 0.0 is valid.
            if lat is not None and lon is not None:
                df.loc[idx, result_cols] = lat, lon, fmt, conf
        if batch_size > 0 and processed % batch_size == 0:
            df.to_pickle(checkpoint_path)

    # Final write so the checkpoint also contains the last partial batch.
    df.to_pickle(checkpoint_path)
    return df

# Column layout of the depot table; also used for the empty "no depots" result.
_DEPOT_COLUMNS = [
    "depot_id", "depot_name", "depot_address",
    "latitude", "longitude", "formatted_address",
    "geocode_confidence", "is_depot"
]


def collect_depot_information(country: str) -> pd.DataFrame:
    """Interactively collect depots and geocode their addresses.

    The user may decline, press Enter at the count prompt, or enter 0; in all
    three cases an empty DataFrame with the depot columns is returned.

    Args:
        country: Country passed to ``geocode_address`` for each depot.

    Returns:
        One row per depot with the columns ``depot_id``, ``depot_name``,
        ``depot_address``, ``latitude``, ``longitude``, ``formatted_address``,
        ``geocode_confidence`` and ``is_depot`` (always True). A depot that
        could not be geocoded and was not retried is kept with empty
        coordinates.
    """
    print("\n=== Depot Information Collection ===")

    skip_depots = input("Do you want to add depot information? (y/n): ").strip().lower()
    if skip_depots != 'y':
        print("Skipping depot information collection.")
        return pd.DataFrame(columns=_DEPOT_COLUMNS)

    while True:
        num_depots_input = input("Enter number of depots (or press Enter to skip): ").strip()
        if not num_depots_input:  # User pressed Enter without typing
            print("Skipping depot information collection.")
            return pd.DataFrame(columns=_DEPOT_COLUMNS)
        try:
            num_depots = int(num_depots_input)
        except ValueError:
            print("Please enter a valid number.")
            continue
        if num_depots < 0:
            print("Number of depots must be non-negative.")
            continue
        if num_depots == 0:
            print("No depots to add.")
            return pd.DataFrame(columns=_DEPOT_COLUMNS)
        break

    depots = []
    for depot_id in range(1, num_depots + 1):
        # Reassigning the variable of a `for ... in range()` loop does not
        # repeat the iteration, so a retry needs its own inner loop.
        while True:
            depot_name = input(f"\nDepot #{depot_id} name: ").strip()
            depot_addr = input(f"Depot #{depot_id} address: ").strip()

            # Geocode the depot address
            print(f"Geocoding depot: {depot_name}...")
            lat, lon, formatted_addr, confidence = geocode_address(depot_addr, country=country)
            if lat is not None and lon is not None:
                break

            print(f"⚠️ Warning: Could not geocode address for depot '{depot_name}'")
            retry = input("Try again? (y/n): ").strip().lower()
            if retry != 'y':
                break  # keep the depot, without coordinates

        depots.append({
            "depot_id": depot_id,
            "depot_name": depot_name,
            "depot_address": depot_addr,
            "latitude": lat,
            "longitude": lon,
            "formatted_address": formatted_addr,
            "geocode_confidence": confidence,
            "is_depot": True  # Flag to identify depot rows
        })

        if lat is not None and lon is not None:
            print(f"✅ Geocoded: {formatted_addr} (Confidence: {confidence:.2f})")

    return pd.DataFrame(depots, columns=_DEPOT_COLUMNS)
    
# === Main Execution ===
def main():
    """Run the interactive workflow end to end.

    Steps: pick and load the input file, choose the address column, geocode,
    collect depots, offer corrections for problematic rows, export the
    combined customer + depot table, print a summary for the customer rows
    and optionally build the map.
    """
    file_path = select_input_file(DATA_FOLDER)
    df = load_data(file_path)
    print(f"\nPreview:\n{df.head(2).to_string(index=False)}")

    # NOTE(review): `country` is only forwarded to the depot lookup below;
    # customer addresses are geocoded with geocode_address's default country.
    country = input("Country (default 'Estonia'): ").strip() or "Estonia"
    addr_col = select_address_column(df)
    df = geocode_dataframe(df, addr_col, file_path.name)

    # Collect depot information
    depot_df = collect_depot_information(country)

    fix_problematic_addresses(df, addr_col)

    # Prepare combined data for export
    # Add is_depot flag to original dataframe (all False)
    if "is_depot" not in df.columns:
        df["is_depot"] = False

    # Give the customer rows the depot-only columns so the exported table has
    # the same layout whether or not depots were entered.
    for col in ["depot_id", "depot_name", "depot_address"]:
        if col not in df.columns:
            df[col] = None

    # Combine customer and depot dataframes. Concatenating an empty depot
    # frame adds no rows, so it is skipped when there are no depots.
    if depot_df.empty:
        combined_df = df.reset_index(drop=True)
    else:
        combined_df = pd.concat([df, depot_df], ignore_index=True)

    export_results(combined_df, file_path.name)
    print_summary(df)  # Only print summary for customer locations

    if input("Generate map with depots? (y/n): ").strip().lower() == 'y':
        create_geocoded_map_with_depots(combined_df, addr_col, file_path.name)

def select_address_column(df: pd.DataFrame) -> str:
    """Ask the user which text column contains the addresses.

    Args:
        df: Loaded input data.

    Returns:
        The name of the chosen column.

    Raises:
        ValueError: If ``df`` has no text column to offer; without this check
            the prompt could never accept an answer.
    """
    # is_string_dtype() on the dtype accepts object columns as well as
    # pandas' dedicated string dtypes.
    text_cols = [col for col in df.columns if pd.api.types.is_string_dtype(df[col].dtype)]
    if not text_cols:
        raise ValueError("No text columns available to use as the address column.")

    print("\nSelect address column:")
    for i, col in enumerate(text_cols, 1):
        # Show the first non-missing value; an empty column shows ''.
        non_null = df[col].dropna()
        example = non_null.iloc[0] if not non_null.empty else ""
        print(f"{i}. {col} (e.g. '{example}')")
    while True:
        choice = input(f"Choice (1-{len(text_cols)}): ").strip()
        # isdecimal() only accepts characters int() can parse.
        if choice.isdecimal() and 1 <= int(choice) <= len(text_cols):
            return text_cols[int(choice) - 1]
        print("Invalid choice.")

def fix_problematic_addresses(df: pd.DataFrame, addr_col: str, threshold: float = 0.8,
                              country: str = "Estonia"):
    """Let the user re-enter addresses that were not geocoded or scored low.

    A row is problematic when its latitude is missing or its
    ``geocode_confidence`` is below ``threshold``. Successful corrections are
    written back into ``df`` in place (address, coordinates, label, score).

    Args:
        df: Geocoded data containing ``latitude`` and ``geocode_confidence``.
        addr_col: Name of the column holding the address text.
        threshold: Scores strictly below this value count as low confidence.
        country: Country passed to ``geocode_address`` for each correction.
    """
    # Coerce to numbers so that missing scores become NaN (never "< threshold").
    confidence = pd.to_numeric(df["geocode_confidence"], errors="coerce")
    issues = df[(df["latitude"].isna()) | (confidence < threshold)].index
    if not len(issues):
        print("✅ No problematic addresses found.")
        return

    # Report the threshold that was actually applied.
    print(f"\nFound {len(issues)} addresses with confidence < {threshold} or not geocoded.")
    correct = input("Do you want to correct these addresses? (y/n): ").strip().lower()

    if correct != 'y':
        return

    print("\nFor each address:")
    print("- Enter a corrected address and press Enter to process it")
    print("- Enter '1' to skip this address")
    print("- Enter '2' to quit the correction process\n")

    for i, idx in enumerate(issues):
        addr = df.loc[idx, addr_col]
        customer = df.loc[idx, "Customer"] if "Customer" in df.columns else "Unknown"
        old_conf = df.loc[idx, "geocode_confidence"]
        status = "❌ Not geocoded" if pd.isna(old_conf) else f"⚠️ Low confidence ({old_conf:.2f})"

        print(f"\nAddress {i+1}/{len(issues)}: '{addr}'")
        print(f"Customer: {customer}")
        print(f"Status: {status}")

        response = input("New address (or 1=skip, 2=quit): ").strip()

        if response == '2':
            print("Quitting address correction.")
            break
        if response == '1' or not response:
            # A blank entry carries no address to look up; treat it as a skip.
            print("Skipping this address.")
            continue

        # User entered a new address
        lat, lon, fmt, new_conf = geocode_address(response, country=country)
        # Compare with None: a coordinate of exactly 0.0 is valid.
        if lat is not None and lon is not None:
            df.loc[idx, [addr_col,
                         "latitude",
                         "longitude",
                         "formatted_address",
                         "geocode_confidence"]
            ] = response, lat, lon, fmt, new_conf
            print(f"✅ Updated: {fmt} (Confidence: {new_conf:.2f})")
        else:
            print("❌ Could not geocode the new address.")

def export_results(df: pd.DataFrame, file_label: str):
    """Write ``df`` to the data folder as CSV or pickle, as chosen by the user.

    Args:
        df: Data to export.
        file_label: Input file name; its stem is the basis of the default
            output name used when the user leaves the filename blank.
    """
    # Ask until one of the two offered options is entered, so that a typo
    # does not silently select pickle.
    while True:
        format_choice = input("\nExport format: 1. CSV, 2. Pickle: ").strip()
        if format_choice in ('1', '2'):
            break
        print("Invalid choice.")
    file_format = 'csv' if format_choice == '1' else 'pkl'

    # A blank answer would otherwise create a file named just ".csv"/".pkl".
    filename = input("Output filename (no ext): ").strip() or f"{Path(file_label).stem}_geocoded"
    output_file = DATA_FOLDER / f"{filename}.{file_format}"
    if file_format == 'csv':
        df.to_csv(output_file, index=False)
    else:
        df.to_pickle(output_file)
    print(f"✅ Saved to {output_file}")

def create_geocoded_map_with_depots(df: pd.DataFrame, addr_col: str, file_label: str):
    """Create an interactive map with both customer locations and depots.

    Depots are drawn as individual red "home" markers; customers go into a
    marker cluster, coloured by confidence (green >= 0.8, orange >= 0.6,
    red otherwise). The map is saved as ``<stem of file_label>_map.html`` in
    the data folder and then opened in the default browser.

    Args:
        df: Combined table with ``latitude``, ``longitude``, ``is_depot`` and
            the depot/geocoding columns.
        addr_col: Name of the column holding the customer address text.
        file_label: Input file name; its stem names the map file.
    """
    import html
    import webbrowser

    def esc(value) -> str:
        # Popup text is rendered as HTML; escape values that originate from
        # the input file or from user prompts.
        return html.escape(str(value))

    valid = df[df["latitude"].notna() & df["longitude"].notna()]
    if valid.empty:
        print("No points to map.")
        return

    center = [valid["latitude"].mean(), valid["longitude"].mean()]
    m = folium.Map(location=center, zoom_start=10)

    # Rows whose flag is False or missing are treated as customers.
    depot_mask = valid["is_depot"].eq(True)

    # Add depots first (without clustering)
    for _, row in valid[depot_mask].iterrows():
        popup_text = (f"<b>DEPOT: {esc(row['depot_name'])}</b><br>"
                      f"{esc(row['depot_address'])}<br>"
                      f"{esc(row['formatted_address'])}<br>"
                      f"Conf: {row['geocode_confidence']:.2f}")

        folium.Marker(
            [row["latitude"], row["longitude"]],
            popup=popup_text,
            icon=folium.Icon(color='red', icon='home', prefix='fa')
        ).add_to(m)

    # Add customer locations with clustering
    cluster = MarkerCluster().add_to(m)
    for _, row in valid[~depot_mask].iterrows():
        try:
            address = row[addr_col] if addr_col in row and not pd.isna(row[addr_col]) else "No address"
            score = row["geocode_confidence"]
            if score >= 0.8:
                color = 'green'
            elif score >= 0.6:
                color = 'orange'
            else:
                color = 'red'

            folium.Marker(
                [row["latitude"], row["longitude"]],
                popup=f"{esc(address)}<br>{esc(row['formatted_address'])}<br>Conf: {score:.2f}",
                icon=folium.Icon(color=color)
            ).add_to(cluster)
        except Exception as e:
            # Best effort: one bad row must not abort the whole map.
            logger.error(f"Error adding marker: {e}")

    map_file = DATA_FOLDER / f"{Path(file_label).stem}_map.html"
    m.save(map_file)
    print(f"Map saved to {map_file}")
    # Optional: open in browser
    webbrowser.open(map_file.as_uri())

def print_summary(df: pd.DataFrame):
    """Print how many rows were geocoded and how their confidence is spread.

    Args:
        df: Geocoded data with ``latitude`` and ``geocode_confidence``.
    """
    total = len(df)
    successes = df["latitude"].notna().sum()
    # An empty frame has no meaningful rate; report 0 instead of dividing by zero.
    rate = successes / total * 100 if total else 0.0
    print(f"\n📊 Summary:\n- Total: {total}\n- Geocoded: {successes}\n- Rate: {rate:.1f}%")

    conf = pd.to_numeric(df["geocode_confidence"], errors="coerce").dropna()
    if conf.empty:
        return

    print("\n📈 Confidence:")
    # Left-closed bins: a score of 0 is counted, and 0.6 / 0.8 / 0.9 land in
    # the bucket that starts at them, matching the "< 0.8" and ">= 0.8"
    # thresholds used elsewhere in this file. The open upper edge keeps a
    # score of exactly 1.0 in the last bucket.
    buckets = pd.cut(
        conf,
        bins=[0, 0.6, 0.8, 0.9, float("inf")],
        labels=["<0.6", "0.6-0.8", "0.8-0.9", "0.9-1.0"],
        right=False,
    )
    for label, count in buckets.value_counts().items():
        print(f"- {label}: {count}")

# Start the interactive workflow only when this code runs as the main module
# (script execution or a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Available files:
1. 00_work_time_and_km_aggregated.csv
2. 01_weekly_deliveries_clean.csv
3. 02_weekly_deliveries_geocoded_ORG.csv
4. 03_1_depot_centered_clusters.csv
5. 04_optimo_routes.xlsx
6. 05_merged_data.csv
7. 06_clean_agregated_data.csv
8. 06_clean_agregated_data_for Tableau.csv
9. 90_weekly_deliveries_clean.csv
10. 90_work_time_and_km_clean_aggregated.csv
11. routes_map_here.html



Enter file number (or 'q' to quit):  2



Detected encoding: utf-8

✅ Delimiter ',':
 ABS Custumer no  Route Number                                                     Customer                       Full address Service  DeliveryQty  Net Weight
             123          2432                                                 HESBURGER AS    Sõpruse tee 15, Viimsi HARJUMAA     MAS          1.0        11.4
             124          3023 KOTKAPOJA TN 2A, KOTKJAPOJA TN  2F, KOTKAPOJA TN T4 KÜ (9400               Kotkapoja 2A TALLINN     MAS          2.0         3.4
             125          4024 LIBLE TN 1/2/3/4/5/6, LIBLE TN  2A, LIBLE TÄNAV KÜ (94156236 Lible tn 1, 2, 3, 4, 5, 6  TALLINN     MAS          6.0         3.4

✅ Delimiter ';':
                                       ABS Custumer no,Route Number,Customer,Full address,Service,DeliveryQty,Net Weight
                                                    123,2432,HESBURGER AS,"Sõpruse tee 15, Viimsi HARJUMAA",MAS,1.0,11.4
                124,3023,"KOTKAPOJA TN 2A, KOTKJAPOJA TN

Enter number:  1



✅ Loaded '01_weekly_deliveries_clean.csv' with encoding 'utf-8' and delimiter ','

Preview:
 ABS Custumer no  Route Number                                                     Customer                    Full address Service  DeliveryQty  Net Weight
             123          2432                                                 HESBURGER AS Sõpruse tee 15, Viimsi HARJUMAA     MAS          1.0        11.4
             124          3023 KOTKAPOJA TN 2A, KOTKJAPOJA TN  2F, KOTKAPOJA TN T4 KÜ (9400            Kotkapoja 2A TALLINN     MAS          2.0         3.4


Country (default 'Estonia'):  



Select address column:
1. Customer (e.g. 'HESBURGER AS')
2. Full address (e.g. 'Sõpruse tee 15, Viimsi HARJUMAA')
3. Service (e.g. 'MAS')


Choice (1-3):  2


🔄 Geocoding 885 addresses...


Geocoding:  25%|█████████████████                                                    | 219/885 [01:05<03:31,  3.14it/s]INFO:geocoding:No results for address: Estonia pst 9/ÕHTUNE VAHETUS Tallinn
Geocoding:  65%|████████████████████████████████████████████▌                        | 571/885 [02:53<01:05,  4.81it/s]INFO:geocoding:No results for address: Padriku tee 4/1,2,3,4,5,6:7 TALLINN
Geocoding: 100%|█████████████████████████████████████████████████████████████████████| 885/885 [04:19<00:00,  3.41it/s]



=== Depot Information Collection ===


Do you want to add depot information? (y/n):  y
Enter number of depots (or press Enter to skip):  4

Depot #1 name:  Production LOO
Depot #1 address:  Kuusiku tee 28, Loo, 74201 Harju maakond, Estonia


Geocoding depot: Production LOO...
✅ Geocoded: Kuusiku tee 26, Loo, Jõelähtme, 74201 Harju Maakond, Eesti (Confidence: 0.99)



Depot #2 name:  Production TARTU
Depot #2 address:  Vabriku tn 7, Vahi, 60534 Tartu maakond, Estonia


Geocoding depot: Production TARTU...
✅ Geocoded: Vabriku 7, Vahi, Tartu vald, 60534 Tartu Maakond, Eesti (Confidence: 0.89)



Depot #3 name:  Warehouse PÄRNU
Depot #3 address:  Kõrtsi tn 7, Pärnu, 80010 Pärnu maakond, Estonia


Geocoding depot: Warehouse PÄRNU...
✅ Geocoded: Kõrtsi 7, Pärnu, 80034 Pärnu Maakond, Eesti (Confidence: 0.91)



Depot #4 name:  Warehouse JÕHVI
Depot #4 address:  Linda tänav 15f, Jõhvi, 41536 Ida-Viru maakond, Estonia


Geocoding depot: Warehouse JÕHVI...
✅ Geocoded: Linda 15f, Jõhvi, 41536 Ida-Viru Maakond, Eesti (Confidence: 1.00)

Found 69 addresses with confidence < 0.9 or not geocoded.


Do you want to correct these addresses? (y/n):  n

Export format: 1. CSV, 2. Pickle:  1
Output filename (no ext):  02_weekly_deliveries_geocoded


✅ Saved to C:\Users\User\Dropbox\Personal\CareerFoundry\06 Sourcing data\Notebook folder\02 Data\01_processed_data\02_weekly_deliveries_geocoded.csv

📊 Summary:
- Total: 885
- Geocoded: 880
- Rate: 99.4%

📈 Confidence:
- 0.9-1.0: 696
- 0.8-0.9: 117
- 0.6-0.8: 61
- <0.6: 6


Generate map with depots? (y/n):  n
