In [1]:
import pickle
import pandas as pd
import math
from itertools import combinations

from src.data_load import load_tables, load_artificial_instance
from src.distance_utils import distance

from itertools import product

# Data folder and name of the artificial instance used throughout the notebook.
data_path = '../data'
instance = 'inst3'

# load_tables hands back five objects; within this cell only labors_raw_df
# is passed on to the next call.
(
    directorio_df,
    labors_raw_df,
    cities_df,
    duraciones_df,
    valid_cities,
) = load_tables(data_path, generate_labors=False)

# Labors for the selected instance (the loader receives the raw labors table).
labors_real_df = load_artificial_instance(data_path, instance, labors_raw_df)

In [2]:
def compute_transport_distances(labors_real_df, method="osrm", timeout=5):
    """
    Compute pairwise transport distances between all distinct addresses that
    appear (as start or end) within the same (city, schedule_date) group.

    Every ordered pair of distinct address ids found in a group is passed to
    ``distance()`` once; the result is stored under two keys, the pair of ids
    and the pair of point strings.

    NOTE(review): a later cell of this notebook defines another function with
    the same name; once that cell has run, it replaces this definition.

    Parameters
    ----------
    labors_real_df : pd.DataFrame
        DataFrame with transport labors (must include city, schedule_date,
        start_address_id, end_address_id, start_address_point, end_address_point).
        The frame is not modified.
    method : str, default "osrm"
        Distance computation method, forwarded unchanged to ``distance()``.
    timeout : int, default 5
        Timeout forwarded unchanged to ``distance()``.

    Returns
    -------
    dict
        Keys include both (start_id, end_id) and (start_point, end_point);
        values are whatever ``distance()`` returns for that pair.
    """
    dist_dict = {}

    # Convert on a local Series: assigning back into the argument would
    # silently change the caller's DataFrame.
    schedule_dates = labors_real_df["schedule_date"]
    if not pd.api.types.is_datetime64_any_dtype(schedule_dates):
        schedule_dates = pd.to_datetime(schedule_dates)

    # --- Group by city + calendar day ---
    grouped = labors_real_df.groupby(["city", schedule_dates.dt.date])

    for _group_key, df_sub in grouped:
        # --- Collect unique nodes (ids + points) ---
        # Both halves must share column names BEFORE concatenation; otherwise
        # the end addresses land in separate columns and show up as NaN nodes.
        start_nodes = df_sub[["start_address_id", "start_address_point"]].rename(
            columns={"start_address_id": "address_id", "start_address_point": "address_point"}
        )
        end_nodes = df_sub[["end_address_id", "end_address_point"]].rename(
            columns={"end_address_id": "address_id", "end_address_point": "address_point"}
        )
        unique_nodes = (
            pd.concat([start_nodes, end_nodes], ignore_index=True)
            .dropna()
            .drop_duplicates()
        )
        nodes_list = unique_nodes.to_dict("records")

        # --- Compute all ordered pairwise distances ---
        for n1, n2 in product(nodes_list, repeat=2):
            if n1["address_id"] == n2["address_id"]:
                continue  # skip self-distance

            sid, sp = n1["address_id"], n1["address_point"]
            did, dp = n2["address_id"], n2["address_point"]

            if (sid, did) in dist_dict:
                continue  # already computed (possibly in an earlier group)

            d = distance(sp, dp, method=method, timeout=timeout)

            # Store under both key types
            dist_dict[(sid, did)] = d
            dist_dict[(sp, dp)] = d

    return dist_dict


In [3]:
# Distance dictionary for method='Manhattan'.
# NOTE(review): the docstring of compute_transport_distances only lists
# "osrm" and "haversine" — confirm that 'Manhattan' is a supported method.
manhattan_distances = compute_transport_distances(
    labors_real_df,
    method='Manhattan',
)

# Pickle the dictionary under the folder of the current instance.
manhattan_pkl_path = f'{data_path}/data_clean/artif_col_inst/{instance}/manhattan_dist_dict.pkl'
with open(manhattan_pkl_path, "wb") as f:
    pickle.dump(manhattan_distances, f)

In [None]:
# Distance dictionary for method='osrm'.
real_distances = compute_transport_distances(
    labors_real_df,
    method='osrm',
)

# Pickle the dictionary under the folder of the current instance.
real_pkl_path = f'{data_path}/data_clean/artif_col_inst/{instance}/real_dist_dict.pkl'
with open(real_pkl_path, "wb") as f:
    pickle.dump(real_distances, f)

## New distance computation (OSRM table API with haversine fallback)

In [2]:
import pandas as pd
import numpy as np
import requests
import math

# Base URL of the OSRM "table" endpoint; the ';'-joined "lon,lat" coordinate
# list and the query string are appended to it when a request is built.
# NOTE(review): the output recorded in this notebook shows a 403 Forbidden
# response from this address — confirm that an OSRM backend is what is
# listening on localhost:5000.
OSRM_URL = "http://localhost:5000/table/v1/driving/"

def parse_point(p: str):
    """
    Parse a WKT-style point string into ``(lat, lon)``.

    Accepts both ``'POINT (lon lat)'`` and ``'POINT(lon lat)'`` (the keyword
    is matched case-insensitively, extra whitespace is ignored) as well as a
    bare ``'lon lat'`` string. Only the first two numbers are used.

    Parameters
    ----------
    p : str
        Point description with longitude first, latitude second.

    Returns
    -------
    tuple
        ``(lat, lon)`` as floats, or ``(None, None)`` when ``p`` is not a
        string, has fewer than two numeric fields, or holds a non-finite
        coordinate (nan/inf).
    """
    if not isinstance(p, str):
        return None, None

    text = p.strip()
    if text[:5].upper() == "POINT":
        text = text[5:]

    # Treat parentheses as separators so the spacing around them is irrelevant.
    fields = text.replace("(", " ").replace(")", " ").split()
    if len(fields) < 2:
        return None, None

    try:
        lon, lat = float(fields[0]), float(fields[1])
    except ValueError:
        return None, None

    # float() happily parses "nan"/"inf"; those are not usable coordinates.
    if not (math.isfinite(lon) and math.isfinite(lat)):
        return None, None

    return lat, lon


def haversine(lat1, lon1, lat2, lon2):
    """
    Haversine distance between two points given in decimal degrees.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance in km, computed with a sphere radius of 6371.
    """
    earth_radius_km = 6371

    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    delta_lat = math.radians(lat2 - lat1)
    delta_lon = math.radians(lon2 - lon1)

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    lat_term = math.sin(delta_lat / 2) ** 2
    lon_term = math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2) ** 2
    hav = lat_term + lon_term

    half_angle = math.atan2(math.sqrt(hav), math.sqrt(1 - hav))
    return 2 * earth_radius_km * half_angle


def _osrm_distance_block(src_coords, dst_coords, timeout):
    """Fetch one sources x destinations block of distances (metres) from OSRM.

    ``dst_coords=None`` means "destinations are the same points as the
    sources"; the request then carries the coordinates once and no
    ``sources``/``destinations`` parameters.
    """
    if dst_coords is None:
        all_coords = list(src_coords)
        query = "?annotations=distance"
    else:
        all_coords = list(src_coords) + list(dst_coords)
        n_src = len(src_coords)
        sources = ";".join(str(k) for k in range(n_src))
        destinations = ";".join(str(k) for k in range(n_src, len(all_coords)))
        query = f"?annotations=distance&sources={sources}&destinations={destinations}"

    coord_str = ";".join(f"{lon},{lat}" for lon, lat in all_coords)
    r = requests.get(OSRM_URL + coord_str + query, timeout=timeout)
    r.raise_for_status()
    return r.json()["distances"]


def compute_transport_distances(labors_real_df, method="osrm", timeout=5, chunk_size=50):
    """
    Compute the distance of every unique (start, end) address pair found in
    ``labors_real_df`` using the OSRM table API, with a haversine fallback.

    The unique addresses are split into chunks of ``chunk_size`` points. For
    every (source chunk, destination chunk) combination that contains at
    least one requested pair, one table request is sent, so pairs whose two
    endpoints fall in different chunks also get an OSRM distance. Any pair
    left without an OSRM value (method other than "osrm", failed request,
    ``null`` entry in the response) falls back to ``haversine``.

    Parameters
    ----------
    labors_real_df : pd.DataFrame
        Must contain start_address_id, end_address_id, start_address_point
        and end_address_point. Rows with a missing value in any of these
        columns are ignored.
    method : str, default "osrm"
        "osrm" queries the table service; any other value skips the requests,
        so every distance comes from the haversine fallback.
    timeout : int, default 5
        Timeout passed to each HTTP request.
    chunk_size : int, default 50
        Maximum number of points per source/destination chunk (a request for
        two different chunks carries up to ``2 * chunk_size`` coordinates).
        Values below 1 are treated as 1.

    Returns
    -------
    dict: {(start_id, end_id): distance_km}
        The value is NaN when neither OSRM nor the fallback produced a distance.
    """
    dist_dict = {}

    # Extract unique pairs
    pairs = labors_real_df[
        ['start_address_id', 'end_address_id',
         'start_address_point', 'end_address_point']
    ].dropna().drop_duplicates()
    pair_rows = list(zip(
        pairs['start_address_id'], pairs['end_address_id'],
        pairs['start_address_point'], pairs['end_address_point'],
    ))

    # Build unique node list (first parsable point seen for an id wins)
    unique_points = {}
    unparsable = {}
    for sid, did, start_point, end_point in pair_rows:
        for aid, point in ((sid, start_point), (did, end_point)):
            if aid in unique_points:
                continue
            lat, lon = parse_point(point)
            if lat is not None and lon is not None:
                unique_points[aid] = (lon, lat)  # OSRM expects lon,lat
            else:
                unparsable[aid] = None
    # Report each id once, and only if no later row supplied a valid point.
    bad_ids = [aid for aid in unparsable if aid not in unique_points]

    if bad_ids:
        print(f"⚠️ Skipped {len(bad_ids)} ids with invalid coordinates: {bad_ids[:5]}...")

    ids = list(unique_points.keys())
    coords = [unique_points[i] for i in ids]
    id_to_idx = {i: idx for idx, i in enumerate(ids)}

    # Distance matrix in km; NaN marks "no OSRM value".
    n = len(coords)
    distances = np.full((n, n), np.nan)

    if method == "osrm" and n > 0:
        step = max(1, int(chunk_size))

        # Only fetch the blocks that hold at least one requested pair.
        needed_blocks = sorted({
            (id_to_idx[sid] // step, id_to_idx[did] // step)
            for sid, did, _, _ in pair_rows
            if sid in id_to_idx and did in id_to_idx
        })

        for src_block, dst_block in needed_blocks:
            src_start, dst_start = src_block * step, dst_block * step
            src_coords = coords[src_start:src_start + step]
            dst_coords = None if src_block == dst_block else coords[dst_start:dst_start + step]
            try:
                block = _osrm_distance_block(src_coords, dst_coords, timeout)

                # Fill global matrix
                for local_i, block_row in enumerate(block):
                    for local_j, metres in enumerate(block_row):
                        if metres is not None:
                            distances[src_start + local_i, dst_start + local_j] = metres / 1000.0  # to km
            except Exception as e:
                # Best effort: affected pairs fall back to haversine below.
                print(
                    f"⚠️ OSRM table failed for chunk {src_start}:{src_start+step} "
                    f"x {dst_start}:{dst_start+step}, reason={e}"
                )

    # Build final dictionary with fallback
    for sid, did, start_point, end_point in pair_rows:
        if sid not in id_to_idx or did not in id_to_idx:
            print(f"⚠️ Missing mapping for pair ({sid}, {did}) – skipping.")
            continue

        d = distances[id_to_idx[sid], id_to_idx[did]]

        if np.isnan(d):  # fallback to haversine
            lat1, lon1 = parse_point(start_point)
            lat2, lon2 = parse_point(end_point)
            if None not in (lat1, lon1, lat2, lon2):
                d = haversine(lat1, lon1, lat2, lon2)
            else:
                d = float("nan")

        dist_dict[(sid, did)] = float(d)

    return dist_dict

# Distances for the loaded instance via OSRM, with 100 points per chunk.
dist_dict = compute_transport_distances(
    labors_real_df,
    method='osrm',
    timeout=5,
    chunk_size=100,
)

⚠️ OSRM table failed for chunk 0:100, reason=403 Client Error: Forbidden for url: http://localhost:5000/table/v1/driving/-75.56991479999999,6.2276503;-75.57907089999999,6.25308;-75.55954,6.213297000000001;-75.557794,6.322076999999998;-75.564082,6.2082589;-75.545807,6.1747933;-75.56115299999999,6.1790162;-75.57566829999999,6.2153768;-75.57430579999999,6.2025602;-75.55785159999999,6.1829066;-75.5717335,6.2027526;-75.6010649,6.1622762;-75.5735983,6.1572705;-75.56979539999999,6.2198631;-75.5596249,6.319155499999999;-75.5763387,6.2430981;-75.5856104,6.183108199999999;-75.557857,6.331759;-75.55964929999999,6.2436608;-75.5754326,6.222050900000001;-75.5697201,6.2226518;-75.5912002,6.270564799999999;-75.61999899999999,6.173254699999999;-75.570449,6.2322998;-75.5601715,6.325476999999998;-75.5700854,6.2310458;-75.5653105,6.2013119;-75.5720821,6.230957699999999;-75.4315025,6.4003481;-75.5719103,6.1663031;-75.5814053,6.214877200000001;-75.5799287,6.255081;-75.6011212,6.2626847;-75.5856515,6.1834397