In [None]:

# Metrics-only pipeline (requests + responses + Excel overview), no predictions
# 1) Group by (RouteId, Date) from ModifiedQueryRows.xlsx
# 2) Identify: first morning CreateSequence, final pre-departure EstimateTime
# 3) Parse both request files to get coords & time windows; parse both response files to get sequences
# 4) Compute metrics: removed/added, order-change (footrule, Kendall tau), position-shift stats,
#    relative path lengths and % change, optional time-window stratifications.
#
# NOTE: For now distances are in normalized units. % change is still meaningful.

import re, json, math, pandas as pd, numpy as np

def parse_request_coords(path):
    """Read a request file and return ``{task_id: (latitude, longitude)}``.

    The content is first parsed as JSON, expecting a top-level ``tasks`` list
    whose items carry ``id`` and ``address.latitude`` / ``address.longitude``.
    If the content is not valid JSON it is treated as flattened text and
    scanned for ``id <n> address latitude <x> longitude <y>`` runs instead.

    Args:
        path: Location of the request file (opened as UTF-8 text).

    Returns:
        dict mapping each int task id to a ``(float, float)`` tuple.

    Raises:
        KeyError: if the content is valid JSON but lacks the expected keys.
    """
    # Read the text exactly once. json.load() consumes the stream, so a
    # second f.read() in the fallback would always see an empty string.
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Single backslashes inside the raw string: \s and \d must be regex
        # classes, not a literal backslash followed by a letter.
        flat_task = r'id\s+(\d+)\s+address\s+latitude\s+([\d\.\-]+)\s+longitude\s+([\d\.\-]+)'
        return {int(m.group(1)): (float(m.group(2)), float(m.group(3)))
                for m in re.finditer(flat_task, text)}

    tasks = data['tasks']
    return {int(t['id']): (float(t['address']['latitude']), float(t['address']['longitude']))
            for t in tasks}

def parse_response_ids(path):
    """Return the task ids listed in a response file, in file order.

    Only lines that consist purely of digits once surrounding whitespace is
    stripped are kept; every other line is ignored.

    Args:
        path: Location of the response file (opened as UTF-8 text).

    Returns:
        list of int ids, one per qualifying line.
    """
    ids = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            token = raw_line.strip()
            if not token.isdigit():
                continue
            ids.append(int(token))
    return ids

def path_length(seq, coords):
    """Sum the straight-line leg lengths along ``seq``.

    Each consecutive pair of ids in ``seq`` is looked up in ``coords``; a leg
    contributes ``hypot`` of the coordinate differences. Legs where either
    endpoint has no (truthy) entry in ``coords`` are skipped.

    Args:
        seq: Ordered sequence of ids.
        coords: Mapping id -> 2-item coordinate pair.

    Returns:
        float total, in the same units as the coordinate values.
    """
    length = 0.0
    for here_id, next_id in zip(seq, seq[1:]):
        here, there = coords.get(here_id), coords.get(next_id)
        if not (here and there):
            continue  # unknown endpoint: leg is left out of the total
        length += math.hypot(here[0] - there[0], here[1] - there[1])
    return length

def metrics_for_pair(req_first_json, resp_first_txt, req_final_json, resp_final_txt):
    """Compare the first and the final plan of one route-day.

    Coordinates come from the two request files, visiting order from the two
    response files. Order-change metrics are computed on the ids present in
    both sequences, using each id's index in its own full sequence (if an id
    is repeated, its last index is used).

    Args:
        req_first_json: Request file belonging to the first sequence.
        resp_first_txt: Response file holding the first sequence.
        req_final_json: Request file belonging to the final sequence.
        resp_final_txt: Response file holding the final sequence.

    Returns:
        dict with the keys ``removed_count``, ``added_count``,
        ``common_count``, ``footrule``, ``footrule_norm``,
        ``kendall_inversions``, ``kendall_tau_norm``, ``mean_abs_shift``,
        ``median_abs_shift``, ``max_abs_shift``, ``length_first_units``,
        ``length_final_units``, ``length_delta_units`` and
        ``length_pct_change`` (NaN when the first length is 0).
        When no id is common to both sequences all order metrics are 0.
    """
    coords_first = parse_request_coords(req_first_json)
    coords_final = parse_request_coords(req_final_json)
    seq_first = parse_response_ids(resp_first_txt)
    seq_final = parse_response_ids(resp_final_txt)

    set_first, set_final = set(seq_first), set(seq_final)
    removed = sorted(set_first - set_final)
    added   = sorted(set_final - set_first)
    common  = sorted(set_first & set_final)
    n = len(common)

    pos_first = {tid: i for i, tid in enumerate(seq_first)}
    pos_final = {tid: i for i, tid in enumerate(seq_final)}
    # Build the shift vector directly as a 1-D array so that an empty common
    # set yields an empty array instead of failing on 2-D column indexing.
    abs_shifts = np.array([abs(pos_first[tid] - pos_final[tid]) for tid in common],
                          dtype=np.int64)

    # Footrule
    # NOTE(review): n*(n-1)/2 is the maximum inversion count; the largest
    # footrule over a permutation of n items is floor(n^2/2), and the shifts
    # here use indexes in the full sequences, so footrule_norm can exceed 1.
    # Formula kept as-is - confirm the intended normalisation.
    footrule = float(abs_shifts.sum())
    max_footrule = n*(n-1)/2 if n > 1 else 1
    footrule_norm = footrule/max_footrule

    # Kendall tau: count pairs whose relative order differs between the plans
    perm = [pos_final[tid] for tid in sorted(common, key=lambda x: pos_first[x])]
    inv = sum(perm[i] > perm[j] for i in range(len(perm)) for j in range(i+1, len(perm)))
    max_inv = n*(n-1)//2 if n > 1 else 1
    kendall_norm = inv/max_inv

    # Path lengths
    L_first   = path_length(seq_first,  coords_first)
    L_final   = path_length(seq_final,  coords_final)
    L_delta   = L_final - L_first
    L_pct     = (L_delta/L_first*100) if L_first > 0 else float('nan')

    return {
        'removed_count': len(removed), 'added_count': len(added), 'common_count': n,
        'footrule': footrule, 'footrule_norm': footrule_norm,
        'kendall_inversions': int(inv), 'kendall_tau_norm': kendall_norm,
        # The `if n` guards keep mean/median/max from running on an empty array.
        'mean_abs_shift': float(abs_shifts.mean()) if n else 0.0,
        'median_abs_shift': float(np.median(abs_shifts)) if n else 0.0,
        'max_abs_shift': int(abs_shifts.max()) if n else 0,
        'length_first_units': L_first, 'length_final_units': L_final,
        'length_delta_units': L_delta, 'length_pct_change': L_pct,
    }


In [None]:

# 1) Build stable location_id
#    - either hash(round(lat, 6), round(lon, 6))
#    - or use a small-radius match by KDTree/DBSCAN if jitter exists

# 2) For each (RouteId, Date):
#    - parse request -> {taskId: (lat, lon), time_window}
#    - parse response -> sequence_first, sequence_final
#    - map each taskId -> location_id
#    - compute removed/added/common (on taskId AND on location_id for robustness)
#    - compute order-change metrics on the common set
#    - compute path length (units) + % change
#    - compute centroid and bbox of the day's stops

# 3) Across days per RouteId:
#    - group by RouteId; compute centroid drift, bbox drift, Jaccard overlap of location_id sets
#    - aggregate per-location stats: days_seen, removal_rate, avg_abs_shift, etc.


In [None]:

# # file: build_route_metrics.py
# # Python 3.9+  (uses: pandas, numpy, pathlib)
# # Scans data/requests and data/responses for depot 0521
# # Pairs first/final (and optional intermediates) per (RouteId, Date)
# # Computes metrics and writes consolidated CSVs
# # ------------------------------------------------------------

# import re, json, math, sys
# from pathlib import Path
# from datetime import datetime, time
# import pandas as pd
# import numpy as np

# # =========================
# # Configuration
# # =========================
# BASE_DIR = Path("data")                # adjust if your repo layout differs
# REQUESTS_DIR = BASE_DIR / "requests"
# RESPONSES_DIR = BASE_DIR / "responses"
# EXCEL_OVERVIEW = BASE_DIR / "ModifiedQueryRows.xlsx"  # optional but recommended

# DEPOT_PREFIX = "0521_"                 # only process route folders whose name starts with this
# MORNING_START = time(5, 0, 0)          # 05:00 local
# MORNING_END   = time(13, 0, 0)         # 13:00 local (exclude evening investigations)
# INCLUDE_EVOLUTION = True               # set False if you only want first vs final

# # Output files
# OUT_DIR = Path("out")
# OUT_DIR.mkdir(parents=True, exist_ok=True)
# MASTER_METRICS_CSV = OUT_DIR / "master_route_day_metrics.csv"
# PER_CALL_METRICS_CSV = OUT_DIR / "per_call_metrics.csv"
# PER_TASK_LABELS_CSV = OUT_DIR / "per_route_day_task_labels.csv"  # optional (large)
# CALL_INDEX_CSV = OUT_DIR / "call_index.csv"                      # traceability


# # =========================
# # Helpers: parsing & time
# # =========================
# FILENAME_TS_RE = re.compile(r".*-(\d{6,6})-")  # e.g. 0521_300-20220617-055733-2-0.json -> 055733

# def parse_hhmmss_from_fname(name: str) -> time:
#     m = FILENAME_TS_RE.match(name)
#     if not m:
#         # fallback: try last 6 digits pattern
#         m = re.search(r"(\d{6})(?=[^0-9]*\.)", name)
#     if not m:
#         return None
#     hh, mm, ss = m.group(1)[0:2], m.group(1)[2:4], m.group(1)[4:6]
#     return time(int(hh), int(mm), int(ss))

# def is_morning(t: time) -> bool:
#     return (t is not None) and (MORNING_START <= t < MORNING_END)

# def try_read_json(path: Path):
#     try:
#         return json.loads(path.read_text(encoding="utf-8"))
#     except json.JSONDecodeError:
#         return None

# TASK_BLOCK_RE = re.compile(
#     r"id\s+(\d+)\s+address\s+latitude\s+([-\d\.]+)\s+longitude\s+([-\d\.]+)\s+timeWindow\s+from\s+([0-9\-:T]+)\s+till\s+([0-9\-:T]+)",
#     re.MULTILINE
# )
# FIXED_BLOCK_RE = re.compile(
#     r"taskId\s+(\d+)\s+activityType\s+Task\s+fixedPosition\s+(true|false)",
#     re.MULTILINE
# )

# def parse_request(path: Path):
#     """
#     Returns dict with:
#       configurationName: str|None
#       tasks: list of {id:int, lat:float, lon:float, from:str, till:str}
#       fixedTasks: set[int]
#     """
#     data = try_read_json(path)
#     out = {"configurationName": None, "tasks": [], "fixedTasks": set()}
#     if data:
#         out["configurationName"] = data.get("configurationName")
#         for t in data.get("tasks", []):
#             addr = t.get("address", {})
#             out["tasks"].append({
#                 "id": int(t["id"]),
#                 "lat": float(addr.get("latitude", np.nan)),
#                 "lon": float(addr.get("longitude", np.nan)),
#                 "from": t.get("timeWindow", {}).get("from"),
#                 "till": t.get("timeWindow", {}).get("till"),
#             })
#         out["fixedTasks"] = {int(ft.get("taskId")) for ft in data.get("fixedTasks", []) if "taskId" in ft}
#         return out

#     # Fallback for flattened text
#     txt = path.read_text(encoding="utf-8", errors="ignore")
#     # config name
#     m = re.search(r"configurationName\s+([A-Za-z]+)", txt)
#     if m: out["configurationName"] = m.group(1)
#     # tasks
#     for m in TASK_BLOCK_RE.finditer(txt):
#         out["tasks"].append({
#             "id": int(m.group(1)),
#             "lat": float(m.group(2)),
#             "lon": float(m.group(3)),
#             "from": m.group(4),
#             "till": m.group(5),
#         })
#     # fixed
#     for m in FIXED_BLOCK_RE.finditer(txt):
#         out["fixedTasks"].add(int(m.group(1)))
#     return out

# def parse_response_ids(path: Path) -> list[int]:
#     if not path.exists():
#         return []
#     ids = []
#     for line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
#         s = line.strip()
#         if s.isdigit():
#             ids.append(int(s))
#     return ids


# # =========================
# # Index the calls
# # =========================
# def index_calls():
#     """
#     Returns DataFrame with columns:
#       depot, route_id, date, call_time, folder_type (request|response),
#       config_name (for requests), num_tasks (for requests), num_fixed (for requests),
#       request_path, response_path, call_key
#     We build one row per REQUEST call and attach the nearest-in-time RESPONSE path in same folder.
#     """
#     rows = []

#     # (A) Optional: read Excel overview to help with typing
#     overview = None
#     if EXCEL_OVERVIEW.exists():
#         try:
#             overview = pd.read_excel(EXCEL_OVERVIEW, engine="openpyxl")
#             # Normalize column names
#             overview.columns = [c.strip() for c in overview.columns]
#             # Make a key RouteId + Date + Time to help later if needed
#         except Exception as e:
#             print(f"[WARN] Could not read {EXCEL_OVERVIEW}: {e}")

#     # Walk request folders
#     for route_day_dir in sorted(REQUESTS_DIR.glob(f"{DEPOT_PREFIX}*")):
#         if not route_day_dir.is_dir():
#             continue
#         # route_day name pattern: 0521_300-20220617
#         folder_name = route_day_dir.name
#         if "-" not in folder_name:
#             continue
#         route_id, ymd = folder_name.split("-")
#         if not route_id.startswith(DEPOT_PREFIX):
#             continue
#         date_str = ymd  # yyyymmdd

#         resp_dir = RESPONSES_DIR / folder_name

#         # requests in this folder
#         for req_file in sorted(route_day_dir.glob("*.json")):
#             call_t = parse_hhmmss_from_fname(req_file.name)
#             req_parsed = parse_request(req_file)
#             config = req_parsed.get("configurationName")
#             num_tasks = len(req_parsed.get("tasks", []))
#             num_fixed = len(req_parsed.get("fixedTasks", []))

#             # find a response file with same timestamp; fallback: nearest in time
#             candidate = None
#             if resp_dir.exists():
#                 # first try exact hhmmss match
#                 exact = list(resp_dir.glob(req_file.name.replace(".json", ".txt")))
#                 if exact:
#                     candidate = exact[0]
#                 else:
#                     # fallback: nearest by hhmmss among files that start with same prefix (route_id-date-*)
#                     resp_files = sorted(resp_dir.glob(f"{route_id}-{date_str}-*.txt"))
#                     # compute abs diff in seconds
#                     def hhmmss_to_sec(t: time):
#                         return t.hour*3600 + t.minute*60 + t.second if t else None
#                     req_sec = hhmmss_to_sec(call_t)
#                     best = None
#                     best_delta = None
#                     for rf in resp_files:
#                         rt = parse_hhmmss_from_fname(rf.name)
#                         rs = hhmmss_to_sec(rt)
#                         if (req_sec is not None) and (rs is not None):
#                             delta = abs(req_sec - rs)
#                             if best is None or delta < best_delta:
#                                 best = rf
#                                 best_delta = delta
#                     candidate = best

#             rows.append({
#                 "depot": route_id.split("_")[0],
#                 "route_id": route_id,
#                 "date": date_str,
#                 "call_time": call_t.isoformat() if call_t else None,
#                 "config_name": config,
#                 "num_tasks": num_tasks,
#                 "num_fixed": num_fixed,
#                 "request_path": str(req_file),
#                 "response_path": str(candidate) if candidate else None,
#             })

#     df = pd.DataFrame(rows)
#     # Morning filter flag
#     def flag_morning(t):
#         if pd.isna(t): return False
#         tt = datetime.strptime(t, "%H:%M:%S").time()
#         return is_morning(tt)
#     df["is_morning"] = df["call_time"].apply(flag_morning)

#     # Persist for transparency
#     df.sort_values(["route_id", "date", "call_time"], inplace=True, na_position="last")
#     df.to_csv(CALL_INDEX_CSV, index=False)
#     return df


# # =========================
# # Pair first & final + mark intermediates
# # =========================
# def select_pairs(call_index: pd.DataFrame) -> pd.DataFrame:
#     """
#     For each (route_id, date):
#       first = earliest morning CreateSequence
#       final = last morning EstimateTime with num_fixed == num_tasks
#       intermediates = morning calls in between (optional)
#     """
#     records = []
#     for (route_id, date), grp in call_index.groupby(["route_id", "date"], dropna=False):
#         g = grp[grp["is_morning"]].copy()
#         if g.empty:
#             continue

#         # First CreateSequence
#         g_first = g[g["config_name"]=="CreateSequence"].sort_values("call_time")
#         if g_first.empty:
#             continue
#         first_row = g_first.iloc[0]

#         # Final: last EstimateTime with fully fixed plan
#         g_final = g[(g["config_name"]=="EstimateTime") & (g["num_fixed"]==g["num_tasks"])].sort_values("call_time")
#         if g_final.empty:
#             # fallback: last morning EstimateTime even if not fully fixed
#             g_final = g[g["config_name"]=="EstimateTime"].sort_values("call_time")
#             if g_final.empty:
#                 continue
#         final_row = g_final.iloc[-1]

#         # Intermediates: all morning calls strictly between first and final times
#         inter = pd.DataFrame()
#         if INCLUDE_EVOLUTION:
#             inter = g[
#                 (g["call_time"] > first_row["call_time"]) &
#                 (g["call_time"] < final_row["call_time"])
#             ].sort_values("call_time")

#         records.append({
#             "route_id": route_id,
#             "date": date,
#             "first_request_path": first_row["request_path"],
#             "first_response_path": first_row["response_path"],
#             "first_call_time": first_row["call_time"],
#             "final_request_path": final_row["request_path"],
#             "final_response_path": final_row["response_path"],
#             "final_call_time": final_row["call_time"],
#             "num_intermediate_calls": len(inter),
#             "intermediate_rows": inter  # keep for now; we won't serialize this dict
#         })

#     return pd.DataFrame(records)


# # =========================
# # Metrics & labels
# # =========================
# def path_length(seq: list[int], coords: dict[int, tuple[float,float]]) -> float:
#     total = 0.0
#     for i in range(len(seq)-1):
#         a = coords.get(seq[i]); b = coords.get(seq[i+1])
#         if not a or not b: continue
#         dx = a[0]-b[0]; dy = a[1]-b[1]
#         total += math.hypot(dx, dy)
#     return total

# def parse_coords_and_seq(req_path: str, resp_path: str):
#     req = parse_request(Path(req_path))
#     coords = {t["id"]:(t["lat"], t["lon"]) for t in req["tasks"] if not (np.isnan(t["lat"]) or np.isnan(t["lon"]))}
#     seq = parse_response_ids(Path(resp_path)) if resp_path else []
#     return req, coords, seq

# def order_metrics(first_seq, final_seq):
#     set_first = set(first_seq); set_final = set(final_seq)
#     removed = sorted(set_first - set_final)
#     added   = sorted(set_final - set_first)
#     common  = sorted(set_first & set_final)

#     pos_first = {tid:i for i,tid in enumerate(first_seq)}
#     pos_final = {tid:i for i,tid in enumerate(final_seq)}

#     if len(common)==0:
#         return {
#             "removed_count": len(removed), "added_count": len(added), "common_count": 0,
#             "footrule": np.nan, "footrule_norm": np.nan,
#             "kendall_inversions": np.nan, "kendall_tau_norm": np.nan,
#             "mean_abs_shift": np.nan, "median_abs_shift": np.nan, "max_abs_shift": np.nan,
#             "removed_ids": removed, "added_ids": added, "shifts_df": pd.DataFrame()
#         }

#     positions = np.array([[tid, pos_first[tid], pos_final[tid]] for tid in common])
#     abs_shifts = np.abs(positions[:,1] - positions[:,2])

#     # Spearman footrule
#     footrule = float(abs_shifts.sum())
#     n = len(common); max_footrule = n*(n-1)/2 if n>1 else 1.0
#     footrule_norm = footrule / max_footrule

#     # Kendall tau inversions (O(n^2) is fine for ~200)
#     perm = [pos_final[tid] for tid in sorted(common, key=lambda x: pos_first[x])]
#     inv = 0
#     for i in range(len(perm)):
#         for j in range(i+1, len(perm)):
#             inv += 1 if perm[i] > perm[j] else 0
#     max_inv = n*(n-1)//2 if n>1 else 1
#     kendall_norm = inv / max_inv

#     shifts_df = pd.DataFrame({
#         "taskId": positions[:,0].astype(int),
#         "pos_first": positions[:,1].astype(int),
#         "pos_final": positions[:,2].astype(int),
#         "abs_shift": abs_shifts.astype(int)
#     })

#     return {
#         "removed_count": len(removed), "added_count": len(added), "common_count": n,
#         "footrule": footrule, "footrule_norm": footrule_norm,
#         "kendall_inversions": int(inv), "kendall_tau_norm": float(kendall_norm),
#         "mean_abs_shift": float(abs_shifts.mean()), "median_abs_shift": float(np.median(abs_shifts)),
#         "max_abs_shift": int(abs_shifts.max()),
#         "removed_ids": removed, "added_ids": added, "shifts_df": shifts_df
#     }

# def time_window_category(tw_from: str, tw_till: str):
#     try:
#         till = datetime.fromisoformat(tw_till).time()
#     except Exception:
#         return "unknown"
#     if till.hour==23 and till.minute==59:
#         return "all-day"
#     if till < time(13,0,0):
#         return "morning-window"
#     # quick heuristic for late starts
#     try:
#         frm = datetime.fromisoformat(tw_from).time()
#         if frm >= time(16,0,0):
#             return "late-window"
#     except Exception:
#         pass
#     return "other-window"


# # =========================
# # Main driver
# # =========================
# def main():
#     call_index = index_calls()
#     pairs = select_pairs(call_index)

#     master_rows = []
#     per_call_rows = []
#     per_task_rows = []

#     for _, row in pairs.iterrows():
#         route_id = row["route_id"]; date = row["date"]
#         first_req, first_coords, first_seq = parse_coords_and_seq(row["first_request_path"], row["first_response_path"])
#         final_req, final_coords, final_seq = parse_coords_and_seq(row["final_request_path"], row["final_response_path"])

#         # Core order metrics
#         om = order_metrics(first_seq, final_seq)

#         # Path lengths (normalized units)
#         L_first = path_length(first_seq, first_coords)
#         L_final = path_length(final_seq, final_coords)
#         L_delta = L_final - L_first
#         L_pct   = (L_delta/L_first*100.0) if L_first>0 else np.nan

#         master_rows.append({
#             "depot": route_id.split("_")[0],
#             "route_id": route_id,
#             "date": date,
#             "first_call_time": row["first_call_time"],
#             "final_call_time": row["final_call_time"],
#             "num_intermediate_calls": row["num_intermediate_calls"],
#             # order change (common)
#             "removed_count": om["removed_count"],
#             "added_count": om["added_count"],
#             "common_count": om["common_count"],
#             "footrule": om["footrule"],
#             "footrule_norm": om["footrule_norm"],
#             "kendall_inversions": om["kendall_inversions"],
#             "kendall_tau_norm": om["kendall_tau_norm"],
#             "mean_abs_shift": om["mean_abs_shift"],
#             "median_abs_shift": om["median_abs_shift"],
#             "max_abs_shift": om["max_abs_shift"],
#             # path length
#             "length_first_units": L_first,
#             "length_final_units": L_final,
#             "length_delta_units": L_delta,
#             "length_pct_change": L_pct,
#         })

#         # Optional per-task labels for this route-day
#         set_first = set(first_seq); set_final = set(final_seq)
#         removed_ids = set_first - set_final
#         added_ids   = set_final - set_first
#         kept_ids    = set_first & set_final

#         # build time window lookup from first/final requests
#         tw = {}
#         for t in first_req["tasks"]:
#             tw[t["id"]] = (t["from"], t["till"])
#         for t in final_req["tasks"]:
#             if t["id"] not in tw:
#                 tw[t["id"]] = (t["from"], t["till"])

#         # shifts for kept
#         shifts_map = {}
#         for _, r2 in om["shifts_df"].iterrows():
#             shifts_map[int(r2["taskId"])] = int(r2["abs_shift"])

#         for tid in removed_ids:
#             frm, till = tw.get(tid, (None, None))
#             per_task_rows.append({
#                 "route_id": route_id, "date": date, "taskId": tid,
#                 "label": "removed", "abs_shift": None,
#                 "time_window_cat": time_window_category(frm, till)
#             })
#         for tid in added_ids:
#             frm, till = tw.get(tid, (None, None))
#             per_task_rows.append({
#                 "route_id": route_id, "date": date, "taskId": tid,
#                 "label": "added", "abs_shift": None,
#                 "time_window_cat": time_window_category(frm, till)
#             })
#         for tid in kept_ids:
#             frm, till = tw.get(tid, (None, None))
#             per_task_rows.append({
#                 "route_id": route_id, "date": date, "taskId": tid,
#                 "label": "kept", "abs_shift": shifts_map.get(tid, 0),
#                 "time_window_cat": time_window_category(frm, till)
#             })

#         # Optional: evolution across intermediate calls
#         if INCLUDE_EVOLUTION:
#             inter_df: pd.DataFrame = row["intermediate_rows"]
#             # Compare each intermediate to the first
#             for _, ir in inter_df.iterrows():
#                 _, inter_coords, inter_seq = parse_coords_and_seq(ir["request_path"], ir["response_path"])
#                 em = order_metrics(first_seq, inter_seq)
#                 per_call_rows.append({
#                     "route_id": route_id,
#                     "date": date,
#                     "call_time": ir["call_time"],
#                     "config_name": ir["config_name"],
#                     "num_tasks": ir["num_tasks"],
#                     "num_fixed": ir["num_fixed"],
#                     "vs_first_common_count": em["common_count"],
#                     "vs_first_footrule_norm": em["footrule_norm"],
#                     "vs_first_kendall_tau_norm": em["kendall_tau_norm"],
#                 })

#     # Write outputs
#     pd.DataFrame(master_rows).sort_values(["route_id","date"]).to_csv(MASTER_METRICS_CSV, index=False)
#     if INCLUDE_EVOLUTION and len(per_call_rows)>0:
#         pd.DataFrame(per_call_rows).to_csv(PER_CALL_METRICS_CSV, index=False)
#     if len(per_task_rows)>0:
#         pd.DataFrame(per_task_rows).to_csv(PER_TASK_LABELS_CSV, index=False)

#     print(f"[OK] Wrote:\n- {MASTER_METRICS_CSV}")
#     if INCLUDE_EVOLUTION and len(per_call_rows)>0:
#         print(f"- {PER_CALL_METRICS_CSV}")
#     if len(per_task_rows)>0:
#         print(f"- {PER_TASK_LABELS_CSV}")
#     print(f"- {CALL_INDEX_CSV}  (traceability)")

# if __name__ == "__main__":
#     main()
# # ------------------------------------------------------------


In [None]:
import re 

# Skip the first two dash-terminated fields, then capture the six-digit
# block that follows (it must itself be followed by a dash).
pattern = re.compile(
    r"^(?:[^-]*-){2}(\d{6})-"
)
m = pattern.match(
    "0521_300-20220617-055733-2-0.json"
)
print(m[1])

In [None]:
# Sample file name used to try out the pattern below.
name = "0521_300-20220617-055733-2-0.json"

# One dash-terminated leading field, then an eight-digit block followed by a dash.
pattern = re.compile(
    r"^(?:[^-]*-)(\d{8})-"
)
timestring_from_filename = pattern.match(name)
# NOTE(review): despite the variable name, group 1 of this pattern is the
# eight-digit block of the file name, not the six-digit one.
print(timestring_from_filename[1])

In [None]:
from pathlib import Path

# Relative data folder; adjust if your repo layout differs.
base_dir = Path("data")
print(base_dir)
# REQUESTS_DIR = BASE_DIR / "requests"
# RESPONSES_DIR = BASE_DIR / "responses"
# EXCEL_OVERVIEW = BASE_DIR / "ModifiedQueryRows.xlsx"  # optional but recommended

# DEPOT_PREFIX = "0521_"                 # only process route folders whose name starts with this
# MORNING_START = time(5, 0, 0)          # 05:00 local
# MORNING_END   = time(13, 0, 0)         # 13:00 local (exclude evening investigations)
# INCLUDE_EVOLUTION = True               # set False if you only want first vs final


In [None]:

from pathlib import Path

# Working directory of the running kernel/process (Path.cwd()); this is not
# derived from the location of this file.
HERE = Path.cwd()
# One level above the working directory.
BASE_DIR = HERE.parent
DATA_DIR = BASE_DIR.joinpath("data")      # <BASE_DIR>/data
OUTPUT = DATA_DIR.joinpath("output")      # <BASE_DIR>/data/output

# Create the output folder together with any missing parents; an already
# existing folder is not an error.
OUTPUT.mkdir(parents=True, exist_ok=True)

print(HERE, BASE_DIR, sep="\n")

