In [None]:
import itertools
import string
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# --- input: raw finance workbook and the sheet to clean ---------------------
RAW_FILE = "originals/Raw international financial data_FY22-25.xlsx"
RAW_SHEET = "FY25 Mar Est - Europe (2)"

# --- input: manually cleaned reference, used only for the sanity check ------
REF_FILE = "originals/CLN_FY25_International - products ONLY.xlsx"
REF_SHEET = "CLN - EU - Products ONLY"

# --- output locations -------------------------------------------------------
OUTPUT_DIR = "outputs"
OUTPUT_XLSX = OUTPUT_DIR + "/cleaned_products_eu.xlsx"
OUTPUT_CSV = OUTPUT_DIR + "/cleaned_products_eu.csv"
OUTPUT_SHEET = "AUTO_CLEAN"

# (upper bound, label) pairs in ascending order.  add_brackets() passes the
# bounds to pd.cut as right-closed bin edges, so a value equal to a bound
# falls into that bound's bracket.
# NOTE(review): the labels read as thousands ("K") -- confirm "Net revenue"
# is expressed in the same unit as these bounds.
BRACKETS = [
    (25, "0-25K"),
    (49, "26-49K"),
    (100, "50-100K"),
    (249, "101-249K"),
    (499, "250-499K"),
    (float("inf"), "500+"),
]

# Column order of the exported table; the trailing indicator columns are the
# bracket labels, in bracket order.
FINAL_COLS = [
    "Region",
    "LOB",
    "Combined SW #",
    "Partner",
    "Category",
    "Primary territory",
    "Start date",
    "End date",
    "Bracket",
    "Lifecycle",
    "High / Med/Low touch",
    "Details",
    "Net revenue",
    *[label for _, label in BRACKETS],
]

In [None]:
def detect_header(sheet: str, marker: str = "Sub Dept", lookahead: int = 60) -> int:
    """
    Locate the header row of *sheet* in the raw workbook.

    Only the first *lookahead* rows are read, and only the first column is
    searched.  *marker* is matched with ``Series.str.contains``, i.e. as a
    regular expression anywhere inside the cell text.

    Returns
    -------
    int
        Zero-based index of the first matching row.

    Raises
    ------
    ValueError
        If no cell in the first column of the scanned rows contains *marker*.
    """
    preview = pd.read_excel(RAW_FILE, sheet, header=None, nrows=lookahead)

    # Cast to str so numeric / empty cells can be searched without errors.
    first_column = preview.iloc[:, 0].astype(str)
    hits = preview.index[first_column.str.contains(marker, na=False)]

    if len(hits) == 0:
        raise ValueError("Could not find header row for sheet → " + sheet)
    return int(hits[0])


def load_raw() -> pd.DataFrame:
    """
    Read the raw finance sheet and keep only the product detail rows.

    The header row is located with ``detect_header``.  Leading and trailing
    whitespace is stripped from every string cell *and* from the column
    headers, so later lookups by column name are not defeated by stray
    spaces.

    Returns
    -------
    pd.DataFrame
        A copy of the rows whose "Sub Dept" contains "Product" but not
        "Total" (both case-insensitive).  Rows with a missing "Sub Dept" are
        dropped.
    """
    hdr = detect_header(RAW_SHEET)
    df = pd.read_excel(RAW_FILE, RAW_SHEET, header=hdr)

    def _strip(value):
        # Leave non-string values (numbers, dates, NaN) untouched.
        return value.strip() if isinstance(value, str) else value

    # Trim whitespace early, headers included.
    df.columns = [_strip(col) for col in df.columns]

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; check the class (not the instance) so a column that
    # happens to be called "map" cannot be mistaken for the method.
    cellwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    df = cellwise(_strip)

    is_product = df["Sub Dept"].str.contains("Product", case=False, na=False)
    is_total = df["Sub Dept"].str.contains("Total", case=False, na=False)
    return df.loc[is_product & ~is_total].copy()


# Raw-sheet column header -> column name used in the cleaned output.
RENAME = {
    "Licensee": "Partner",
    "Product Category": "Category",
    "Primary Territory": "Primary territory",
    "SW #": "Combined SW #",
    "Net Revenue": "Net revenue",
    "Start Date": "Start date",
    "End Date": "End date",
}


# NOTE: legacy version, superseded by the active aggregate_rows() below; kept for reference only.
# def aggregate_rows(df: pd.DataFrame) -> pd.DataFrame:
#     """Roll-up exact duplicates *before* we start fiddling with IDs."""
#     key = ["Partner","Combined SW #","Category","Start date","End date"]
#     out = (
#         df.groupby(key, dropna=False, as_index=False)
#           .agg({
#               "Net revenue"      : "sum",
#               "Primary territory": (lambda s: ", ".join(sorted(set(s.dropna()))))
#           })
#     )
#     return out

def aggregate_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse duplicate deal rows and sum their "Net revenue".

    Two grouping rules are applied:

    * Ordinary partners are grouped by Partner / Combined SW # / Category /
      Start date / End date; the distinct territories of each group are
      sorted and joined with ", " into a single "Primary territory" value.
    * Rows whose Partner is "New Business" (case-insensitive) are grouped by
      the same key *plus* "Primary territory", so each territory stays on its
      own line and only literal duplicates are merged.

    Missing key values form their own groups (``dropna=False``).  The
    ordinary rows come first in the result, followed by the New Business
    rows, with a fresh 0..n-1 index.
    """
    deal_key = ["Partner", "Combined SW #", "Category", "Start date", "End date"]

    def join_territories(values: pd.Series) -> str:
        # Distinct, non-null territories in alphabetical order.
        return ", ".join(sorted(set(values.dropna())))

    is_new_business = df["Partner"].str.lower() == "new business"

    # Ordinary partners: one line per deal, territories concatenated.
    regular_rollup = (
        df.loc[~is_new_business]
        .groupby(deal_key, dropna=False, as_index=False)
        .agg({"Net revenue": "sum", "Primary territory": join_territories})
    )

    # New Business: one line per deal *and* territory.
    new_business_rollup = (
        df.loc[is_new_business]
        .groupby([*deal_key, "Primary territory"], dropna=False, as_index=False)
        .agg({"Net revenue": "sum"})
    )

    return pd.concat([regular_rollup, new_business_rollup], ignore_index=True)

def assign_ids(df: pd.DataFrame) -> pd.DataFrame:
    id_counter = defaultdict(int)
    blank_it   = itertools.count(1)
    tbd_it     = itertools.count(1)
    new_ids    = []

    for base_raw, partner in zip(df["_base"], df["Partner"], strict=False):
        base = str(base_raw).strip()
        base_up = base.upper()         # <--- NEW (1)  normalise once

        # ---------- placeholder logic -------------------------
        if base_up in {"", "NAN", "NONE", "TB", "TBD"}:   # <--- NEW (2)
            if str(partner).strip().lower() == "new business":
                new_ids.append(f"Blank {next(blank_it)}")
            elif str(partner).lower().startswith("fx gain"):
                new_ids.append("FX Gain(Loss)")
            else:
                new_ids.append(f"TBD{next(tbd_it)}")
            continue

        seq = id_counter[base_up]
        suffix = "" if seq == 0 else string.ascii_uppercase[seq - 1]
        new_ids.append(f"{base_up}{suffix}")
        id_counter[base_up] += 1        # <--- change key to *base_up*  (3)

    df["Combined SW #"] = new_ids
    return df

def rollup_fx(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse all FX rows into a single line.

    A row counts as FX when its Partner starts with "FX Gain"
    (case-insensitive), which includes names such as "FX Gain(Loss)".
    NOTE(review): a partner starting with "FX Loss" is *not* matched --
    confirm no such name occurs in the raw data.

    When two or more FX rows exist, the first one is used as the template:
    its "Net revenue" becomes the sum over all FX rows and its
    "Primary territory" is set to "Benelux".  The combined row is appended
    after the non-FX rows and the index is reset.  Column dtypes are kept
    (the template stays a one-row DataFrame rather than going through an
    object-dtype Series).

    With zero or one FX row the input frame is returned unchanged.
    """
    mask = df["Partner"].str.contains(r"^FX Gain", case=False, na=False)
    if mask.sum() <= 1:
        return df                      # already rolled-up or none present

    fx_rows = df.loc[mask]

    # iloc[[0]] (list indexer) keeps a DataFrame, so numeric and datetime
    # columns retain their dtype through the concat below.
    combined = fx_rows.iloc[[0]].copy()
    combined["Net revenue"] = fx_rows["Net revenue"].sum()
    # The rolled-up line is always booked under a fixed territory.
    combined["Primary territory"] = "Benelux"

    return pd.concat([df.loc[~mask], combined], ignore_index=True)

def add_brackets(df: pd.DataFrame) -> pd.DataFrame:
    bins, labels = zip(*BRACKETS, strict=False)
    df["Bracket"] = pd.cut(df["Net revenue"], bins=[-np.inf, *bins], labels=labels)
    for lab in labels:
        df[lab] = (df["Bracket"] == lab).astype(int)
    return df

In [None]:
# End-to-end run: load the raw product rows, roll them up, assign IDs and
# brackets, write the CSV/XLSX outputs, then compare against the manual
# reference workbook when it is available.
raw = load_raw().rename(columns=RENAME)

# date coercion: unparseable values become NaT instead of raising -----------
for col in ("Start date", "End date"):
    raw[col] = pd.to_datetime(raw[col], errors="coerce")

# drop obviously placeholder rows (missing category) ------------------------
raw = raw[raw["Category"].notna()]

# aggregate, then derive the suffix-free base ID -----------------------------
agg = aggregate_rows(raw)
agg["Region"] = "EUROPE"
agg["LOB"]    = "Products"

# "_base" is the SW # as text with one trailing upper-case letter removed.
# This also shortens "TBD" to "TB", which assign_ids() treats as a placeholder.
agg["_base"] = (
    agg["Combined SW #"].fillna("")
        .astype(str)
        .str.replace(r"[A-Z]$", "", regex=True)  # strip any existing suffix
)
# Order matters: the sort fixes the sequence in which assign_ids() hands out
# suffixes; rollup_fx() appends its combined row after the sorted rows.
agg = agg.sort_values(["_base", "Partner"], kind="mergesort")  # stable sort
agg = rollup_fx(agg)
agg = assign_ids(agg)

# add the columns that are filled in manually later, as empty strings --------
for col in ("Lifecycle", "High / Med/Low touch", "Details"):
    agg[col] = ""

# brackets & final tidy: keep only the export columns, in export order -------
agg = add_brackets(agg)
final = agg[FINAL_COLS]

# ---------------------------------------------------------------------------
# outputs -------------------------------------------------------------------
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
final.to_csv(OUTPUT_CSV, index=False)

# mode="w" replaces an existing workbook instead of appending to it
with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl", mode="w") as xls:
    final.to_excel(xls, sheet_name=OUTPUT_SHEET, index=False)

print("Saved →", OUTPUT_CSV, "and", OUTPUT_XLSX)

# quick sanity-check against the manual reference (skipped if file missing) --
if Path(REF_FILE).exists():
    # header=2: column names are read from the third row of the sheet
    ref = pd.read_excel(REF_FILE, REF_SHEET, header=2)
    ref = ref.loc[:, ~ref.columns.str.contains(r"^Unnamed")]  # drop columns whose name starts with "Unnamed"

    print("\n— sanity —")
    print("rows   ref / ours:", ref.shape[0], "/", final.shape[0])
    # NOTE(review): totals are divided by 1,000 and labelled "€k", while the
    # bracket bounds compare "Net revenue" against 25 / 49 / ... directly --
    # confirm which unit the column is in.
    print(
        "total €k       :",
        round(ref["Net revenue"].sum() / 1_000, 1), "/",
        round(final["Net revenue"].sum() / 1_000, 1),
    )


# Interesting Examples
- Blank 1 / Blank 2 (placeholder IDs given to "New Business" rows that have no SW #)
- 12621
- 13191