In [47]:
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path

from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment

In [48]:
regime_df = pd.read_csv(
    "../../artifacts/reports/GBP_USD/ws5/learnable64/default/nc8/regime_assignments_dbg.csv"
)

In [49]:
raw_cluster_series = regime_df["Cluster_ID"].dropna().astype(int)
raw_cluster_ids = set(raw_cluster_series.unique())
regime_counts = raw_cluster_series.value_counts().sort_index()
print("🔎 Unique raw Cluster_IDs:", raw_cluster_ids)

🔎 Unique raw Cluster_IDs: {np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)}


In [50]:
metadata_dir = Path("../../artifacts/baseline_metadata/GBP_USD/ws5/learnable64/default/nc8")

baseline_path = metadata_dir / "regime_assignments_baseline.csv"
mapping_path = metadata_dir / "regime_label_mapping.json"

baseline_series = pd.read_csv(baseline_path)["Cluster_ID"].dropna().astype(int)


# Load baseline assignments
baseline_counts = baseline_series.value_counts().sort_index()


with open(mapping_path, "r", encoding="utf-8") as f:
    existing_mapping = json.load(f)
    existing_mapping = {int(k): int(v) for k, v in existing_mapping.items()}

In [51]:
print("🔎 Unique regime Cluster_IDs:", regime_counts)
print("🔎 Unique baseline Cluster_IDs:", baseline_counts)

🔎 Unique regime Cluster_IDs: Cluster_ID
0    600
1    162
2    413
3    157
4    202
5     60
6     33
7    254
Name: count, dtype: int64
🔎 Unique baseline Cluster_IDs: Cluster_ID
0    604
1    253
2    413
3    157
4     60
5     30
6    160
7    202
Name: count, dtype: int64


In [60]:
already_mapped_from = set(existing_mapping.keys())
already_mapped_to = set(existing_mapping.values())
known_ids = already_mapped_from.union(already_mapped_to)
print("🔎 Existing mapping:", existing_mapping)
print("🔎 Already mapped from:", already_mapped_from)
print("🔎 Already mapped to:", already_mapped_to)
print("🔎 Known IDs:", known_ids)
print("🔎 Raw:", raw_cluster_ids)

unmapped_ids = raw_cluster_ids - known_ids

print(
    "🆕 Unmapped cluster IDs:", unmapped_ids
)

🔎 Existing mapping: {1: 0, 3: 1, 2: 2, 4: 4, 5: 5, 0: 6, 6: 7}
🔎 Already mapped from: {0, 1, 2, 3, 4, 5, 6}
🔎 Already mapped to: {0, 1, 2, 4, 5, 6, 7}
🔎 Known IDs: {0, 1, 2, 3, 4, 5, 6, 7}
🔎 Raw: {np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)}
🆕 Unmapped cluster IDs: set()


In [53]:
min_len = min(len(baseline_series), len(raw_cluster_series))
baseline_trimmed = baseline_series.iloc[:min_len].to_numpy()
current_trimmed = raw_cluster_series.iloc[:min_len].to_numpy()

mask = np.isin(current_trimmed, list(unmapped_ids))
baseline_masked = baseline_trimmed[mask]
current_masked = current_trimmed[mask]

print("✂️ Masked baseline (first 10):", baseline_masked[:10])
print("✂️ Masked current  (first 10):", current_masked[:10])
print("✅ unique current:", np.unique(current_masked))
print("✅ unique baseline:", np.unique(baseline_masked))
print(
    "📊 baseline_masked full unique counts:\n",
    pd.Series(baseline_masked).value_counts(),
)

✂️ Masked baseline (first 10): [1 1 1 1 1 1 1 1 1 1]
✂️ Masked current  (first 10): [7 7 7 7 7 7 7 7 7 7]
✅ unique current: [7]
✅ unique baseline: [0 1]
📊 baseline_masked full unique counts:
 1    252
0      2
Name: count, dtype: int64


In [54]:
if len(current_masked) == 0 or not any(
    uid in np.unique(baseline_masked) for uid in unmapped_ids
):
    print("⚠️ Skipping Hungarian — no valid overlap for unmapped IDs.")
else:
    C = confusion_matrix(baseline_masked, current_masked, labels=sorted(unmapped_ids))
    row_ind, col_ind = linear_sum_assignment(-C)
    new_mapping = {int(col): int(row) for row, col in zip(row_ind, col_ind)}
    print("📌 New Mapping:", new_mapping)

⚠️ Skipping Hungarian — no valid overlap for unmapped IDs.


In [61]:
# Drop NaNs and convert to int
current_series = regime_df["Cluster_ID"].dropna().astype(int)

# Trim to same length
min_len = min(len(current_series), len(baseline_series))
current_trimmed = current_series.iloc[:min_len].to_numpy()
baseline_trimmed = baseline_series.iloc[:min_len].to_numpy()

# Compute unified label space
all_ids = sorted(set(current_trimmed) | set(baseline_trimmed))

# Confusion matrix
C = confusion_matrix(baseline_trimmed, current_trimmed, labels=all_ids)

# Run Hungarian algorithm
row_ind, col_ind = linear_sum_assignment(-C)
mapping = {all_ids[col]: all_ids[row] for row, col in zip(row_ind, col_ind)}

# Print diagnostics
print("🔢 All Cluster IDs:", all_ids)
print("🔗 Confusion Matrix:\n", C)
print("✅ Hungarian Mapping:")
for k, v in mapping.items():
    print(f"    {k} → {v}")

🔢 All Cluster IDs: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)]
🔗 Confusion Matrix:
 [[599   2   0   0   0   0   1   2]
 [  0   0   0   0   1   0   0 252]
 [  0   0 413   0   0   0   0   0]
 [  0   0   0 157   0   0   0   0]
 [  0   0   0   0   0  60   0   0]
 [  0   0   0   0   0   0  30   0]
 [  0 160   0   0   0   0   0   0]
 [  1   0   0   0 201   0   0   0]]
✅ Hungarian Mapping:
    0 → 0
    7 → 1
    2 → 2
    3 → 3
    5 → 4
    6 → 5
    1 → 6
    4 → 7


In [None]:
# --- Step 1: Save original raw cluster labels ---
raw_cluster_labels = regime_df["Cluster_ID"].copy()
raw_cluster_ids = set(raw_cluster_labels.dropna().astype(int).unique())

# --- Step 2: Load existing mapping ---
# mapping_path = os.path.join(self.baseline_dir, "regime_label_mapping.json")
# if os.path.exists(mapping_path):
#     with open(mapping_path, "r", encoding="utf-8") as f:
#         existing_mapping = json.load(f)
#         existing_mapping = {int(k): int(v) for k, v in existing_mapping.items()}
#     logger.info(f"🔁 Existing mapping loaded with {len(existing_mapping)} entries")
# else:
#     existing_mapping = {}


# --- Step 3: Apply existing mapping ---
def apply_mapping(x):
    return existing_mapping.get(int(x), x) if pd.notna(x) else x


regime_df["Cluster_ID"] = raw_cluster_labels.apply(apply_mapping)

# --- Step 4: Detect unmapped cluster IDs ---
already_mapped = set(existing_mapping.keys())
unmapped_ids = raw_cluster_ids - already_mapped
print(f"🧠 Unmapped cluster IDs: {sorted(unmapped_ids)}")

# --- Step 5: Align only new IDs if baseline is present ---
baseline_path = os.path.join(self.baseline_dir, "regime_assignments_baseline.csv")
if unmapped_ids and os.path.exists(baseline_path):
    baseline_series = pd.read_csv(baseline_path)["Cluster_ID"].dropna().astype(int)
    current_series = raw_cluster_labels.dropna().astype(int)

    min_len = min(len(current_series), len(baseline_series))
    baseline_trimmed = baseline_series.iloc[:min_len].to_numpy()
    current_trimmed = current_series.iloc[:min_len].to_numpy()

    # Restrict confusion matrix to unmapped IDs only
    C = confusion_matrix(baseline_trimmed, current_trimmed, labels=sorted(unmapped_ids))

    if C.size == 0 or C.shape[0] != C.shape[1]:
        logger.warning("⚠️ Hungarian skipped: not enough overlap to align new IDs.")
    else:
        row_ind, col_ind = linear_sum_assignment(-C)
        new_mapping = {
            int(sorted(unmapped_ids)[col]): int(sorted(unmapped_ids)[row])
            for row, col in zip(row_ind, col_ind)
        }

        # Add new mappings to existing
        for k, v in new_mapping.items():
            if k not in existing_mapping:
                existing_mapping[k] = v
        print(f"🔐 Updated mapping with {len(new_mapping)} new entries.")

        # Re-apply full mapping after update
        regime_df["Cluster_ID"] = raw_cluster_labels.apply(
            lambda x: existing_mapping.get(int(x), x) if pd.notna(x) else x
        )

        # with open(mapping_path, "w", encoding="utf-8") as f:
        #     json.dump(existing_mapping, f, indent=2)
        # print(f"💾 Mapping file saved: {mapping_path}")