In [6]:
# QC Backtest + Live Portfolio (latest-month) – full script
# Version: v6 (2025-08-26)
# Changes in v6:
#  - Add FACTOR_TOP knob (half/third/quarter/all or numeric fraction) + FACTOR_MIN_K
#  - Backtest & LIVE meta-selection use compute_top_k(...)
#  - Keeps v5 robust date parser + earlier numeric-coercion fixes and Weight_% assign

import pandas as pd
import numpy as np
import os
from functools import reduce

# ---------- Robust date parser to avoid pandas 'Could not infer format' warnings ----------
def parse_date_column(series: pd.Series) -> pd.Series:
    """Parse a date column that may contain Excel serials and/or multiple string formats
    without triggering the pandas 'Could not infer format' warning.
    Tries in order: Excel serial, then a list of explicit string formats, then a last-resort
    dateutil parse with dayfirst=True. Always returns a datetime64[ns] Series (NaT on failure).
    """
    s = series.copy()
    # 1) Try Excel serials (vectorized)
    as_num = pd.to_numeric(s, errors='coerce')
    out = pd.to_datetime(as_num, unit='D', origin='1899-12-30', errors='coerce')

    # 2) Try explicit string formats (vectorized per-format)
    formats = [
        '%Y-%m-%d',    # 2025-08-31
        '%d-%m-%Y',    # 31-08-2025
        '%d/%m/%Y',    # 31/08/2025
        '%m/%d/%Y',    # 08/31/2025
        '%d-%b-%Y',    # 31-Aug-2025
        '%d-%b-%y',    # 31-Aug-25
    ]
    mask = out.isna()
    s_str = s.astype(str)
    for fmt in formats:
        if not mask.any():
            break
        idx = mask
        parsed = pd.to_datetime(s_str[idx], format=fmt, errors='coerce')
        out.loc[idx] = out.loc[idx].fillna(parsed)
        mask = out.isna()

    # 3) Last resort: dateutil with dayfirst, still vectorized; may be slower but no warning now
    if mask.any():
        out.loc[mask] = pd.to_datetime(s_str[mask], errors='coerce', dayfirst=True)

    return out

# ======================================
# 1) Which universes to run (edit later)
# ======================================
UNIVERSES = ["FTSE250", "SnP500", "STOXX600","S&PMidCap"]          

# ======================================
# 2) Portfolio knobs (edit anytime)
# ======================================
KEEP_N       = 10        # number of stocks per factor portfolio
SCREEN_N     = 20        # number of candidates screened per month
FACTOR_TOP   = 'quarter'
FACTOR_MIN_K = 1         # never pick fewer than this many factors

# ---- helpers for meta-selection ----
def _factor_fraction(x) -> float:
    """Resolve FACTOR_TOP to a numeric fraction in (0, 1]."""
    if isinstance(x, str):
        s = x.strip().lower()
        mapping = {
            'half': 0.5,
            'third': 1/3,
            'quarter': 0.25,
            'fifth': 0.20,
            'all': 1.0,
        }
        if s in mapping:
            return mapping[s]
        try:
            f = float(s)
        except ValueError:
            f = 0.5
        return max(1e-9, min(1.0, f))
    try:
        f = float(x)
    except Exception:
        f = 0.5
    return max(1e-9, min(1.0, f))

def compute_top_k(n_available: int) -> int:
    """Number of factors to keep given how many have valid 12m returns."""
    frac = _factor_fraction(FACTOR_TOP)
    return max(FACTOR_MIN_K, int(np.ceil(n_available * frac)))

# ======================================
# 3) Shared base folders (edit if needed)
# ======================================
BASE        = "H:/Tech Hardware Shared/$Mike/Quant"
CONSTIT_DIR = f"{BASE}/Constituents"
CSV_DIR     = f"{BASE}/CSV_files/QC"       # per-ISIN history files (unchanged)
SPOT_DIR    = f"{BASE}/Spot"
EXPORT_DIR  = f"{BASE}/Python_Outputs/QC"
os.makedirs(EXPORT_DIR, exist_ok=True)

# ======================================
# 4) Trading costs by universe
# ======================================
COSTS = {
    "FTSE250":     {"STAMP_DUTY": 0.0050, "BID_ASK": 0.0010, "FEES": 0.0010},
    "STOXX600":    {"STAMP_DUTY": 0.0000, "BID_ASK": 0.0010, "FEES": 0.0010},
    "SnP500":      {"STAMP_DUTY": 0.0000, "BID_ASK": 0.0010, "FEES": 0.0010},
    "S&PMidCap":   {"STAMP_DUTY": 0.0000, "BID_ASK": 0.0010, "FEES": 0.0010},    
}

# ======================================
# 5) Main runner for one universe (full pipeline)
# ======================================
def run_universe(UNIVERSE: str, keep_n: int = KEEP_N, screen_n: int = SCREEN_N):
    if UNIVERSE not in COSTS:
        print(f"[SKIP] Unknown UNIVERSE '{UNIVERSE}'. Supported: {list(COSTS)}")
        return

    # --- config & file paths ---
    constituents_file_path = os.path.join(CONSTIT_DIR, f"{UNIVERSE}_Constit.csv")
    spot_file_path         = os.path.join(SPOT_DIR,     f"{UNIVERSE}_Spot.csv")
    export_xls             = os.path.join(EXPORT_DIR,   f"{UNIVERSE}_Filtered_Net_NAV_and_constituents.xlsx")

    # --- trading costs ---
    STAMP_DUTY = COSTS[UNIVERSE]["STAMP_DUTY"]
    BID_ASK    = COSTS[UNIVERSE]["BID_ASK"]
    FEES       = COSTS[UNIVERSE]["FEES"]
    TRADING_COST_RATE = STAMP_DUTY + BID_ASK + FEES

    # -----------------------------
    # 1) Build constituent schedule
    # -----------------------------
    if not os.path.exists(constituents_file_path):
        print(f"[{UNIVERSE}] Constituents file not found: {constituents_file_path}  → skipping.")
        return

    df = pd.read_csv(constituents_file_path, header=0)
    df.columns = pd.to_datetime(df.columns, format='%d-%b-%y')
    df = pd.melt(df, id_vars=[], var_name='Date', value_name='ISIN')
    df['Date'] = df['Date'] + pd.offsets.MonthEnd(0)
    df['ISIN'] = df['ISIN'].fillna('placeholder')

    # -----------------------------
    # 2) Read Spot (intramonth) rows
    # -----------------------------
    if os.path.exists(spot_file_path):
        spot_df = pd.read_csv(spot_file_path)
        spot_df.columns = [c.strip() for c in spot_df.columns]
        if 'ISIN' not in spot_df.columns or 'Date' not in spot_df.columns:
            raise ValueError(f"{UNIVERSE}_Spot.csv must include 'ISIN' and 'Date' columns.")
        spot_df['Date'] = parse_date_column(spot_df['Date'])
        spot_df = spot_df.dropna(subset=['ISIN', 'Date'])
        spot_df['ISIN'] = spot_df['ISIN'].astype(str).str.strip()
    else:
        spot_df = pd.DataFrame(columns=['ISIN', 'Date'])

    # -----------------------------
    # 3) Read/augment per-ISIN files
    # -----------------------------
    all_isin_data = []

    for isin in df['ISIN'].unique():
        if isin == 'placeholder':
            continue

        csv_file_path = os.path.join(CSV_DIR, f'{isin}.csv')
        if not os.path.exists(csv_file_path):
            continue

        hist_df = pd.read_csv(csv_file_path, header=0, na_values=["-"])
        # Convert historical Excel serial date to datetime (coerces ISO safely)
        hist_df['Date'] = pd.to_datetime(hist_df['Date'], unit='D', origin='1899-12-30', errors='coerce')

        # Append matching spot rows (if any)
        if not spot_df.empty:
            spot_rows = spot_df[spot_df['ISIN'] == isin].copy()
            if not spot_rows.empty:
                if 'ISIN' not in hist_df.columns:
                    hist_df['ISIN'] = isin
                else:
                    hist_df['ISIN'] = hist_df['ISIN'].fillna(isin).astype(str).str.strip()

                spot_rows['Date'] = pd.to_datetime(spot_rows['Date'], errors='coerce')
                spot_rows['ISIN'] = isin

                hist_df = pd.concat([hist_df, spot_rows], ignore_index=True, sort=False)
                hist_df = (
                    hist_df.sort_values('Date')
                           .drop_duplicates(subset=['Date'], keep='last')
                           .reset_index(drop=True)
                )

        # Month-end align (latest within month wins)
        hist_df['Date'] = hist_df['Date'] + pd.offsets.MonthEnd(0)

        # ---- Calculations ----
        isin_data = hist_df.copy()
        isin_data['ISIN'] = isin

        numeric_columns = [
            'Mkt_Cap','RI','P',
            'Rev_LTM','Rev_NTM',
            'EBITDA_LTM_DS','EBITDA_LTM','EBITDA_NTM',
            'EPS_LTM','EPS_NTM',
            'BPS_LTM','BPS_NTM',
            'DPS_LTM','DPS_NTM',
            'CF_LTM_DS','CFPS_LTM','CFPS_NTM',
            'Assets_LTM',
            'NetDebt_LTM_DS','NetDebt_LTM','NetDebt_NTM'
        ]
        for col in numeric_columns:
            if col not in isin_data.columns:
                isin_data[col] = np.nan
        isin_data[numeric_columns] = isin_data[numeric_columns].apply(lambda x: pd.to_numeric(x, errors='coerce'))

        isin_data['RI'] = np.where(isin_data['RI'] == 0, np.nan, isin_data['RI'])
        isin_data['NetDebt_LTM_Best']  = np.where(pd.notnull(isin_data['NetDebt_LTM']),
                                                  isin_data['NetDebt_LTM'],
                                                  isin_data['NetDebt_LTM_DS'])
        isin_data['EBITDA_LTM_Best']   = np.where(pd.notnull(isin_data['EBITDA_LTM']),
                                                  isin_data['EBITDA_LTM'],
                                                  isin_data['EBITDA_LTM_DS'])
        isin_data['EV'] = isin_data['Mkt_Cap'] + np.where(pd.notnull(isin_data['NetDebt_NTM']),
                                                          isin_data['NetDebt_NTM'],
                                                          isin_data['NetDebt_LTM_Best'])

        for c in ['BPS_LTM','BPS_NTM']:
            isin_data[c] = np.where(isin_data[c] < 0, np.nan, isin_data[c])

        isin_data['TR']           = isin_data['RI'] / isin_data['RI'].shift(1) - 1
        isin_data['TR_LTM']       = isin_data['RI'] / isin_data['RI'].shift(12) - 1
        isin_data['EPS_NTM_Chg']  = np.where(isin_data['EPS_NTM'].shift(12) > 0,
                                             isin_data['EPS_NTM'] / isin_data['EPS_NTM'].shift(12) - 1,
                                             np.nan)

        isin_data['NTM_RevGrowth']  = isin_data['Rev_NTM'] / isin_data['Rev_LTM'] - 1
        isin_data['Assets_NTM_Raw'] = (1 + isin_data['NTM_RevGrowth']) * isin_data['Assets_LTM']
        isin_data['Assets_NTM']     = (isin_data[['Assets_NTM_Raw', 'Assets_LTM']].max(axis=1))

        isin_data['RoE_LTM']        = isin_data['EPS_LTM'] / isin_data['BPS_LTM']
        isin_data['RoE_LTM_5yAvg']  = isin_data['RoE_LTM'].rolling(window=60, min_periods=60).median()
        isin_data['RoE_NTM']        = isin_data['EPS_NTM'] / isin_data['BPS_NTM']
        isin_data['RoE_NTM_5yAvg']  = isin_data['RoE_NTM'].rolling(window=60, min_periods=60).median()

        isin_data['Sales_EV_LTM']   = isin_data['Rev_LTM'] / isin_data['EV']
        isin_data['Sales_EV_NTM']   = isin_data['Rev_NTM'] / isin_data['EV']

        isin_data['EBITDA_EV_LTM']  = isin_data['EBITDA_LTM_Best'] / isin_data['EV']
        isin_data['EBITDA_EV_NTM']  = isin_data['EBITDA_NTM'] / isin_data['EV']

        isin_data['EY_LTM']         = isin_data['EPS_LTM'] / isin_data['P']
        isin_data['EY_NTM']         = isin_data['EPS_NTM'] / isin_data['P']

        isin_data['BY_LTM']         = isin_data['BPS_LTM'] / isin_data['P']
        isin_data['BY_NTM']         = isin_data['BPS_NTM'] / isin_data['P']

        isin_data['DY_LTM']         = isin_data['DPS_LTM'] / isin_data['P']
        isin_data['DY_NTM']         = isin_data['DPS_NTM'] / isin_data['P']

        isin_data['CFY_LTM_IBES']   = isin_data['CFPS_LTM'] / isin_data['P']
        isin_data['CFY_LTM_DS']     = isin_data['CF_LTM_DS'] / isin_data['Mkt_Cap']
        isin_data['CFY_LTM']        = np.where(pd.notnull(isin_data['CFY_LTM_IBES']),
                                               isin_data['CFY_LTM_IBES'],
                                               isin_data['CFY_LTM_DS'])
        isin_data['CFY_NTM']        = isin_data['CFPS_NTM'] / isin_data['P']
        isin_data['CF_LTM']         = isin_data['CFY_LTM'] * isin_data['Mkt_Cap']
        isin_data['CF_NTM']         = isin_data['CFY_NTM'] * isin_data['Mkt_Cap']
        isin_data['CF_Assets_LTM']  = isin_data['CF_LTM'] / isin_data['Assets_LTM']
        isin_data['CF_Assets_NTM']  = isin_data['CF_NTM'] / isin_data['Assets_NTM']
        isin_data['CF_Assets_LTM_5yAvg'] = isin_data['CF_Assets_LTM'].rolling(window=60, min_periods=60).median()

        isin_data['EBITDA_Assets_LTM']       = isin_data['EBITDA_LTM_Best'] / isin_data['Assets_LTM']
        isin_data['EBITDA_Assets_NTM']       = isin_data['EBITDA_NTM'] / isin_data['Assets_NTM']
        isin_data['EBITDA_Assets_LTM_5yAvg'] = isin_data['EBITDA_Assets_LTM'].rolling(window=60, min_periods=60).median()

        # Clip yields/rates to plausible bands (not multiples)
        for c, bounds in {
            'Sales_EV_NTM':  {'lower': -0.02, 'upper': 1.0},
            'EBITDA_EV_NTM': {'lower': -0.02, 'upper': 1.0},
            'EY_LTM':        {'lower': -0.02, 'upper': 1.0},
            'EY_NTM':        {'lower': -0.02, 'upper': 1.0},
            'RoE_LTM':       {'lower': -0.5,  'upper': 1.5},
            'RoE_NTM':       {'lower': -0.5,  'upper': 1.5},
        }.items():
            isin_data[c] = isin_data[c].clip(**bounds)

        # Start-of-month versions
        for c in [
            'Mkt_Cap', 'EV',
            'TR_LTM','EPS_NTM_Chg',
            'EBITDA_Assets_LTM','CF_Assets_LTM','RoE_LTM',
            'EBITDA_Assets_NTM','CF_Assets_NTM','RoE_NTM',
            'EBITDA_Assets_LTM_5yAvg','CF_Assets_LTM_5yAvg','RoE_LTM_5yAvg',
            'EBITDA_EV_LTM','Sales_EV_LTM','CFY_LTM','EY_LTM','DY_LTM',
            'EBITDA_EV_NTM','Sales_EV_NTM','CFY_NTM','EY_NTM','DY_NTM'
        ]:
            isin_data[c + '_SOM'] = isin_data[c].shift(1)

        all_isin_data.append(isin_data)

    # -----------------------------
    # 4) Merge all ISIN data
    # -----------------------------
    if not all_isin_data:
        print(f"[{UNIVERSE}] No per-ISIN CSVs found in {CSV_DIR}  → skipping.")
        return

    isin_final_df = pd.concat(all_isin_data, ignore_index=True)
    df = pd.merge(df, isin_final_df, on=['Date','ISIN'])

    # -----------------------------
    # 5) Medians & filters
    # -----------------------------
    for col in ['EBITDA_Assets_LTM_5yAvg_SOM', 'CF_Assets_LTM_5yAvg_SOM', 'RoE_LTM_5yAvg_SOM']:
        df[f"{col}_Median"] = df.groupby('Date')[col].transform('median')

    df['EBITDA_Assets_LTM_SOM_+'] = np.where(df['EBITDA_Assets_LTM_5yAvg_SOM'] > df['EBITDA_Assets_LTM_5yAvg_SOM_Median'], df['EBITDA_Assets_LTM_SOM'], np.nan)
    df['CF_Assets_LTM_SOM_+']     = np.where(df['CF_Assets_LTM_5yAvg_SOM'] > df['CF_Assets_LTM_5yAvg_SOM_Median'],     df['CF_Assets_LTM_SOM'],     np.nan)
    df['RoE_LTM_SOM_+']           = np.where(df['RoE_LTM_5yAvg_SOM'] > df['RoE_LTM_5yAvg_SOM_Median'],                 df['RoE_LTM_SOM'],           np.nan)

    df['EBITDA_Assets_NTM_SOM_+'] = np.where(df['EBITDA_Assets_LTM_5yAvg_SOM'] > df['EBITDA_Assets_LTM_5yAvg_SOM_Median'], df['EBITDA_Assets_NTM_SOM'], np.nan)
    df['CF_Assets_NTM_SOM_+']     = np.where(df['CF_Assets_LTM_5yAvg_SOM'] > df['CF_Assets_LTM_5yAvg_SOM_Median'],     df['CF_Assets_NTM_SOM'],     np.nan)
    df['RoE_NTM_SOM_+']           = np.where(df['RoE_LTM_5yAvg_SOM'] > df['RoE_LTM_5yAvg_SOM_Median'],                 df['RoE_NTM_SOM'],           np.nan)

    # -----------------------------
    # 6) Rank columns
    # -----------------------------
    cols_to_rank = [
        'TR_LTM_SOM', 'EPS_NTM_Chg_SOM',
        'EBITDA_Assets_LTM_SOM_+','CF_Assets_LTM_SOM_+','RoE_LTM_SOM_+',
        'EBITDA_Assets_NTM_SOM_+','CF_Assets_NTM_SOM_+','RoE_NTM_SOM_+',
        'Sales_EV_LTM_SOM','EBITDA_EV_LTM_SOM','EY_LTM_SOM','CFY_LTM_SOM','DY_LTM_SOM',
        'Sales_EV_NTM_SOM','EBITDA_EV_NTM_SOM','EY_NTM_SOM','CFY_NTM_SOM','DY_NTM_SOM'
    ]
    for col in cols_to_rank:
        df[f"{col}_Rank"] = df.groupby('Date')[col].rank(ascending=False, method='min')

    # Enforce numeric dtype for freshly created rank columns
    rank_cols_made = [f"{c}_Rank" for c in cols_to_rank]
    df[rank_cols_made] = df[rank_cols_made].apply(pd.to_numeric, errors='coerce')

    # -----------------------------
    # 7) Composites (quality / value / CoE / MTUM)
    # -----------------------------
    quality_plus_LTM_cols = ['EBITDA_Assets_LTM_SOM_+_Rank','CF_Assets_LTM_SOM_+_Rank','RoE_LTM_SOM_+_Rank']
    n = len(quality_plus_LTM_cols)
    df['quality_+_LTM_Rank'] = df[quality_plus_LTM_cols].apply(lambda r: np.nan if r.count() < (n-1) else r.nsmallest(n-1).mean(), axis=1)

    quality_plus_NTM_cols = ['EBITDA_Assets_NTM_SOM_+_Rank','CF_Assets_NTM_SOM_+_Rank','RoE_NTM_SOM_+_Rank']
    n = len(quality_plus_NTM_cols)
    df['quality_+_NTM_Rank'] = df[quality_plus_NTM_cols].apply(lambda r: np.nan if r.count() < (n-1) else r.nsmallest(n-1).mean(), axis=1)

    value_LTM_cols = ['Sales_EV_LTM_SOM_Rank','EBITDA_EV_LTM_SOM_Rank','EY_LTM_SOM_Rank','CFY_LTM_SOM_Rank','DY_LTM_SOM_Rank']
    n = len(value_LTM_cols)
    df['value_+_LTM_Rank'] = df[value_LTM_cols].apply(lambda r: r.nsmallest(n-2).mean() if r.count() >= (n-2) else np.nan, axis=1)

    value_NTM_cols = ['Sales_EV_NTM_SOM_Rank','EBITDA_EV_NTM_SOM_Rank','EY_NTM_SOM_Rank','CFY_NTM_SOM_Rank','DY_NTM_SOM_Rank']
    n = len(value_NTM_cols)
    df['value_+_NTM_Rank'] = df[value_NTM_cols].apply(lambda r: r.nsmallest(n-2).mean() if r.count() >= (n-2) else np.nan, axis=1)

    df['CoE_+_LTM_score'] = df['quality_+_LTM_Rank'] + df['value_+_LTM_Rank']
    df['CoE_+_LTM_Rank']  = df.groupby('Date')['CoE_+_LTM_score'].rank(ascending=True, method='min')
    df['CoE_+_NTM_score'] = df['quality_+_NTM_Rank'] + df['value_+_NTM_Rank']
    df['CoE_+_NTM_Rank']  = df.groupby('Date')['CoE_+_NTM_score'].rank(ascending=True, method='min')

    df['quality_+_LTM_TR_MTUM_score'] = df['quality_+_LTM_Rank'] + df['TR_LTM_SOM_Rank']
    df['quality_+_LTM_TR_MTUM']       = df.groupby('Date')['quality_+_LTM_TR_MTUM_score'].rank(ascending=True, method='min')
    df['quality_+_NTM_TR_MTUM_score'] = df['quality_+_NTM_Rank'] + df['TR_LTM_SOM_Rank']
    df['quality_+_NTM_TR_MTUM']       = df.groupby('Date')['quality_+_NTM_TR_MTUM_score'].rank(ascending=True, method='min')

    df['quality_+_LTM_EPS_MTUM_score'] = df['quality_+_LTM_Rank'] + df['EPS_NTM_Chg_SOM_Rank']
    df['quality_+_LTM_EPS_MTUM']       = df.groupby('Date')['quality_+_LTM_EPS_MTUM_score'].rank(ascending=True, method='min')
    df['quality_+_NTM_EPS_MTUM_score'] = df['quality_+_NTM_Rank'] + df['EPS_NTM_Chg_SOM_Rank']
    df['quality_+_NTM_EPS_MTUM']       = df.groupby('Date')['quality_+_NTM_EPS_MTUM_score'].rank(ascending=True, method='min')

    df['value_+_LTM_TR_MTUM_score'] = df['value_+_LTM_Rank'] + df['TR_LTM_SOM_Rank']
    df['value_+_LTM_TR_MTUM']       = df.groupby('Date')['value_+_LTM_TR_MTUM_score'].rank(ascending=True, method='min')
    df['value_+_NTM_TR_MTUM_score'] = df['value_+_NTM_Rank'] + df['TR_LTM_SOM_Rank']
    df['value_+_NTM_TR_MTUM']       = df.groupby('Date')['value_+_NTM_TR_MTUM_score'].rank(ascending=True, method='min')

    df['value_+_LTM_EPS_MTUM_score'] = df['value_+_LTM_Rank'] + df['EPS_NTM_Chg_SOM_Rank']
    df['value_+_LTM_EPS_MTUM']       = df.groupby('Date')['value_+_LTM_EPS_MTUM_score'].rank(ascending=True, method='min')
    df['value_+_NTM_EPS_MTUM_score'] = df['value_+_NTM_Rank'] + df['EPS_NTM_Chg_SOM_Rank']
    df['value_+_NTM_EPS_MTUM']       = df.groupby('Date')['value_+_NTM_EPS_MTUM_score'].rank(ascending=True, method='min')

    df['CoE_+_LTM_TR_MTUM_score'] = df['CoE_+_LTM_Rank'] + df['TR_LTM_SOM_Rank']
    df['CoE_+_LTM_TR_MTUM']       = df.groupby('Date')['CoE_+_LTM_TR_MTUM_score'].rank(ascending=True, method='min')
    df['CoE_+_NTM_TR_MTUM_score'] = df['CoE_+_NTM_Rank'] + df['TR_LTM_SOM_Rank']
    df['CoE_+_NTM_TR_MTUM']       = df.groupby('Date')['CoE_+_NTM_TR_MTUM_score'].rank(ascending=True, method='min')

    df['CoE_+_LTM_EPS_MTUM_score'] = df['CoE_+_LTM_Rank'] + df['EPS_NTM_Chg_SOM_Rank']
    df['CoE_+_LTM_EPS_MTUM']       = df.groupby('Date')['CoE_+_LTM_EPS_MTUM_score'].rank(ascending=True, method='min')
    df['CoE_+_NTM_EPS_MTUM_score'] = df['CoE_+_NTM_Rank'] + df['EPS_NTM_Chg_SOM_Rank']
    df['CoE_+_NTM_EPS_MTUM']       = df.groupby('Date')['CoE_+_NTM_EPS_MTUM_score'].rank(ascending=True, method='min')

    df['CoE_TR_LTM_score'] = df['quality_+_LTM_TR_MTUM'] + df['value_+_LTM_TR_MTUM']
    df['CoE_TR_LTM_Rank']  = df.groupby('Date')['CoE_TR_LTM_score'].rank(ascending=True, method='min')
    df['CoE_TR_NTM_score'] = df['quality_+_NTM_TR_MTUM'] + df['value_+_NTM_TR_MTUM']
    df['CoE_TR_NTM_Rank']  = df.groupby('Date')['CoE_TR_NTM_score'].rank(ascending=True, method='min')

    df['CoE_EPS_LTM_score'] = df['quality_+_LTM_EPS_MTUM'] + df['value_+_LTM_EPS_MTUM']
    df['CoE_EPS_LTM_Rank']  = df.groupby('Date')['CoE_EPS_LTM_score'].rank(ascending=True, method='min')
    df['CoE_EPS_NTM_score'] = df['quality_+_NTM_EPS_MTUM'] + df['value_+_NTM_EPS_MTUM']
    df['CoE_EPS_NTM_Rank']  = df.groupby('Date')['CoE_EPS_NTM_score'].rank(ascending=True, method='min')

    df['EPS_TR_MTUM_score'] = df['EPS_NTM_Chg_SOM_Rank'] + df['TR_LTM_SOM_Rank']
    df['EPS_TR_MTUM']       = df.groupby('Date')['EPS_TR_MTUM_score'].rank(ascending=True, method='min')

    # -----------------------------
    # 8) Portfolio building blocks
    # -----------------------------
    clean = (
        df.copy()
          .assign(Date=lambda x: pd.to_datetime(x['Date']))
          .sort_values(['Date', 'ISIN'])
          .dropna(subset=['TR', 'Mkt_Cap_SOM'])
          .reset_index(drop=True)
    )

    def index_monthly(frame):
        m = frame.copy()
        m['Cap_x_TR'] = m['TR'] * m['Mkt_Cap_SOM']
        grp  = m.groupby(pd.Grouper(key='Date', freq='ME'))
        sums = grp.agg(weighted_TR=("Cap_x_TR", "sum"),
                       mkt_sum     =("Mkt_Cap_SOM", "sum"))
        out = (sums['weighted_TR'] / sums['mkt_sum']).rename('Index_TR').to_frame()
        out['Index_Net_NAV'] = (1 + out['Index_TR']).cumprod()
        return out.reset_index()

    def index_monthly(frame):
        """
        Builds the index time series:
          - Index_TR: cap-weighted average of monthly TR using Mkt_Cap_SOM
          - Index_Net_NAV: cumprod(1 + Index_TR)
          - IndexEqWgt_TR: equal-weight average of monthly TR across ISINs
          - IndexEqWgt_Net_NAV: cumprod(1 + IndexEqWgt_TR)

        Columns are ordered so the equal-weight series sit immediately to the right
        of Index_TR and Index_Net_NAV in the Excel output.
        """
        m = frame.copy()

        # Cap-weighted monthly return
        m['Cap_x_TR'] = m['TR'] * m['Mkt_Cap_SOM']
        grp = m.groupby(pd.Grouper(key='Date', freq='ME'))

        sums = grp.agg(
            weighted_TR=("Cap_x_TR", "sum"),
            mkt_sum     =("Mkt_Cap_SOM", "sum")
        )
        cap_tr = (sums['weighted_TR'] / sums['mkt_sum']).rename('Index_TR')

        # Equal-weight monthly return (simple average of TR across names)
        eq_tr = grp['TR'].mean().rename('IndexEqWgt_TR')

        # Assemble and compute NAVs
        out = pd.concat([cap_tr, eq_tr], axis=1).reset_index()
        out['Index_Net_NAV']       = (1 + out['Index_TR']).cumprod()
        out['IndexEqWgt_Net_NAV']  = (1 + out['IndexEqWgt_TR']).cumprod()

        # Column order so they appear adjacent in the XLS
        return out[['Date', 'Index_TR', 'IndexEqWgt_TR', 'Index_Net_NAV', 'IndexEqWgt_Net_NAV']]

    # ---------- FIXED to coerce rank columns numeric before nsmallest/nlargest ----------
    def ranked_factor_series(frame,
                             *,
                             rank_col: str,
                             label:    str,
                             keep_n:   int = keep_n,
                             screen_n: int = screen_n,
                             ret_col:  str = 'TR',
                             ascending: bool = True):
        """
        Builds monthly factor portfolio from a PRE-RANKED column.
        Returns: nav_df (Date | <label>_NAV | <label>_Net_NAV),
                 members_df (Date | Factor | ISIN | Weight)
        """
        tmp = frame.copy()
        # Ensure numeric before any sorting/selection
        tmp[rank_col] = pd.to_numeric(tmp[rank_col], errors='coerce')
        tmp[ret_col]  = pd.to_numeric(tmp[ret_col],  errors='coerce')
        tmp = tmp.dropna(subset=[rank_col, ret_col])

        tmp['Month'] = tmp['Date'].dt.to_period('M')

        def pick_block(block: pd.DataFrame) -> pd.DataFrame:
            b = block[['ISIN', rank_col, ret_col]].copy()
            b[rank_col] = pd.to_numeric(b[rank_col], errors='coerce')
            b[ret_col]  = pd.to_numeric(b[ret_col],  errors='coerce')
            b = b.dropna(subset=[rank_col, ret_col])
            if b.empty:
                return b[['ISIN', ret_col]]
            return (b.nsmallest(screen_n, rank_col) if ascending else b.nlargest(screen_n, rank_col))[['ISIN', ret_col]]

        top_by_month = {m: pick_block(block) for m, block in tmp.groupby('Month')}

        rows_nav, rows_members = [], []
        prev = set()
        for m in sorted(top_by_month):
            cand = top_by_month[m]
            if cand.empty:
                continue
            names  = list(cand['ISIN'])
            keep_b = [n for n in names if n in prev]
            add_b  = [n for n in names if n not in prev]
            basket = (keep_b + add_b)[:keep_n]
            gross_tr = cand.set_index('ISIN').loc[basket, ret_col].mean()
            churn    = 1 - (len(keep_b) / keep_n) if prev else 1.0
            net_tr   = gross_tr - TRADING_COST_RATE * churn
            rows_nav.append({'Date': m.to_timestamp('M'), 'Gross_TR': gross_tr, 'Net_TR': net_tr})
            w = 1.0 / keep_n if keep_n else np.nan
            for s in basket:
                rows_members.append({'Date': m.to_timestamp('M'), 'Factor': label, 'ISIN': s, 'Weight': w})
            prev = set(basket)
        nav = pd.DataFrame(rows_nav).sort_values('Date')
        if not nav.empty:
            nav[f'{label}_NAV']     = (1 + nav['Gross_TR']).cumprod()
            nav[f'{label}_Net_NAV'] = (1 + nav['Net_TR']).cumprod()
            nav = nav[['Date', f'{label}_NAV', f'{label}_Net_NAV']]
        else:
            nav = pd.DataFrame(columns=['Date', f'{label}_NAV', f'{label}_Net_NAV'])
        return nav, pd.DataFrame(rows_members)

    rank_cols_ready = [
        # quality
        'quality_+_LTM_Rank','quality_+_NTM_Rank','quality_+_LTM_TR_MTUM','quality_+_NTM_TR_MTUM','quality_+_LTM_EPS_MTUM','quality_+_NTM_EPS_MTUM',
        # value
        'value_+_LTM_Rank','value_+_NTM_Rank','value_+_LTM_TR_MTUM','value_+_NTM_TR_MTUM','value_+_LTM_EPS_MTUM','value_+_NTM_EPS_MTUM',
        # CoE
        'CoE_+_LTM_Rank','CoE_+_NTM_Rank','CoE_+_LTM_TR_MTUM','CoE_+_NTM_TR_MTUM','CoE_+_LTM_EPS_MTUM','CoE_+_NTM_EPS_MTUM',
        # Other
        'CoE_TR_LTM_Rank','CoE_TR_NTM_Rank','CoE_EPS_LTM_Rank','CoE_EPS_NTM_Rank','TR_LTM_SOM_Rank','EPS_NTM_Chg_SOM_Rank','EPS_TR_MTUM'
    ]

    nav_frames, members_frames = [], []
    for col in rank_cols_ready:
        label = col.replace('_Rank','')
        nav_df, members_df = ranked_factor_series(clean, rank_col=col, label=label, ascending=True)
        nav_frames.append(nav_df)
        members_frames.append(members_df)

    factors_df  = reduce(lambda l, r: l.merge(r, on='Date', how='left'), nav_frames)
    members_all = pd.concat(members_frames, ignore_index=True)  # Date | Factor | ISIN | Weight

    # Index + factors
    index_df  = index_monthly(clean)
    output_df = index_df.merge(factors_df, on='Date', how='left')

    # -----------------------------
    # 9) Optimiser: pick top fraction by 12m Net_NAV
    # -----------------------------
    net_nav_cols = [
        c for c in output_df.columns
        if c.endswith('Net_NAV') and c not in {'Index_Net_NAV','IndexEqWgt_Net_NAV'} ## strips out index measures from optimiser
    ]
    for col in net_nav_cols:
        output_df[f'{col}_1m_return']  = output_df[col] / output_df[col].shift(1) - 1
        output_df[f'{col}_12m_return'] = output_df[col].shift(1) / output_df[col].shift(13) - 1 # set this for the optimiser lookback period

    def pick_top_fraction_factors(row):
        twelve = row[[f'{c}_12m_return' for c in net_nav_cols]].dropna()
        if twelve.empty:
            return []
        k = compute_top_k(len(twelve))  # uses FACTOR_TOP knob
        chosen_idx = twelve.sort_values(ascending=False).index[:k]
        return list(chosen_idx.str.replace('_12m_return', '', regex=False))

    output_df['Selected_Factor_Cols'] = output_df.apply(pick_top_fraction_factors, axis=1)

    def calc_filtered_net_tr(row):
        chosen = row['Selected_Factor_Cols']
        if not chosen:
            return np.nan
        one_m_cols = [f'{c}_1m_return' for c in chosen]
        return row[one_m_cols].mean()

    output_df['Filtered_Net_TR']      = output_df.apply(calc_filtered_net_tr, axis=1)
    output_df['Filtered_Net_NAV']     = (1 + output_df['Filtered_Net_TR']).cumprod()
    output_df['Filtered_Net_NAV_Rel'] = output_df['Filtered_Net_NAV'] / output_df['Index_Net_NAV']

    # -----------------------------
    # 10) Composite weights (SUM across factors) + turnover
    # -----------------------------
    sel = (
        output_df[['Date', 'Selected_Factor_Cols']]
          .dropna(subset=['Selected_Factor_Cols'])
          .explode('Selected_Factor_Cols')
          .rename(columns={'Selected_Factor_Cols':'FactorCol'})
    )

    # NaN-safe mapping of factor columns -> factor labels used in members_all
    def col_to_factor_label(col):
        if not isinstance(col, str):
            return None
        if col.endswith('_Net_NAV'):
            return col[:-len('_Net_NAV')]
        if col.endswith('_NAV'):
            return col[:-len('_NAV')]
        return col

    sel['Factor'] = sel['FactorCol'].map(col_to_factor_label)
    sel = sel.dropna(subset=['Factor'])

    # Join to members to get constituents for the selected factors
    selected_members = (
        sel.merge(members_all, on=['Date','Factor'], how='left')
          .dropna(subset=['ISIN'])
    )

    if selected_members.empty:
        comp_weights = pd.DataFrame(columns=['Date','ISIN','Composite_Weight','Included_In_Factors'])
        portfolio_view = comp_weights.copy()
        portfolio_view['Weight_%'] = []
        # Turnover scaffold
        ts_dates = output_df[['Date']].drop_duplicates().sort_values('Date')
        turnover = ts_dates.copy()
        turnover['Gross_Turnover_%'] = 0.0
        turnover = turnover.merge(output_df[['Date','Filtered_Net_NAV']].drop_duplicates(), on='Date', how='left')
        turnover['Turnover_$'] = turnover['Gross_Turnover_%'] * turnover['Filtered_Net_NAV'].ffill().fillna(1.0)
        turnover = turnover.drop(columns=['Filtered_Net_NAV']).set_index('Date')
    else:
        # SUM weights across selected factors (each factor contributes 1/KEEP_N per held stock)
        comp_weights_raw = (
            selected_members
              .groupby(['Date','ISIN'], as_index=False)
              .agg(
                  SumWeight=('Weight','sum'),                   # proportional to #selected factors holding the ISIN
                  Included_In_Factors=('Factor','nunique')
              )
        )

        # Renormalize to sum to 1 within each month
        comp_weights = (
            comp_weights_raw
              .merge(
                  comp_weights_raw.groupby('Date', as_index=False)['SumWeight'].sum()
                                  .rename(columns={'SumWeight':'_SumW'}),
                  on='Date', how='left'
              )
        )
        comp_weights['Composite_Weight'] = comp_weights['SumWeight'] / comp_weights['_SumW']
        comp_weights = (comp_weights
                        .drop(columns=['_SumW'])
                        .sort_values(['Date','ISIN'])
                        .reset_index(drop=True))

        # Export view
        portfolio_view = (
            comp_weights[['Date','ISIN','Composite_Weight']]
              .sort_values(['Date','ISIN'])
              .assign(**{'Weight_%': lambda x: x['Composite_Weight'] * 100.0})
        )

        # Turnover (gross, from weight changes)
        prev_w = comp_weights.pivot(index='Date', columns='ISIN', values='Composite_Weight').shift(1).fillna(0.0)
        curr_w = comp_weights.pivot(index='Date', columns='ISIN', values='Composite_Weight').fillna(0.0)
        prev_w, curr_w = prev_w.align(curr_w, join='outer', axis=None)
        prev_w = prev_w.fillna(0.0); curr_w = curr_w.fillna(0.0)
        trade_w = curr_w - prev_w

        nav_series = output_df.set_index('Date')['Filtered_Net_NAV'].reindex(trade_w.index).ffill().fillna(1.0)
        turnover = trade_w.abs().sum(axis=1).rename('Gross_Turnover_%').to_frame()
        turnover['Turnover_$'] = turnover['Gross_Turnover_%'] * nav_series.values

    # -----------------------------
    # 10b) LIVE portfolio using the latest available data (build to trade next month)
    # -----------------------------
    # Determine the latest coherent month where all ingredients exist
    signals_date = pd.to_datetime(df['Date']).max()                  # last month with signals
    nav_date     = pd.to_datetime(output_df['Date']).max()           # last month with factor NAVs
    today_live   = min(d for d in [signals_date, nav_date] if pd.notnull(d)) if (pd.notnull(signals_date) and pd.notnull(nav_date)) else pd.NaT

    members_date = pd.to_datetime(members_all['Date']).max() if not members_all.empty else pd.NaT
    prev_date    = members_date if pd.notnull(members_date) else today_live

    live_selected_factors = pd.DataFrame(columns=['Date','Factor','T12M_Net_Return'])
    live_factor_baskets   = pd.DataFrame(columns=['Date','Factor','ISIN','Weight'])
    live_portfolio        = pd.DataFrame(columns=['Date','ISIN','Composite_Weight','Included_In_Factors','Weight_%'])

    if pd.notnull(today_live):
        TODAY = today_live

        # (1) NOW frame at TODAY (non-SOM signals)
        now_df = (
            df[df['Date'] == TODAY]
              .copy()
              .dropna(subset=['ISIN'])
        )

        if not now_df.empty:
            # medians for gating (non-SOM)
            for col in ['EBITDA_Assets_LTM_5yAvg','CF_Assets_LTM_5yAvg','RoE_LTM_5yAvg']:
                now_df[f"{col}_Median_NOW"] = now_df[col].median()

            # gated + versions
            now_df['EBITDA_Assets_LTM_+'] = np.where(now_df['EBITDA_Assets_LTM_5yAvg'] > now_df['EBITDA_Assets_LTM_5yAvg_Median_NOW'], now_df['EBITDA_Assets_LTM'], np.nan)
            now_df['CF_Assets_LTM_+']     = np.where(now_df['CF_Assets_LTM_5yAvg']     > now_df['CF_Assets_LTM_5yAvg_Median_NOW'],     now_df['CF_Assets_LTM'],     np.nan)
            now_df['RoE_LTM_+']           = np.where(now_df['RoE_LTM_5yAvg']           > now_df['RoE_LTM_5yAvg_Median_NOW'],           now_df['RoE_LTM'],           np.nan)

            now_df['EBITDA_Assets_NTM_+'] = np.where(now_df['EBITDA_Assets_LTM_5yAvg'] > now_df['EBITDA_Assets_LTM_5yAvg_Median_NOW'], now_df['EBITDA_Assets_NTM'], np.nan)
            now_df['CF_Assets_NTM_+']     = np.where(now_df['CF_Assets_LTM_5yAvg']     > now_df['CF_Assets_LTM_5yAvg_Median_NOW'],     now_df['CF_Assets_NTM'],     np.nan)
            now_df['RoE_NTM_+']           = np.where(now_df['RoE_LTM_5yAvg']           > now_df['RoE_LTM_5yAvg_Median_NOW'],           now_df['RoE_NTM'],           np.nan)

            # base NOW ranks for legs (no SOM)
            base_now_cols = {
                'TR_LTM':             'TR_LTM',
                'EPS_NTM_Chg':        'EPS_NTM_Chg',
                'EBITDA_Assets_LTM_+':'EBITDA_Assets_LTM_+',
                'CF_Assets_LTM_+':    'CF_Assets_LTM_+',
                'RoE_LTM_+':          'RoE_LTM_+',
                'EBITDA_Assets_NTM_+':'EBITDA_Assets_NTM_+',
                'CF_Assets_NTM_+':    'CF_Assets_NTM_+',
                'RoE_NTM_+':          'RoE_NTM_+',
                'Sales_EV_LTM':       'Sales_EV_LTM',
                'EBITDA_EV_LTM':      'EBITDA_EV_LTM',
                'EY_LTM':             'EY_LTM',
                'CFY_LTM':            'CFY_LTM',
                'DY_LTM':             'DY_LTM',
                'Sales_EV_NTM':       'Sales_EV_NTM',
                'EBITDA_EV_NTM':      'EBITDA_EV_NTM',
                'EY_NTM':             'EY_NTM',
                'CFY_NTM':            'CFY_NTM',
                'DY_NTM':             'DY_NTM',
            }
            for lbl, col in base_now_cols.items():
                # Coerce base columns to numeric before ranking to avoid object dtype
                now_df[col] = pd.to_numeric(now_df[col], errors='coerce')
                now_df[f'{lbl}_NOW_Rank'] = now_df[col].rank(ascending=False, method='min')

            # robust helper
            def robust_avg(row, cols, allow_k_drop=1):
                n = len(cols)
                # Coerce to numeric row-wise to avoid object dtype issues
                vals = pd.to_numeric(row[cols], errors='coerce').dropna()
                # If not enough valid inputs after coercion, return NaN
                if len(vals) < max(1, n - allow_k_drop):
                    return np.nan
                k = max(1, n - allow_k_drop)
                return vals.nsmallest(k).mean()

            # composites NOW (mirror backtest logic) -> produce *_NOW_Rank directly
            now_df['quality_+_LTM_NOW_Rank'] = now_df.apply(
                lambda r: robust_avg(r, ['EBITDA_Assets_LTM_+_NOW_Rank','CF_Assets_LTM_+_NOW_Rank','RoE_LTM_+_NOW_Rank'], allow_k_drop=1), axis=1
            )
            now_df['quality_+_NTM_NOW_Rank'] = now_df.apply(
                lambda r: robust_avg(r, ['EBITDA_Assets_NTM_+_NOW_Rank','CF_Assets_NTM_+_NOW_Rank','RoE_NTM_+_NOW_Rank'], allow_k_drop=1), axis=1
            )

            # value (best n-2 of 5)
            value_ltm_now_cols = ['Sales_EV_LTM_NOW_Rank','EBITDA_EV_LTM_NOW_Rank','EY_LTM_NOW_Rank','CFY_LTM_NOW_Rank','DY_LTM_NOW_Rank']
            value_ntm_now_cols = ['Sales_EV_NTM_NOW_Rank','EBITDA_EV_NTM_NOW_Rank','EY_NTM_NOW_Rank','CFY_NTM_NOW_Rank','DY_NTM_NOW_Rank']
            now_df['value_+_LTM_NOW_Rank'] = now_df.apply(lambda r: robust_avg(r, value_ltm_now_cols, allow_k_drop=2), axis=1)
            now_df['value_+_NTM_NOW_Rank'] = now_df.apply(lambda r: robust_avg(r, value_ntm_now_cols, allow_k_drop=2), axis=1)

            # momentum blends & CoE
            now_df['quality_+_LTM_TR_MTUM_NOW_Rank'] = (now_df['quality_+_LTM_NOW_Rank'] + now_df['TR_LTM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['quality_+_NTM_TR_MTUM_NOW_Rank'] = (now_df['quality_+_NTM_NOW_Rank'] + now_df['TR_LTM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['quality_+_LTM_EPS_MTUM_NOW_Rank'] = (now_df['quality_+_LTM_NOW_Rank'] + now_df['EPS_NTM_Chg_NOW_Rank']).rank(ascending=True, method='min')
            now_df['quality_+_NTM_EPS_MTUM_NOW_Rank'] = (now_df['quality_+_NTM_NOW_Rank'] + now_df['EPS_NTM_Chg_NOW_Rank']).rank(ascending=True, method='min')

            now_df['value_+_LTM_TR_MTUM_NOW_Rank'] = (now_df['value_+_LTM_NOW_Rank'] + now_df['TR_LTM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['value_+_NTM_TR_MTUM_NOW_Rank'] = (now_df['value_+_NTM_NOW_Rank'] + now_df['TR_LTM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['value_+_LTM_EPS_MTUM_NOW_Rank'] = (now_df['value_+_LTM_NOW_Rank'] + now_df['EPS_NTM_Chg_NOW_Rank']).rank(ascending=True, method='min')
            now_df['value_+_NTM_EPS_MTUM_NOW_Rank'] = (now_df['value_+_NTM_NOW_Rank'] + now_df['EPS_NTM_Chg_NOW_Rank']).rank(ascending=True, method='min')

            now_df['CoE_+_LTM_NOW_Rank'] = (now_df['quality_+_LTM_NOW_Rank'] + now_df['value_+_LTM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['CoE_+_NTM_NOW_Rank'] = (now_df['quality_+_NTM_NOW_Rank'] + now_df['value_+_NTM_NOW_Rank']).rank(ascending=True, method='min')

            now_df['CoE_+_LTM_TR_MTUM_NOW_Rank'] = (now_df['CoE_+_LTM_NOW_Rank'] + now_df['TR_LTM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['CoE_+_NTM_TR_MTUM_NOW_Rank'] = (now_df['CoE_+_NTM_NOW_Rank'] + now_df['TR_LTM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['CoE_+_LTM_EPS_MTUM_NOW_Rank'] = (now_df['CoE_+_LTM_NOW_Rank'] + now_df['EPS_NTM_Chg_NOW_Rank']).rank(ascending=True, method='min')
            now_df['CoE_+_NTM_EPS_MTUM_NOW_Rank'] = (now_df['CoE_+_NTM_NOW_Rank'] + now_df['EPS_NTM_Chg_NOW_Rank']).rank(ascending=True, method='min')

            now_df['CoE_TR_LTM_NOW_Rank']  = (now_df['quality_+_LTM_TR_MTUM_NOW_Rank'] + now_df['value_+_LTM_TR_MTUM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['CoE_TR_NTM_NOW_Rank']  = (now_df['quality_+_NTM_TR_MTUM_NOW_Rank'] + now_df['value_+_NTM_TR_MTUM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['CoE_EPS_LTM_NOW_Rank'] = (now_df['quality_+_LTM_EPS_MTUM_NOW_Rank'] + now_df['value_+_LTM_EPS_MTUM_NOW_Rank']).rank(ascending=True, method='min')
            now_df['CoE_EPS_NTM_NOW_Rank'] = (now_df['quality_+_NTM_EPS_MTUM_NOW_Rank'] + now_df['value_+_NTM_EPS_MTUM_NOW_Rank']).rank(ascending=True, method='min')

            now_df['EPS_TR_MTUM_NOW_Rank'] = (now_df['EPS_NTM_Chg_NOW_Rank'] + now_df['TR_LTM_NOW_Rank']).rank(ascending=True, method='min')

            # ---- (2) LIVE factor selection: top fraction by 12m Net_NAV through TODAY ----
            last_row = output_df[output_df['Date'] == TODAY]
            live_date_for_selection = TODAY if not last_row.empty else output_df['Date'].max()

            live_sel_row = output_df.set_index('Date').sort_index()
            live_sel = {}
            for col in net_nav_cols:
                nav_series = live_sel_row[col]
                if live_date_for_selection in nav_series.index and pd.notnull(nav_series.loc[live_date_for_selection]) and pd.notnull(nav_series.shift(12).loc[live_date_for_selection]):
                    live_sel[col] = nav_series.loc[live_date_for_selection] / nav_series.shift(12).loc[live_date_for_selection] - 1
            live_sel = pd.Series(live_sel).dropna().sort_values(ascending=False)

            k = compute_top_k(len(live_sel))  # knob-driven
            chosen_factor_cols = list(live_sel.index[:k])  # e.g., 'quality_+_LTM_TR_MTUM_Net_NAV'
            chosen_factors = [c.replace('_Net_NAV','') for c in chosen_factor_cols]

            live_selected_factors = pd.DataFrame({
                'Date': [live_date_for_selection]*len(chosen_factors),
                'Factor': chosen_factors,
                'T12M_Net_Return': [live_sel.get(f'{f}_Net_NAV', np.nan) for f in chosen_factors]
            }).sort_values('T12M_Net_Return', ascending=False)

            # ---- (3) Build next-month baskets using NOW ranks + previous month basket as keep-set ----
            live_baskets = []
            w_eq = 1.0 / keep_n if keep_n else np.nan

            for factor in chosen_factors:
                col_now_rank = f'{factor}_NOW_Rank'
                if col_now_rank not in now_df.columns:
                    continue

                # Coerce NOW rank column to numeric before nsmallest
                candidates = (
                    now_df[['ISIN', col_now_rank]]
                      .assign(**{col_now_rank: lambda x: pd.to_numeric(x[col_now_rank], errors='coerce')})
                      .dropna(subset=[col_now_rank])
                      .nsmallest(screen_n, col_now_rank)
                )
                cand_isins = list(candidates['ISIN'])

                prev_rows = members_all[(members_all['Date'] == prev_date) & (members_all['Factor'] == factor)]
                prev_set = set(prev_rows['ISIN']) if not prev_rows.empty else set()

                keep_bucket = [s for s in cand_isins if s in prev_set]
                add_bucket  = [s for s in cand_isins if s not in prev_set]
                basket = (keep_bucket + add_bucket)[:keep_n]

                for s in basket:
                    live_baskets.append({'Date': TODAY, 'Factor': factor, 'ISIN': s, 'Weight': w_eq})

            live_factor_baskets = pd.DataFrame(live_baskets).sort_values(['Factor','ISIN'])

            # ---- (4) Aggregate across factors to composite LIVE portfolio ----
            if not live_factor_baskets.empty:
                comp_raw = (
                    live_factor_baskets
                        .groupby(['Date','ISIN'], as_index=False)
                        .agg(SumWeight=('Weight','sum'), Included_In_Factors=('Factor','nunique'))
                )
                total = comp_raw.groupby('Date', as_index=False)['SumWeight'].sum().rename(columns={'SumWeight':'_SumW'})
                comp = comp_raw.merge(total, on='Date', how='left')
                comp['Composite_Weight'] = comp['SumWeight'] / comp['_SumW']
                live_portfolio = (
                    comp.drop(columns=['_SumW'])
                        .assign(**{'Weight_%': lambda x: x['Composite_Weight']*100.0})
                        .sort_values(['ISIN'])
                )

            if pd.notnull(prev_date) and prev_date != TODAY:
                print(f"[{UNIVERSE}] Note: members_all ends at {prev_date.date()}, NOW signals/NAV use {TODAY.date()}.")

    # -----------------------------
    # 11) Export (lean) + LIVE sheets
    # -----------------------------
    with pd.ExcelWriter(export_xls, engine='openpyxl', datetime_format='yyyy-mm-dd') as writer:
        # (1) backtest time series and helpers
        output_df.to_excel(writer, sheet_name='Filtered_NAV_TS', index=False)
        # (2) monthly composite holdings (backtest)
        portfolio_view.to_excel(writer, sheet_name='Portfolio_Constituents', index=False)
        # (3) turnover (backtest)
        turnover.reset_index().to_excel(writer, sheet_name='Composite_Turnover', index=False)
        # (4) LIVE as-of latest month available (for next month trading)
        live_selected_factors.to_excel(writer, sheet_name='Live_Selected_Factors', index=False)
        live_factor_baskets.to_excel(writer, sheet_name='Live_Factor_Baskets', index=False)
        live_portfolio.to_excel(writer, sheet_name='Live_Portfolio_Constituents', index=False)

    print(f"[{UNIVERSE}] Excel export written to: {export_xls}")

# ======================================
# 6) Run the requested universes
# ======================================
for U in UNIVERSES:
    run_universe(U, keep_n=KEEP_N, screen_n=SCREEN_N)


[FTSE250] Excel export written to: H:/Tech Hardware Shared/$Mike/Quant/Python_Outputs/QC\FTSE250_Filtered_Net_NAV_and_constituents.xlsx
[SnP500] Excel export written to: H:/Tech Hardware Shared/$Mike/Quant/Python_Outputs/QC\SnP500_Filtered_Net_NAV_and_constituents.xlsx
[STOXX600] Excel export written to: H:/Tech Hardware Shared/$Mike/Quant/Python_Outputs/QC\STOXX600_Filtered_Net_NAV_and_constituents.xlsx
[S&PMidCap] Excel export written to: H:/Tech Hardware Shared/$Mike/Quant/Python_Outputs/QC\S&PMidCap_Filtered_Net_NAV_and_constituents.xlsx


In [3]:
# =========================
# AUDIT: point-in-time check
# =========================
import pandas as pd
import numpy as np
import os

# --- user knobs ---
AUDIT_UNIVERSE = "STOXX600"         # e.g., "STOXX600", "FTSE250", "SnP500"
AUDIT_DATE_STR = "31 Aug 2025"      # any parseable date; will be aligned to month-end
AUDIT_COL      = "TR"               # e.g., "TR", "TR_LTM_SOM", "EY_LTM_SOM", etc.

# --- reuse your base paths defined above ---
# BASE, CONSTIT_DIR, CSV_DIR, SPOT_DIR must already exist in your session.

def _month_end(ts_like) -> pd.Timestamp:
    d = pd.to_datetime(ts_like, dayfirst=True, errors='coerce')
    if pd.isna(d):
        raise ValueError(f"Cannot parse date: {ts_like}")
    return (d + pd.offsets.MonthEnd(0)).normalize()

ASOF = _month_end(AUDIT_DATE_STR)

def _load_constituents(universe: str) -> pd.DataFrame:
    p = os.path.join(CONSTIT_DIR, f"{universe}_Constit.csv")
    if not os.path.exists(p):
        raise FileNotFoundError(f"Constituents file not found: {p}")
    dfc = pd.read_csv(p, header=0)
    # columns are MMM-YY like '31-Aug-25' per your pipeline
    dfc.columns = pd.to_datetime(dfc.columns, format='%d-%b-%y')
    dfc = pd.melt(dfc, id_vars=[], var_name='Date', value_name='ISIN')
    dfc['Date'] = dfc['Date'] + pd.offsets.MonthEnd(0)
    dfc['ISIN'] = dfc['ISIN'].fillna('placeholder')
    return dfc

def _load_spot(universe: str) -> pd.DataFrame:
    p = os.path.join(SPOT_DIR, f"{universe}_Spot.csv")
    if not os.path.exists(p):
        return pd.DataFrame(columns=['ISIN','Date'])
    s = pd.read_csv(p)
    s.columns = [c.strip() for c in s.columns]
    if 'ISIN' not in s.columns or 'Date' not in s.columns:
        raise ValueError(f"{universe}_Spot.csv must include 'ISIN' and 'Date'")
    # use your robust parser if present; otherwise safe parse:
    try:
        s['Date'] = parse_date_column(s['Date'])
    except NameError:
        s['Date'] = pd.to_datetime(s['Date'], errors='coerce', dayfirst=True)
    s = s.dropna(subset=['ISIN','Date']).assign(ISIN=lambda x: x['ISIN'].astype(str).str.strip())
    return s

def _load_hist_for_isin(isin: str, spot_df: pd.DataFrame) -> pd.DataFrame:
    p = os.path.join(CSV_DIR, f"{isin}.csv")
    if not os.path.exists(p):
        return pd.DataFrame()
    h = pd.read_csv(p, header=0, na_values=["-"])
    # Excel serials -> datetime (your convention)
    h['Date'] = pd.to_datetime(h['Date'], unit='D', origin='1899-12-30', errors='coerce')
    # add ISIN column if missing
    if 'ISIN' not in h.columns:
        h['ISIN'] = isin
    else:
        h['ISIN'] = h['ISIN'].fillna(isin).astype(str).str.strip()

    # append matching spot rows
    if not spot_df.empty:
        add = spot_df[spot_df['ISIN'] == isin].copy()
        if not add.empty:
            add['Date'] = pd.to_datetime(add['Date'], errors='coerce')
            add['ISIN'] = isin
            h = (pd.concat([h, add], ignore_index=True, sort=False)
                   .sort_values('Date')
                   .drop_duplicates(subset=['Date'], keep='last')
                   .reset_index(drop=True))
    # align to month-end
    h['Date'] = h['Date'] + pd.offsets.MonthEnd(0)

    # numeric coercions for the minimum needed fields
    for col in ['Mkt_Cap','RI','P','Rev_LTM','Rev_NTM','BPS_LTM','BPS_NTM','EPS_LTM','EPS_NTM',
                'EBITDA_LTM_DS','EBITDA_LTM','EBITDA_NTM','NetDebt_LTM_DS','NetDebt_LTM','NetDebt_NTM']:
        if col not in h.columns: h[col] = np.nan
    for col in h.columns:
        if col not in ['Date','ISIN']:
            h[col] = pd.to_numeric(h[col], errors='coerce')

    # compute TR per your logic (from RI)
    h['RI'] = np.where(h['RI'] == 0, np.nan, h['RI'])
    h['TR'] = h['RI'] / h['RI'].shift(1) - 1

    # SOM versions
    h['Mkt_Cap_SOM'] = h['Mkt_Cap'].shift(1)

    return h[['Date','ISIN','Mkt_Cap_SOM','TR']].copy()

def build_audit_frame(universe: str) -> pd.DataFrame:
    cons = _load_constituents(universe)
    spot = _load_spot(universe)
    all_frames = []
    for isin in cons['ISIN'].unique():
        if isin == 'placeholder':
            continue
        dat = _load_hist_for_isin(isin, spot)
        if not dat.empty:
            all_frames.append(dat)
    if not all_frames:
        raise RuntimeError(f"No ISIN histories found under {CSV_DIR} for {universe}")
    panel = (pd.concat(all_frames, ignore_index=True)
               .dropna(subset=['Date','ISIN'])
               .sort_values(['Date','ISIN']))
    return panel

def audit_point_in_time(universe: str, asof: pd.Timestamp, col: str = 'TR') -> dict:
    panel = build_audit_frame(universe)
    # filter to the month-end slice
    snap = panel[panel['Date'] == asof].copy()
    # ensure column exists
    if col not in snap.columns:
        # if user asked for a SOM/other col that isn't in the minimal load, warn
        available = list(snap.columns)
        raise KeyError(f"Requested column '{col}' not in snapshot. Available: {available}")
    # base filters for a fair comparison
    snap['val'] = pd.to_numeric(snap[col], errors='coerce')
    base = snap.dropna(subset=['Mkt_Cap_SOM','val']).copy()

    stats = {}
    stats['asof'] = asof.date()
    stats['universe'] = universe
    stats['metric_col'] = col
    stats['n_constituents_total'] = int(snap['ISIN'].nunique())
    stats['n_with_data'] = int(base['ISIN'].nunique())
    stats['n_missing_val'] = int(snap['ISIN'].nunique() - base['ISIN'].nunique())

    if base.empty:
        stats['note'] = "No valid rows with both Mkt_Cap_SOM and metric."
        return {'stats': stats, 'weights_ok': False, 'cap_weighted': np.nan,
                'equal_weighted': np.nan, 'top_contrib': pd.DataFrame(), 'bottom_contrib': pd.DataFrame()}

    # weights
    base['w_cap'] = base['Mkt_Cap_SOM'] / base['Mkt_Cap_SOM'].sum()
    wsum = float(base['w_cap'].sum())
    stats['weight_sum'] = wsum
    stats['weight_sum_error_bp'] = (wsum - 1.0) * 1e4

    # aggregates
    cap_weighted = float((base['val'] * base['w_cap']).sum())
    equal_weighted = float(base['val'].mean())
    stats['cap_weighted'] = cap_weighted
    stats['equal_weighted'] = equal_weighted

    # contributions
    base['contribution'] = base['val'] * base['w_cap']
    top = (base[['ISIN','w_cap','val','contribution']]
           .sort_values('contribution', ascending=False)
           .head(10)
           .reset_index(drop=True))
    bot = (base[['ISIN','w_cap','val','contribution']]
           .sort_values('contribution', ascending=True)
           .head(10)
           .reset_index(drop=True))

    # reasonableness bands for 'return-like' columns
    if 'TR' in col.upper():
        stats['cap_weighted_bounds_ok'] = (-0.95 <= cap_weighted <= 5.0)  # generous guardrails
        stats['equal_weighted_bounds_ok'] = (-0.95 <= equal_weighted <= 5.0)
    else:
        stats['cap_weighted_bounds_ok'] = None
        stats['equal_weighted_bounds_ok'] = None

    weights_ok = abs(wsum - 1.0) < 1e-8

    # print a tight summary
    print("— AUDIT SUMMARY —")
    print(f"Universe: {universe} | As-of: {asof.date()} | Column: {col}")
    print(f"Constituents (total/with-data/missing): {stats['n_constituents_total']}/{stats['n_with_data']}/{stats['n_missing_val']}")
    print(f"Sum of cap weights: {wsum:.10f}  (error: {stats['weight_sum_error_bp']:.2f} bp)")
    print(f"Cap-weighted {col}: {cap_weighted:.6f}")
    print(f"Equal-weight {col}: {equal_weighted:.6f}")
    if 'TR' in col.upper():
        print(f"Bounds OK? cap={stats['cap_weighted_bounds_ok']}  eq={stats['equal_weighted_bounds_ok']}")
    print("\nTop 10 contributors:")
    print(top.to_string(index=False))
    print("\nBottom 10 contributors:")
    print(bot.to_string(index=False))

    return {
        'stats': stats,
        'weights_ok': weights_ok,
        'cap_weighted': cap_weighted,
        'equal_weighted': equal_weighted,
        'top_contrib': top,
        'bottom_contrib': bot
    }

# ---- run the audit ----
_ = audit_point_in_time(AUDIT_UNIVERSE, ASOF, AUDIT_COL)


— AUDIT SUMMARY —
Universe: STOXX600 | As-of: 2025-08-31 | Column: TR
Constituents (total/with-data/missing): 601/600/1
Sum of cap weights: 1.0000000000  (error: -0.00 bp)
Cap-weighted TR: 0.730829
Equal-weight TR: 0.076439
Bounds OK? cap=True  eq=True

Top 10 contributors:
        ISIN    w_cap       val  contribution
GB00BP6MXD84 0.013024 60.493199      0.787849
FI4000297767 0.003137  1.253542      0.003932
NL0011585146 0.005010  0.311591      0.001561
DK0062498333 0.010106  0.128385      0.001297
NL0010273215 0.017079  0.061164      0.001045
CH0012005267 0.015142  0.066664      0.001009
GB0009895292 0.014361  0.056101      0.000806
GB0005405286 0.013147  0.049485      0.000651
FR0000120578 0.006860  0.094369      0.000647
ES0113900J37 0.007934  0.081345      0.000645

Bottom 10 contributors:
        ISIN    w_cap       val  contribution
DE0007164600 0.021836 -0.950143     -0.020748
DE0007236101 0.012701 -0.791306     -0.010050
DE0008404005 0.009505 -0.937515     -0.008911
DE00084300