In [None]:
# WHAT: Mount Drive and set canonical paths
from google.colab import drive
import os

# Mount Google Drive so the project folder is reachable under /content/drive.
drive.mount('/content/drive')

# Canonical project locations: inputs are read from RAW, outputs are written to PRO.
BASE = "/content/drive/MyDrive/MLBA_Project"
RAW = BASE + "/data/raw"
PRO = BASE + "/data/processed"

# Only the processed folder is created here; RAW must already exist,
# otherwise the directory listing below raises.
os.makedirs(PRO, exist_ok=True)

print("RAW:", RAW)
print("PRO:", PRO)
raw_listing = os.listdir(RAW)
print("RAW files:", raw_listing)

In [None]:
# WHAT: Read *_LargeCap.csv and standardize to (date, fund_id, nav), build union calendar
import pandas as pd, numpy as np, glob, os

def load_fund_csv(path):
    """Read one fund CSV and standardise it to (date, fund_id, nav).

    Column headers are stripped and lower-cased before lookup. The NAV column
    is the first match among ``nav``, ``net_asset_value``, ``nav_value``,
    ``price`` and ``close``. ``fund_id`` is the file name without extension.

    Rows whose date cannot be parsed are dropped. When a date occurs more
    than once, the row that appears last in the file is kept.

    Parameters
    ----------
    path : str
        Path to the fund CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``["date", "fund_id", "nav"]``, sorted by date, one row per date.

    Raises
    ------
    ValueError
        If the file has no ``date`` column or no NAV-like column.
    """
    file_name = os.path.basename(path)

    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.lower()
    if "date" not in df.columns:
        raise ValueError(f"'date' column missing in {file_name}")

    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    # A stable sort keeps duplicate dates in file order, so keep="last" below
    # deterministically means "last row in the file". The default sort gives
    # no such guarantee for ties.
    df = df.dropna(subset=["date"]).sort_values("date", kind="stable")

    nav_candidates = ["nav", "net_asset_value", "nav_value", "price", "close"]
    nav_col = next((c for c in nav_candidates if c in df.columns), None)
    if nav_col is None:
        raise ValueError(f"No NAV-like column found in {file_name}")

    fund_id = os.path.splitext(file_name)[0]
    out = df[["date", nav_col]].rename(columns={nav_col: "nav"}).copy()
    out["fund_id"] = fund_id
    out = out.drop_duplicates(subset=["fund_id", "date"], keep="last")
    return out[["date", "fund_id", "nav"]]

# Discover every *_LargeCap.csv in RAW; sorting gives a deterministic load order.
fund_files = glob.glob(os.path.join(RAW, "*_LargeCap.csv"))
fund_files.sort()
if len(fund_files) == 0:
    raise FileNotFoundError("No fund CSVs found matching *_LargeCap.csv in RAW.")

# Stack all funds into one long (date, fund_id, nav) table ordered by fund, then date.
fund_frames = [load_fund_csv(p) for p in fund_files]
nav_long = pd.concat(fund_frames, ignore_index=True)
nav_long = nav_long.sort_values(["fund_id", "date"]).reset_index(drop=True)

first_day = nav_long["date"].min().date()
last_day = nav_long["date"].max().date()
print("NAV rows:", len(nav_long), "| funds:", nav_long["fund_id"].nunique(),
      "| date range:", first_day, "→", last_day)

# Union calendar: every date on which at least one fund has a row.
unique_days = nav_long["date"].unique()
cal = pd.DataFrame({"date": sorted(unique_days)})
cal_path = os.path.join(RAW, "trading_calendar_union.csv")
cal.to_csv(cal_path, index=False)
print("Saved calendar →", cal_path)

In [None]:
# WHAT: Coverage report per fund on the union calendar
# Work on a copy so the long NAV table built earlier stays untouched.
nav_u = nav_long.copy()

# Each (fund_id, date) pair must occur at most once; abort otherwise.
dup = nav_u.duplicated(subset=["fund_id", "date"]).sum()
if dup > 0:
    raise AssertionError(f"Found {dup} duplicate (fund_id,date) rows after union filter.")

n_union_days = len(cal)
n_funds = nav_u["fund_id"].nunique()

# Per fund: number of distinct dates present, and that count as a share of the union calendar.
days_per_fund = nav_u.groupby("fund_id")["date"].nunique().rename("available_days")
coverage = days_per_fund.reset_index().assign(
    union_days=n_union_days,
    coverage_pct=lambda frame: frame["available_days"] / n_union_days,
)

cal_start = cal['date'].min().date()
cal_end = cal['date'].max().date()
print("UNION CALENDAR:", cal_start, "→", cal_end,
      "| days:", len(cal), "| funds:", n_funds)
print("NAV union panel rows:", len(nav_u),
      "| rows if fully complete:", n_union_days * n_funds)
print("\nLowest coverage (top 10):")
lowest_ten = coverage.sort_values("coverage_pct").head(10)
print(lowest_ten)

In [None]:
# WHAT: Load benchmark + macro factors and align to union cal with ffill + limited head bfill
import pandas as pd, numpy as np, os

def load_factor_csv(filename, date_col="date", value_col=None):
    """Load one factor/benchmark CSV from RAW as a two-column frame.

    Column headers are stripped and lower-cased before lookup, so
    ``date_col`` and ``value_col`` must be given in lower case. Rows whose
    date cannot be parsed are dropped. When a date occurs more than once the
    row appearing last in the file is kept, so the result has one row per
    date and cannot multiply rows when merged on the date.

    Parameters
    ----------
    filename : str
        File name relative to the RAW directory.
    date_col : str, default "date"
        Name of the date column (after header normalisation).
    value_col : str or None, default None
        Name of the value column. When None, the first column other than
        ``date_col`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``[date_col, value_col]``, sorted by date.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when the file does not exist.
    KeyError
        If ``date_col`` or an explicitly requested ``value_col`` is missing.
    ValueError
        If ``value_col`` is None and the file has no column besides ``date_col``.
    """
    path = os.path.join(RAW, filename)
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.lower()

    if date_col not in df.columns:
        raise KeyError(f"'{date_col}' column missing in {filename}")

    if value_col is None:
        candidates = [c for c in df.columns if c != date_col]
        if not candidates:
            raise ValueError(f"No value column found in {filename}")
        value_col = candidates[0]
    elif value_col not in df.columns:
        raise KeyError(f"'{value_col}' column missing in {filename}")

    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    # Stable sort keeps duplicate dates in file order, so keep="last" means
    # "last row in the file" deterministically.
    df = (df.dropna(subset=[date_col])
            .sort_values(date_col, kind="stable")
            .drop_duplicates(subset=[date_col], keep="last"))
    return df[[date_col, value_col]].copy()

# Benchmark series from index_tri_nifty100.csv (value column "tri").
# Loaded directly rather than via a fallback, so a missing file raises here.
bench = load_factor_csv(filename="index_tri_nifty100.csv", value_col="tri")

def try_load(name, value_col):
    """Load an optional factor file, falling back to an all-NaN series.

    Delegates to ``load_factor_csv``. If the file does not exist, a notice
    is printed and a placeholder frame is returned that reuses the benchmark
    dates with ``value_col`` filled with NaN. Other errors propagate.
    """
    try:
        loaded = load_factor_csv(name, value_col=value_col)
    except FileNotFoundError:
        print(f"{name} not found → using NaNs")
        loaded = pd.DataFrame({"date": bench["date"], value_col: np.nan})
    return loaded

# Optional macro factors as (file name, value column) pairs, loaded in this order.
# Any missing file becomes an all-NaN series via try_load.
vix, usd, gsec, gold_inr, brent = (
    try_load(file_name, column)
    for file_name, column in [
        ("india_vix.csv",   "india_vix"),
        ("usd_inr.csv",     "usd_inr"),
        ("gsec_10y.csv",    "gsec_10y_yield"),
        ("gold_inr.csv",    "gold_inr"),
        ("brent_crude.csv", "brent_usd"),
    ]
)

def align_with_head_bfill(series_df, value_col, bfill_head_days=10):
    """Align a dated series to the union calendar ``cal``.

    Each calendar date receives the most recent observation at or before
    that date (forward fill). Observations on dates that are not in the
    calendar still take part in the forward fill, so the carried value is
    never older than necessary. Calendar dates that precede the first
    observation are back-filled from it, limited to the ``bfill_head_days``
    calendar rows closest to that first observation.

    Duplicate dates in ``series_df`` are collapsed (last row wins) so the
    result has exactly one row per calendar date.

    Parameters
    ----------
    series_df : pandas.DataFrame
        Must contain a ``date`` column and ``value_col``.
    value_col : str
        Name of the value column to align.
    bfill_head_days : int or None, default 10
        Maximum number of leading calendar rows to back-fill. ``None`` means
        no limit; ``0`` disables the back-fill.

    Returns
    -------
    pandas.DataFrame
        Columns ``["date", value_col]``, one row per row of ``cal``, sorted by date.

    Raises
    ------
    KeyError
        If ``series_df`` lacks ``date`` or ``value_col``.
    """
    cal_dates = pd.DatetimeIndex(cal["date"]).sort_values()

    # One observation per date; stable sort makes "last" mean last in input order.
    obs = (series_df[["date", value_col]]
           .dropna(subset=["date"])
           .sort_values("date", kind="stable")
           .drop_duplicates(subset="date", keep="last")
           .set_index("date")[value_col])

    # Forward-fill on the union of observation and calendar dates, then keep
    # only calendar dates. Merging onto the calendar first would discard
    # off-calendar observations and carry a staler value forward.
    all_dates = obs.index.union(cal_dates)
    aligned = obs.reindex(all_dates).ffill().reindex(cal_dates)

    # After the forward fill only the head can still be missing.
    if bfill_head_days is None or bfill_head_days > 0:
        aligned = aligned.bfill(limit=bfill_head_days)

    return pd.DataFrame({"date": cal_dates, value_col: aligned.to_numpy()})

# Align every series to the union calendar, each with a 10-row head back-fill.
tri_al, vix_al, usd_al, gsec_al, gold_al, brent_al = (
    align_with_head_bfill(frame, column, 10)
    for frame, column in [
        (bench,    "tri"),
        (vix,      "india_vix"),
        (usd,      "usd_inr"),
        (gsec,     "gsec_10y_yield"),
        (gold_inr, "gold_inr"),
        (brent,    "brent_usd"),
    ]
)

# Factor panel: start from the calendar and left-join each aligned series on date.
panel = cal
for aligned in (tri_al, vix_al, usd_al, gsec_al, gold_al, brent_al):
    panel = panel.merge(aligned, on="date", how="left")

In [None]:
# WHAT: Join NAV with aligned factors and save master (union)
# Attach the aligned factor columns to every (fund, date) NAV row.
master = nav_u.merge(panel, on="date", how="left")
master = master.sort_values(["fund_id", "date"]).reset_index(drop=True)

out_master = os.path.join(PRO, "clean_master_union.csv")
master.to_csv(out_master, index=False)

print("Saved master:", out_master,
      "| rows:", len(master), "| cols:", len(master.columns),
      "| funds:", master["fund_id"].nunique())

# Quick visual check: first five rows for each of the first two funds.
snapshot_cols = ["date", "fund_id", "nav", "tri", "india_vix", "usd_inr",
                 "gsec_10y_yield", "gold_inr", "brent_usd"]
funds = master["fund_id"].unique()[:2].tolist()
for fid in funds:
    is_fund = master["fund_id"] == fid
    snap = master.loc[is_fund, snapshot_cols].head(5)
    print(f"\n== {fid} ==")
    print(snap.to_string(index=False))