# **JPM MLCOE TSRL 2026 Q1**
---

## 0. Global Configurations
---

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import yfinance as yf
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from typing import Tuple


# Render floats in pandas output with thousands separators and two decimals.
pd.set_option('display.float_format', lambda v: f"{v:,.2f}")


# Print the library versions in use so a run can be reproduced later.
print('yfinance:', yf.__version__)
print('pandas:', pd.__version__)
print('tensorflow:', tf.__version__)

## 1. Data Pipeline
---

### 1.1 Column Aliases and Identity Checks

In [None]:
# Canonical balance-sheet column name -> accepted source labels, in priority
# order (the first label found in a frame is the one that gets renamed).
BS_ALIASES = {
    "Total Assets": ["Total Assets"],
    "Total Liab": ["Total Liab", "Total Liabilities", "Total Liabilities Net Minority Interest"],
    "Total Stockholder Equity": ["Total Stockholder Equity", "Total Equity Gross Minority Interest", "Stockholders Equity"],
}


def _normalize_label(label) -> str:
    """Return *label* lower-cased with all whitespace removed."""
    return "".join(str(label).split()).lower()


def canonicalize_bs(df: pd.DataFrame) -> pd.DataFrame:
    """Rename balance-sheet columns to the canonical names in ``BS_ALIASES``.

    For each canonical name the aliases are tried in order and only the first
    one found is renamed. An alias matches a column when the labels are equal,
    or, failing that, when they are equal after removing whitespace and
    ignoring case (so ``"TotalAssets"`` matches ``"Total Assets"``). The
    tolerant fallback covers data sources that emit labels without spaces.

    Args:
        df: Frame whose columns are balance-sheet line items.

    Returns:
        A renamed copy of ``df``; the input frame is not modified. Columns
        that match no alias are left untouched.
    """
    # Index the columns by normalized label; the first column wins when
    # several columns collapse to the same normalized key.
    by_normalized = {}
    for col in df.columns:
        by_normalized.setdefault(_normalize_label(col), col)

    rename_map = {}
    for canon, candidates in BS_ALIASES.items():
        for candidate in candidates:
            # An exact match always takes precedence over the tolerant one.
            if candidate in df.columns:
                match = candidate
            else:
                match = by_normalized.get(_normalize_label(candidate))

            if match is not None:
                rename_map[match] = canon
                break

    return df.rename(columns=rename_map)


def identity_residual(df: pd.DataFrame) -> pd.Series:
    """Return the accounting-identity residual for each row of ``df``.

    The residual is ``Total Assets - (Total Liab + Total Stockholder Equity)``.

    Args:
        df: Balance-sheet frame. If any of the three required columns is
            absent, the frame is first passed through ``canonicalize_bs`` so
            aliased labels can be resolved.

    Returns:
        A Series aligned with ``df``'s index holding the residual per row.

    Raises:
        KeyError: If a required column is still missing after canonicalizing.
    """
    required = ["Total Assets", "Total Liab", "Total Stockholder Equity"]

    # Canonicalize when *any* required column is missing, not only the
    # liabilities/equity pair, so an aliased "Total Assets" is resolved too.
    if any(col not in df.columns for col in required):
        df = canonicalize_bs(df)

    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Missing columns: {missing}")

    return df["Total Assets"] - (df["Total Liab"] + df["Total Stockholder Equity"])


def summarize_identity(resid: pd.Series) -> pd.Series:
    """Condense a residual series into three summary statistics.

    Args:
        resid: Residual values to summarize.

    Returns:
        A Series indexed by ``"mean"``, ``"std"`` and ``"max_abs"`` (in that
        order) holding the mean, the standard deviation and the largest
        absolute value of ``resid``.
    """
    stats = {}
    stats["mean"] = resid.mean()
    stats["std"] = resid.std()
    stats["max_abs"] = resid.abs().max()
    return pd.Series(stats)

### 1.2 Load AAPL Data with Caching

In [None]:
# Cache directory for downloaded statements: "<parent of the working dir>/data".
# It is created on first use, including any missing parent directories.
DATA_DIR = Path("..").resolve().joinpath("data")
DATA_DIR.mkdir(exist_ok=True, parents=True)

def _fetch_statements(tkr, freq: str):
    """Fetch the balance sheet and income statement from ``tkr``.

    Args:
        tkr: Object exposing ``get_balance_sheet(freq=...)`` and
            ``get_financials(freq=...)`` (a ``yf.Ticker`` in this notebook).
        freq: Reporting frequency forwarded to both calls.

    Returns:
        ``(balance_sheet, income_statement)``, each transposed so the fetched
        frame's column labels become the row index, then sorted by that index.
        Two empty DataFrames are returned if either call yields ``None``.
    """
    fetched = (
        tkr.get_balance_sheet(freq=freq),
        tkr.get_financials(freq=freq),
    )

    # Treat a missing statement as a failed fetch for the whole pair.
    if any(frame is None for frame in fetched):
        return pd.DataFrame(), pd.DataFrame()

    balance, income = (frame.T.sort_index() for frame in fetched)
    return balance, income


def _load_cached(path: Path) -> pd.DataFrame:
    """Read a cached statement CSV back into a DataFrame.

    The first CSV column is used as the index and pandas is asked to parse
    that index as dates.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The DataFrame stored in the CSV.
    """
    read_options = {"index_col": 0, "parse_dates": True}
    return pd.read_csv(path, **read_options)


def load_statements(ticker: str = "AAPL"):
    """Load balance-sheet and income-statement data for ``ticker``.

    For each frequency (yearly, then quarterly) the pair of statements is read
    from CSV files under ``DATA_DIR`` when both files exist; otherwise the
    pair is fetched through ``yf.Ticker`` and, if both frames are non-empty,
    written back to ``DATA_DIR``.

    Only a frequency whose balance sheet *and* income statement are both
    non-empty is eligible. Quarterly data is chosen when it is eligible and
    either yearly data is not eligible or the quarterly balance sheet has
    strictly more rows; otherwise yearly data is chosen.

    Args:
        ticker: Symbol to load. Its lower-cased form is used in the cache
            file names.

    Returns:
        ``(bs, is_df, freq)``: the balance sheet passed through
        ``canonicalize_bs``, the income statement, and the chosen frequency
        (``"yearly"`` or ``"quarterly"``).

    Raises:
        RuntimeError: If neither frequency yields a complete statement pair.
    """
    prefix = ticker.lower()
    statements = {}
    tkr = None  # built only if something actually has to be fetched

    for freq in ("yearly", "quarterly"):
        bs_path = DATA_DIR / f"{prefix}_balance_sheet_{freq}.csv"
        is_path = DATA_DIR / f"{prefix}_income_statement_{freq}.csv"

        bs_df, is_df = pd.DataFrame(), pd.DataFrame()

        # The cache is used only when both halves of the pair are on disk.
        if bs_path.exists() and is_path.exists():
            bs_df = _load_cached(bs_path)
            is_df = _load_cached(is_path)

        if bs_df.empty or is_df.empty:
            if tkr is None:
                tkr = yf.Ticker(ticker)

            bs_df, is_df = _fetch_statements(tkr, freq)

            # Never cache a partial pair, so a later run will retry the fetch.
            if not bs_df.empty and not is_df.empty:
                bs_df.to_csv(bs_path)
                is_df.to_csv(is_path)

        statements[freq] = (bs_df, is_df)

    bs_y, is_y = statements["yearly"]
    bs_q, is_q = statements["quarterly"]
    yearly_ok = not (bs_y.empty or is_y.empty)
    quarterly_ok = not (bs_q.empty or is_q.empty)

    # Choose among complete pairs only: an incomplete yearly pair must not
    # shadow a usable quarterly pair just because its balance sheet is longer.
    if quarterly_ok and (not yearly_ok or bs_q.shape[0] > bs_y.shape[0]):
        freq = "quarterly"
    elif yearly_ok:
        freq = "yearly"
    else:
        raise RuntimeError(f"Failed to fetch statements for {ticker}")

    bs, is_df = statements[freq]
    bs = canonicalize_bs(bs)

    return bs, is_df, freq


# Load the AAPL statements (from cache or by fetching) together with the
# frequency that load_statements selected.
bs, is_df, stmt_freq = load_statements("AAPL")


# Report the chosen frequency, the frame shapes, and summary statistics of the
# accounting-identity residual (assets minus liabilities plus equity).
print("Statement frequency:", stmt_freq)
print("Balance sheet shape:", bs.shape)
print("Income statement shape:", is_df.shape)
print("Identity residual stats:", summarize_identity(identity_residual(bs)))
# Trailing expression: shown as the cell output when run in a notebook.
bs.head()

### 1.3 Dataset Assembly

In [None]:
# Balance-sheet columns used as prediction targets, plus the income-statement
# column that is joined in alongside them.
TARGET_BS = ['Total Assets', 'Total Liab', 'Total Stockholder Equity']
NET_INCOME_COL = 'Net Income'

targets = bs[TARGET_BS].copy()
# NOTE(review): `_pick` is defined outside this section; presumably it selects
# the first available of the listed columns from is_df as a Series -- confirm.
net_income = _pick(is_df, ['Net Income']).rename(NET_INCOME_COL)
# Overwrites net_income's index with the balance-sheet index by position.
# NOTE(review): this needs both to have the same length and assumes the rows
# are in the same order -- confirm, otherwise values are silently misaligned.
net_income.index = targets.index

# NOTE(review): `features` is defined outside this section.
# Inner joins keep only index values present in features, targets and net_income.
dataset = features.join(targets, how = 'inner').join(net_income, how = 'inner')
# Drop every row that contains a missing value in any column.
dataset = dataset.dropna()
# Feature columns are all columns that are neither a target nor net income.
FEATURE_COLS = [c for c in dataset.columns if c not in TARGET_BS + [NET_INCOME_COL]]


print('Dataset shape:', dataset.shape)
# The "+ 1" counts the net-income column along with the balance-sheet targets.
print('Feature cols:', len(FEATURE_COLS), 'Target cols:', len(TARGET_BS) + 1)