# **JPM MLCOE TSRL 2026 Q1**
---
**Heartie CHEN**

## 0. Global Configurations
---

In [1]:
# Core dependencies for data ingestion, modeling, and evaluation.
from pathlib import Path
import json
import re
import random
import numpy as np
import pandas as pd
import yfinance as yf
import tensorflow as tf
from tensorflow import keras
import tensorflow_probability as tfp
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from typing import Tuple

pd.set_option('display.float_format', lambda v: f"{v:,.2f}")


print('yfinance:', yf.__version__)
print('pandas:', pd.__version__)
print('tensorflow:', tf.__version__)

yfinance: 1.1.0
pandas: 3.0.0
tensorflow: 2.20.0


In [4]:
# Fix RNG seeds for reproducibility across runs.
CONFIG_DIR = Path('..').resolve() / 'config'
with open(CONFIG_DIR / 'model_config.json', 'r', encoding = 'utf-8-sig') as f:
    MODEL_CONFIG = json.load(f)

SEED = int(MODEL_CONFIG['seed'])
random.seed(SEED)
np.random.seed(SEED)
tf.keras.utils.set_random_seed(SEED)
tf.config.experimental.enable_op_determinism()

# Configure cache location and statement frequency.
DATA_DIR = Path('..').resolve() / MODEL_CONFIG['data_dir']
DATA_DIR.mkdir(parents = True, exist_ok = True)
TARGET_FREQ = MODEL_CONFIG['target_freq']

## 1. Data Pipeline
---

### 1.1 Column Aliases and Identity Checks

In [5]:
# Provider column aliases to canonical balance sheet names.
BS_ALIASES = MODEL_CONFIG['bs_aliases']


def canonicalize_bs(df: pd.DataFrame) -> pd.DataFrame:
    """Return a balance sheet with canonical column names."""

    rename_map = {}

    for canon, candidates in BS_ALIASES.items():
        for c in candidates:

            if c in df.columns:
                rename_map[c] = canon
                
                break


    return df.rename(columns = rename_map)


def identity_residual(df: pd.DataFrame) -> pd.Series:
    """Compute balance sheet identity residuals."""

    if "Total Liabilities" not in df.columns or "Total Equity" not in df.columns:
        df = canonicalize_bs(df)
    
    required = ["Total Assets", "Total Liabilities", "Total Equity"]
    missing = [c for c in required if c not in df]

    if missing:

        raise KeyError(f"Missing columns: {missing}")


    return df["Total Assets"] - (df["Total Liabilities"] + df["Total Equity"])


def summarize_identity(resid: pd.Series) -> pd.Series:
    """Summarize residual statistics."""

    return pd.Series({

        "mean": resid.mean(),
        "std": resid.std(),
        "max_abs": resid.abs().max(),
    })

### 1.2 Load Data with Caching

In [6]:
# Ticker normalization and cached statement loading utilities.
SPECIAL_TICKERS = MODEL_CONFIG['special_tickers']
FISCAL_YEAR_END_MONTHS_RAW = MODEL_CONFIG.get('fiscal_year_end_months', {})


def normalize_ticker(raw: str) -> str:
    """Normalize tickers and apply project-specific mappings."""

    value = raw.strip()
    upper = value.upper()

    if upper in SPECIAL_TICKERS:

        return SPECIAL_TICKERS[upper]

    if re.fullmatch(r"\d+", value):

        return value.zfill(4) + ".HK"


    return upper


FISCAL_YEAR_END_MONTHS = {
    normalize_ticker(k): int(v) for k, v in FISCAL_YEAR_END_MONTHS_RAW.items()
}


def slugify(ticker: str) -> str:
    """Create a filesystem-friendly slug for a ticker."""

    return re.sub(r"[^a-z0-9]+", "_", ticker.lower()).strip("_")


def _fetch_statements(tkr, freq: str):
    """Fetch statements from Yahoo Finance for a ticker."""

    bs = tkr.get_balance_sheet(freq = freq)
    is_df = tkr.get_financials(freq = freq)

    if bs is None or is_df is None:

        return pd.DataFrame(), pd.DataFrame()

    bs = bs.T.sort_index()
    is_df = is_df.T.sort_index()


    return bs, is_df


def _load_cached(path: Path) -> pd.DataFrame:
    """Load a cached statement CSV if available."""

    return pd.read_csv(path, index_col = 0, parse_dates = True)


def load_statements(ticker: str, freq: str = TARGET_FREQ):
    """Load or fetch statements for a ticker and frequency."""

    ticker = normalize_ticker(ticker)
    slug = slugify(ticker)
    freq = freq.lower()

    data_dir = DATA_DIR / freq
    data_dir.mkdir(parents = True, exist_ok = True)

    bs_path = data_dir / f"{slug}_balance_sheet_{freq}.csv"
    is_path = data_dir / f"{slug}_income_statement_{freq}.csv"

    # Prefer cached CSVs to reduce API calls.

    if bs_path.exists() and is_path.exists():
        bs = _load_cached(bs_path)
        is_df = _load_cached(is_path)

    else:
        tkr = yf.Ticker(ticker)
        bs, is_df = _fetch_statements(tkr, freq)

        if not bs.empty and not is_df.empty:
            bs.to_csv(bs_path)
            is_df.to_csv(is_path)

    if bs.empty or is_df.empty:

        raise RuntimeError(f"Failed to fetch statements for {ticker} ({freq})")

    bs = canonicalize_bs(bs)


    return bs, is_df, freq


TICKERS = MODEL_CONFIG['tickers']
PRIMARY_TICKER = MODEL_CONFIG.get('primary_ticker', TICKERS[0])
bs, is_df, stmt_freq = load_statements(PRIMARY_TICKER, freq = TARGET_FREQ)


print("Loaded tickers:", [normalize_ticker(t) for t in TICKERS])
print("Statement frequency:", stmt_freq)
print("Balance sheet shape:", bs.shape)
print("Income statement shape:", is_df.shape)
print("Identity residual stats:", summarize_identity(identity_residual(bs)))


bs.head()

Loaded tickers: ['AAPL', 'GOOG', '0700.HK', '1810.HK', 'IBM', 'TSLA', '9633.HK', '9987.HK', '9988.HK', 'IBKR', 'KO', 'MCD', 'EL', 'BRK-B', 'NESN.SW']
Statement frequency: yearly
Balance sheet shape: (5, 69)
Income statement shape: (5, 39)
Identity residual stats: mean      0.00
std       0.00
max_abs   0.00
dtype: float64


Unnamed: 0,TreasurySharesNumber,OrdinarySharesNumber,ShareIssued,NetDebt,TotalDebt,TangibleBookValue,InvestedCapital,WorkingCapital,NetTangibleAssets,CapitalLeaseObligations,...,OtherCurrentAssets,Inventory,Receivables,OtherReceivables,Accounts Receivable,CashCashEquivalentsAndShortTermInvestments,OtherShortTermInvestments,CashAndCashEquivalents,CashEquivalents,CashFinancial
2021-09-30,,,,,,,,,,11803000000.0,...,,,,,,,,,,
2022-09-30,,15943425000.0,15943425000.0,96423000000.0,132480000000.0,50672000000.0,170741000000.0,-18577000000.0,50672000000.0,12411000000.0,...,21223000000.0,4946000000.0,60932000000.0,32748000000.0,28184000000.0,48304000000.0,24658000000.0,23646000000.0,5100000000.0,18546000000.0
2023-09-30,0.0,15550061000.0,15550061000.0,81123000000.0,111088000000.0,62146000000.0,173234000000.0,-1742000000.0,62146000000.0,12842000000.0,...,14695000000.0,6331000000.0,60985000000.0,31477000000.0,29508000000.0,61555000000.0,31590000000.0,29965000000.0,1606000000.0,28359000000.0
2024-09-30,,15116786000.0,15116786000.0,76686000000.0,106629000000.0,56950000000.0,163579000000.0,-23405000000.0,56950000000.0,,...,14287000000.0,7286000000.0,66243000000.0,32833000000.0,33410000000.0,65171000000.0,35228000000.0,29943000000.0,2744000000.0,27199000000.0
2025-09-30,,14773260000.0,14773260000.0,62723000000.0,98657000000.0,73733000000.0,172390000000.0,-17674000000.0,73733000000.0,,...,14585000000.0,5718000000.0,72957000000.0,33180000000.0,39777000000.0,54697000000.0,18763000000.0,35934000000.0,7667000000.0,28267000000.0


## 2. Features & Dataset

### 2.1 Feature Engineering

#### 2.1.1 Derived Drivers (DSO/DPO/DIH, Margins, Growth, Logs)

In [6]:
def _pick(df: pd.DataFrame, options):
    """Pick the first available series from the provided options."""

    for c in options:

        if c in df.columns:

            return df[c]

    raise KeyError(f'Missing columns: {options}')


def compute_features(bs: pd.DataFrame, is_df: pd.DataFrame, days: float = 365.0, growth_periods: int = 1) -> pd.DataFrame:
    """Compute financial ratios and growth features."""

    rev = _pick(is_df, ['Total Revenue', 'Operating Revenue', 'TotalRevenue', 'OperatingRevenue', 'Revenues'])
    cogs = _pick(is_df, ['Cost Of Revenue', 'Cost of Revenue', 'CostOfRevenue', 'ReconciledCostOfRevenue'])
    op_inc = _pick(is_df, ['Operating Income', 'OperatingIncome'])
    net_inc = _pick(is_df, ['Net Income', 'NetIncome', 'NetIncomeFromContinuingOperationNetMinorityInterest'])
    ar = _pick(bs, ['Accounts Receivable', 'AccountsReceivable'])
    ap = _pick(bs, ['Accounts Payable', 'AccountsPayable'])
    inv = _pick(bs, ['Inventory', 'Inventories'])

    feats = pd.DataFrame(index = bs.index)

    sales_per_day = rev / days
    cogs_per_day = cogs / days

    feats['dso'] = ar / sales_per_day
    feats['dpo'] = ap / cogs_per_day
    feats['dih'] = inv / cogs_per_day
    feats['gross_margin'] = (rev - cogs) / rev
    feats['op_margin'] = op_inc / rev
    feats['net_margin'] = net_inc / rev
    feats['rev_yoy'] = rev.pct_change(periods = growth_periods)
    feats['cogs_yoy'] = cogs.pct_change(periods = growth_periods)
    feats['netinc_yoy'] = net_inc.pct_change(periods = growth_periods)
    feats['log_rev'] = np.log1p(rev)
    feats['log_assets'] = np.log1p(_pick(bs, ['Total Assets', 'TotalAssets']))
    
    feats = feats.replace([np.inf, -np.inf], np.nan)
    feats = feats.sort_index().ffill().bfill()


    return feats


period_days = 365.0 if TARGET_FREQ == 'yearly' else 90.0
growth_periods = 1 if TARGET_FREQ == 'yearly' else 4

features = compute_features(bs, is_df, days = period_days, growth_periods = growth_periods)
features.tail()

Unnamed: 0,dso,dpo,dih,gross_margin,op_margin,net_margin,rev_yoy,cogs_yoy,netinc_yoy,log_rev,log_assets
2021-09-30,26.09,104.69,8.08,0.43,0.3,0.25,-0.03,-0.04,-0.03,26.7,26.59
2022-09-30,26.09,104.69,8.08,0.43,0.3,0.25,-0.03,-0.04,-0.03,26.7,26.59
2023-09-30,28.1,106.72,10.79,0.44,0.3,0.25,-0.03,-0.04,-0.03,26.67,26.59
2024-09-30,31.19,119.66,12.64,0.46,0.32,0.24,0.02,-0.02,-0.03,26.69,26.62
2025-09-30,34.89,115.4,9.45,0.47,0.32,0.27,0.06,0.05,0.19,26.75,26.61


### 2.2 Dataset Assembly

In [7]:
TARGET_BS = ['Total Assets', 'Total Liabilities', 'Total Equity']
TARGET_LE = ['Total Liabilities', 'Total Equity']
NET_INCOME_COL = 'Net Income'
STATE_COLS = [
    'Accounts Receivable', 'Accounts Payable', 'Inventory', 'Net PPE',
    'Total Liabilities', 'Total Equity', 'Retained Earnings'
]


def _has_any(df: pd.DataFrame, options: list[str]) -> bool:
    """Return True if any candidate column exists."""

    return any(c in df.columns for c in options)


def missing_required_columns(bs: pd.DataFrame, is_df: pd.DataFrame) -> list[str]:
    """Return missing required columns for BS and IS."""

    missing = []

    if not _has_any(bs, ['Total Assets', 'TotalAssets']):
        missing.append('Total Assets')
    
    if not _has_any(bs, ['Total Liabilities', 'Total Liabilities', 'TotalLiabilities', 'Total Liabilities Net Minority Interest', 'TotalLiabilitiesNetMinorityInterest']):
        missing.append('Total Liabilities')
    
    if not _has_any(bs, ['Total Equity', 'Stockholders Equity', 'StockholdersEquity', 'Total Equity Gross Minority Interest', 'TotalEquityGrossMinorityInterest']):
        missing.append('Total Equity')
    
    if not _has_any(bs, ['Accounts Receivable', 'AccountsReceivable']):
        missing.append('Accounts Receivable')
    
    if not _has_any(bs, ['Accounts Payable', 'AccountsPayable']):
        missing.append('Accounts Payable')
    
    if not _has_any(bs, ['Inventory', 'Inventories']):
        missing.append('Inventory')
    
    if not _has_any(bs, ['Net PPE', 'NetPPE']):
        missing.append('Net PPE')
    
    if not _has_any(bs, ['Retained Earnings', 'RetainedEarnings']):
        missing.append('Retained Earnings')

    if not _has_any(is_df, ['Total Revenue', 'Operating Revenue', 'TotalRevenue', 'OperatingRevenue', 'Revenues']):
        missing.append('Total Revenue')
   
    if not _has_any(is_df, ['Cost Of Revenue', 'Cost of Revenue', 'CostOfRevenue', 'ReconciledCostOfRevenue']):
        missing.append('Cost Of Revenue')
    
    if not _has_any(is_df, ['Operating Income', 'OperatingIncome']):
        missing.append('Operating Income')
    
    if not _has_any(is_df, ['Net Income', 'NetIncome', 'NetIncomeFromContinuingOperationNetMinorityInterest']):
        missing.append('Net Income')


    return missing


def build_aligned_for_ticker(ticker: str) -> pd.DataFrame:
    """Build an aligned dataset for one ticker."""

    bs, is_df, freq = load_statements(ticker, freq = TARGET_FREQ)
    missing = missing_required_columns(bs, is_df)
    
    if missing:

        print(f"{ticker}: skip, missing {missing}")

        return pd.DataFrame()

    period_days = 365.0 if freq == 'yearly' else 90.0
    growth_periods = 1 if freq == 'yearly' else 4

    features = compute_features(bs, is_df, days = period_days, growth_periods = growth_periods)
    targets = bs[TARGET_BS].copy()
    net_income = _pick(is_df, ['Net Income', 'NetIncome', 'NetIncomeFromContinuingOperationNetMinorityInterest']).rename(NET_INCOME_COL)
    net_income = net_income.reindex(targets.index)

    # Align features, targets, and net income on shared dates.
    dataset = features.join(targets, how = 'inner').join(net_income, how = 'inner')
    dataset = dataset.dropna()

    rev_series = _pick(is_df, ['Total Revenue', 'Operating Revenue', 'TotalRevenue', 'OperatingRevenue', 'Revenues'])
    # Shift state variables to create t-1 inputs.
    prev_state_df = bs.reindex(columns = STATE_COLS).shift(1)
    prev_state_df.columns = [f'prev_{c}' for c in STATE_COLS]
    prev_state_df['prev_Total Revenue'] = rev_series.reindex(targets.index).shift(1)

    # Combine current data with lagged state for modeling.
    aligned = dataset.join(prev_state_df, how = 'inner').dropna()
    aligned['ticker'] = ticker
    aligned['stmt_freq'] = freq


    return aligned


aligned_list = []

for ticker in [normalize_ticker(t) for t in TICKERS]:

    try:
        aligned_t = build_aligned_for_ticker(ticker)

        if aligned_t.empty:

            print(f"{ticker}: empty after alignment")

        else:

            print(f"{ticker}: aligned rows = {aligned_t.shape[0]}")
            aligned_list.append(aligned_t)

    except Exception as exc:

        print(f"{ticker}: failed ({exc})")

if not aligned_list:

    raise RuntimeError('No aligned data available')

aligned = pd.concat(aligned_list, axis = 0).sort_index()
prev_cols = [f'prev_{c}' for c in STATE_COLS] + ['prev_Total Revenue']
drop_cols = TARGET_BS + [NET_INCOME_COL] + prev_cols + ['ticker', 'stmt_freq']
FEATURE_COLS = [c for c in aligned.columns if c not in drop_cols]


print('Dataset shape:', aligned.shape)
print('Feature cols:', len(FEATURE_COLS), 'Target cols:', len(TARGET_LE) + 1)
print('Tickers:', aligned['ticker'].nunique())

AAPL: aligned rows = 3
GOOG: aligned rows = 2
0700.HK: aligned rows = 3
1810.HK: aligned rows = 3
IBM: aligned rows = 3
TSLA: aligned rows = 3
9633.HK: skip, missing ['Retained Earnings']
9633.HK: empty after alignment
9987.HK: aligned rows = 3
9988.HK: aligned rows = 2
IBKR: skip, missing ['Inventory', 'Net PPE']
IBKR: empty after alignment
KO: aligned rows = 3
MCD: aligned rows = 3
EL: aligned rows = 3
BRK-B: skip, missing ['Accounts Payable', 'Inventory', 'Cost Of Revenue', 'Operating Income']
BRK-B: empty after alignment
NESN.SW: aligned rows = 3
Dataset shape: (34, 25)
Feature cols: 11 Target cols: 3
Tickers: 12


### 2.3 Prev-state Matrix for Algebraic Layer

In [8]:
X_feat = aligned[FEATURE_COLS].values.astype('float32')
Y_bs = aligned[TARGET_LE].values.astype('float32')
Y_earn = aligned[[NET_INCOME_COL]].values.astype('float32')
X_prev = aligned[prev_cols].values.astype('float32')


print('Aligned shapes:', X_feat.shape, X_prev.shape, Y_bs.shape, Y_earn.shape)

Aligned shapes: (34, 11) (34, 8) (34, 2) (34, 1)


### 2.4 Scaling (z-score) for Stability

In [9]:
feat_scaler = StandardScaler()
bs_scaler = StandardScaler()
earn_scaler = StandardScaler()
prev_scaler = StandardScaler()

X_feat_scaled = feat_scaler.fit_transform(X_feat)
Y_bs_scaled = bs_scaler.fit_transform(Y_bs)
Y_earn_scaled = earn_scaler.fit_transform(Y_earn)
X_prev_scaled = prev_scaler.fit_transform(X_prev)

### 2.5 Train/Val Split on Scaled Data

In [10]:
n = X_feat_scaled.shape[0]
train_size = max(1, int(0.8 * n))

X_train_feat = X_feat_scaled[:train_size]
Y_train_bs = Y_bs_scaled[:train_size]
Y_train_earn = Y_earn_scaled[:train_size]
X_train_prev = X_prev_scaled[:train_size]

X_val_feat = X_feat_scaled[train_size:] if train_size < n else X_feat_scaled[train_size - 1:]
Y_val_bs = Y_bs_scaled[train_size:] if train_size < n else Y_bs_scaled[train_size - 1:]
Y_val_earn = Y_earn_scaled[train_size:] if train_size < n else Y_earn_scaled[train_size - 1:]
X_val_prev = X_prev_scaled[train_size:] if train_size < n else X_prev_scaled[train_size - 1:]


print('Scaled train:', X_train_feat.shape, Y_train_bs.shape, Y_train_earn.shape)
print('Scaled val:', X_val_feat.shape, Y_val_bs.shape, Y_val_earn.shape)

Scaled train: (27, 11) (27, 2) (27, 1)
Scaled val: (7, 11) (7, 2) (7, 1)


## 3. TensorFlow (Pareja/Pelaez Constrained)

### 3.1 TF Model with Algebraic Generator + Earnings Head

In [11]:
class AlgebraicBS(keras.layers.Layer):
    """Layer that enforces algebraic balance sheet constraints."""

    def __init__(self):
        """Initialize sublayers for balance sheet generation."""
        
        super().__init__()
        
        self.hidden = keras.layers.Dense(64, activation = 'relu')
        self.rev_head = keras.layers.Dense(1, activation = 'relu')
        self.cogs_head = keras.layers.Dense(1, activation = 'relu')
        self.drivers = keras.layers.Dense(5)
        self.margin_head = keras.layers.Dense(1)
        self.payout_head = keras.layers.Dense(1)
        self.earn_head = keras.layers.Dense(1, name = 'net_income_head')


    def call(self, inputs: Tuple[tf.Tensor, tf.Tensor]):
        """Compute constrained balance sheet and earnings outputs."""

        if not isinstance(inputs, (tuple, list)):

            raise TypeError("inputs must be (features, prev_state)")

        features, prev_state = inputs
        ar_prev, ap_prev, inv_prev, ppe_prev, liab_prev, equity_prev, re_prev, rev_prev = tf.split(prev_state, num_or_size_splits = 8, axis = -1)
        hidden = self.hidden(features)
        rev_predicate = self.rev_head(hidden)
        cogs_predicate = self.cogs_head(hidden)
        drivers_raw = self.drivers(hidden)

        dso = tf.nn.softplus(drivers_raw[:, 0:1])
        dpo = tf.nn.softplus(drivers_raw[:, 1:2])
        dih = tf.nn.softplus(drivers_raw[:, 2:3])
        dep_rate = tf.nn.sigmoid(drivers_raw[:, 3:4]) * 0.2
        capex_rate = tf.nn.sigmoid(drivers_raw[:, 4:5]) * 0.2
        net_margin = tf.tanh(self.margin_head(hidden)) * 0.5
        div_payout = tf.nn.sigmoid(self.payout_head(hidden))

        sales_per_day = rev_predicate / 365.0
        cogs_per_day = cogs_predicate / 365.0

        ar_next = dso * sales_per_day
        ap_next = dpo * cogs_per_day
        inv_next = dih * cogs_per_day
        dep = dep_rate * ppe_prev
        capex = capex_rate * rev_predicate
        ppe_next = ppe_prev + capex - dep
        net_income = net_margin * rev_predicate
        earn_predicate = self.earn_head(hidden)
        div = div_payout * net_income
        re_next = re_prev + net_income - div

        other_equity = tf.nn.relu(equity_prev - re_prev)

        equity_next = re_next + other_equity

        other_liab_prev = tf.nn.relu(liab_prev - ap_prev)
        growth = tf.where(rev_prev > 0, rev_predicate / rev_prev - 1.0, tf.zeros_like(rev_predicate))

        other_liab_next = other_liab_prev * (1.0 + growth)
        liab_next = ap_next + other_liab_next
        assets_wo_cash = ar_next + inv_next + ppe_next
        cash_next = equity_next + liab_next - assets_wo_cash
        assets_next = liab_next + equity_next
        bs_out = tf.concat([liab_next, equity_next], axis = -1)


        return bs_out, earn_predicate


def build_pareja_model(feat_dim: int, state_dim: int = 8):
    """Build a Pareja/Pelaez-constrained model."""

    feat_in = keras.Input(shape = (feat_dim,), name = 'features')
    state_in = keras.Input(shape = (state_dim,), name = 'prev_state')
    bs_out, earn_out = AlgebraicBS()([feat_in, state_in])
    
    
    return keras.Model([feat_in, state_in], [bs_out, earn_out], name = 'bs_pareja_style')


pareja_model = build_pareja_model(feat_dim = X_feat_scaled.shape[1], state_dim = X_prev_scaled.shape[1])
pareja_model.summary()

### 3.2 Tests / Sanity Checks

#### 3.2.1 Unit test

In [12]:
print('Running Unit test...')

try: 
    idx = pd.to_datetime(['2023-12-31', '2024-12-31'])

    sample_bs = pd.DataFrame(
        {
            'TotalAssets': [100.0, 120.0],
            'TotalLiabilities': [60.0, 70.0],
            'StockholdersEquity': [40.0, 50.0],
            'AccountsReceivable': [10.0, 12.0],
            'AccountsPayable': [8.0, 9.0],
            'Inventory': [5.0, 6.0],
            'NetPPE': [20.0, 22.0],
            'RetainedEarnings': [15.0, 18.0],
        },

        index = idx,
    )

    sample_is = pd.DataFrame(
        {
            'TotalRevenue': [80.0, 90.0],
            'CostOfRevenue': [30.0, 35.0],
            'OperatingIncome': [10.0, 12.0],
            'NetIncome': [8.0, 9.0],
        },

        index = idx,
    )

    canon = canonicalize_bs(sample_bs)

    assert 'Total Assets' in canon.columns, 'canonicalize_bs failed for Total Assets'
    assert 'Total Liabilities' in canon.columns, 'canonicalize_bs failed for Total Liabilities'
    assert 'Total Equity' in canon.columns, 'canonicalize_bs failed for Total Equity'

    resid = identity_residual(canon)
    assert np.allclose(resid.values, 0.0), 'identity_residual should be zero for balanced data'

    missing = missing_required_columns(sample_bs, sample_is)
    assert missing == [], f'missing_required_columns unexpected: {missing}'

    features_test = compute_features(sample_bs, sample_is, days = 365.0, growth_periods = 1)

    expected_cols = {
        'dso', 'dpo', 'dih', 'gross_margin', 'op_margin', 'net_margin',
        'rev_yoy', 'cogs_yoy', 'netinc_yoy', 'log_rev', 'log_assets'
    }

    assert expected_cols.issubset(set(features_test.columns)), 'compute_features missing expected columns'
    assert np.isfinite(features_test.to_numpy()).all(), 'compute_features produced non-finite values'

    print('Unit tests passed.')

except Exception as e:
    print('Unit tests failed:', e)

    raise

Running Unit test...
Unit tests passed.


#### 3.2.2 Integration test

In [13]:
print('Running Integration test...')

try:
    assert aligned.shape[0] > 0, 'aligned dataset is empty'
    assert X_feat.shape[0] == Y_bs.shape[0] == Y_earn.shape[0] == X_prev.shape[0], 'array length mismatch'

    test_batch = min(2, X_train_feat.shape[0])
    assert test_batch > 0, 'empty training batch'

    bs_hat, earn_hat = pareja_model([X_train_feat[:test_batch], X_train_prev[:test_batch]], training = False)
    assert bs_hat.shape[-1] == len(TARGET_LE), 'unexpected BS output width'
    assert earn_hat.shape[-1] == 1, 'unexpected earnings output width'

    print('Integration tests passed.')

except Exception as e:
    print('Integration tests failed:', e)

    raise


Running Integration test...
Integration tests passed.


### 3.3 Train/Evaluate (MAE on BS + Earnings)

In [14]:
pareja_model.compile(
    optimizer = keras.optimizers.Adam(1e-3),
    loss = [keras.losses.MeanAbsoluteError(), keras.losses.MeanAbsoluteError()],
    loss_weights = [1.0, 0.3],
)

hist = pareja_model.fit(
    [X_train_feat, X_train_prev], [Y_train_bs, Y_train_earn],
    
    validation_data = (
        [X_val_feat, X_val_prev], [Y_val_bs, Y_val_earn]
    ) if len(X_val_feat) > 0 else None,
    
    epochs = 20,
    batch_size = 2,
    verbose = 0,
)


print('Final Train Losses:', '\n', {k: v[-1] for k, v in hist.history.items() if 'loss' in k}, '\n')

bs_predicate_scaled, earn_predicate_scaled = pareja_model.predict([X_feat_scaled, X_prev_scaled], verbose = 0)
bs_predicate = bs_scaler.inverse_transform(bs_predicate_scaled)
earn_predicate = earn_scaler.inverse_transform(earn_predicate_scaled)

2026-01-27 16:56:18.282786: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_17}}


Final Train Losses: 
 {'algebraic_bs_loss': 0.057867713272571564, 'loss': 0.44238030910491943, 'val_algebraic_bs_loss': 0.28929322957992554, 'val_loss': 0.5173493027687073} 



In [15]:
bs_predicate_df = pd.DataFrame(bs_predicate, columns = TARGET_LE, index = aligned.index)
assets_predicate = bs_predicate_df['Total Liabilities'] + bs_predicate_df['Total Equity']
bs_predicate_full = bs_predicate_df.copy()
bs_predicate_full['Total Assets'] = assets_predicate
bs_predicate_full = bs_predicate_full[TARGET_BS]
resid_predicate = bs_predicate_full['Total Assets'] - (bs_predicate_full['Total Liabilities'] + bs_predicate_full['Total Equity'])

pred_full = bs_predicate_full.copy()
pred_full['ticker'] = aligned['ticker'].values
pred_full['stmt_freq'] = aligned['stmt_freq'].values
pred_full['pred_net_income'] = earn_predicate.flatten()
pred_full['pred_resid'] = pred_full['Total Assets'] - (pred_full['Total Liabilities'] + pred_full['Total Equity'])

actual_bs = aligned[TARGET_BS].copy()
actual_bs.columns = [f'actual_{c}' for c in actual_bs.columns]
pred_full = pd.concat([pred_full, actual_bs], axis = 1)
pred_full['actual_net_income'] = aligned[NET_INCOME_COL].values

pred_full['err_assets'] = pred_full['Total Assets'] - pred_full['actual_Total Assets']
pred_full['err_liab'] = pred_full['Total Liabilities'] - pred_full['actual_Total Liabilities']
pred_full['err_equity'] = pred_full['Total Equity'] - pred_full['actual_Total Equity']
pred_full['err_net_income'] = pred_full['pred_net_income'] - pred_full['actual_net_income']
pred_full['abs_err_assets'] = pred_full['err_assets'].abs()
pred_full['abs_err_liab'] = pred_full['err_liab'].abs()
pred_full['abs_err_equity'] = pred_full['err_equity'].abs()
pred_full['abs_err_net_income'] = pred_full['err_net_income'].abs()

summary = pred_full.groupby('ticker').agg(
    samples = ('pred_net_income', 'size'),
    mean_pred_assets = ('Total Assets', 'mean'),
    mean_pred_liab = ('Total Liabilities', 'mean'),
    mean_pred_equity = ('Total Equity', 'mean'),
    mean_pred_net_income = ('pred_net_income', 'mean'),
    mean_actual_assets = ('actual_Total Assets', 'mean'),
    mean_actual_liab = ('actual_Total Liabilities', 'mean'),
    mean_actual_equity = ('actual_Total Equity', 'mean'),
    mean_actual_net_income = ('actual_net_income', 'mean'),
    mae_assets = ('abs_err_assets', 'mean'),
    mae_liab = ('abs_err_liab', 'mean'),
    mae_equity = ('abs_err_equity', 'mean'),
    mae_net_income = ('abs_err_net_income', 'mean'),
)

def format_df_for_view(df: pd.DataFrame, decimals: int = 2) -> pd.DataFrame:
    """Create a display-only view with formatted numerics."""

    view = df.copy()
    num_cols = view.select_dtypes(include = [np.number]).columns
    fmt = f"{{:,.{decimals}f}}"
    view[num_cols] = view[num_cols].map(lambda v: "" if pd.isna(v) else fmt.format(v))

    return view


bs_predicate_full_view = format_df_for_view(bs_predicate_full)
pred_full_view = format_df_for_view(pred_full)
summary_view = format_df_for_view(summary)

earn_head_str = np.array2string(
    earn_predicate.flatten(),
    formatter = {"float_kind": lambda v: f"{v:,.2f}"}
)

In [16]:
print('Summary:', '\n')
summary_view

Summary: 



Unnamed: 0_level_0,samples,mean_pred_assets,mean_pred_liab,mean_pred_equity,mean_pred_net_income,mean_actual_assets,mean_actual_liab,mean_actual_equity,mean_actual_net_income,mae_assets,mae_liab,mae_equity,mae_net_income
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0700.HK,3.0,1288782348288.0,420556079104.0,868226170880.0,138858676224.0,1645457333333.33,741978333333.33,903479000000.0,165844000000.0,356675028736.0,321422232384.0,38854464618.67,27580089600.0
1810.HK,3.0,338128633856.0,186000605184.0,152128012288.0,16676213760.0,333636646333.33,167839997333.33,165796649000.0,14535776333.33,51945568093.33,36519344672.0,16132717394.67,3605827256.0
9987.HK,3.0,198108921856.0,188145008640.0,9963913216.0,-334583136.0,11659333333.33,4757333333.33,6902000000.0,726666666.67,186449593984.0,183387680768.0,3061913216.0,1061249792.0
9988.HK,2.0,1295854862336.0,189096853504.0,1106757943296.0,77726236672.0,1758936500000.0,641176500000.0,1117760000000.0,76396000000.0,463081637664.0,452079654688.0,20888689728.0,1330240768.0
AAPL,3.0,170300162048.0,188035579904.0,-17735417856.0,96365174784.0,358934666666.67,294658333333.33,64276333333.33,100913666666.67,188634493696.0,106622742506.67,82011751189.33,4548491882.67
EL,3.0,196093493248.0,193915568128.0,2177919744.0,-3204890624.0,21661333333.33,16462666666.67,5198666666.67,87666666.67,174432154453.33,177452901461.33,5636322858.67,6919305024.0
GOOG,2.0,452753424384.0,185760071680.0,266993336320.0,66219225088.0,383828000000.0,114066500000.0,269761500000.0,66883500000.0,68925408000.0,71693571680.0,11266600800.0,4490325728.0
IBM,3.0,448740294656.0,273936105472.0,174804156416.0,6842824704.0,133219666666.67,109210666666.67,24009000000.0,5055000000.0,315520595221.33,164725433344.0,150795159146.67,3771843733.33
KO,3.0,255666012160.0,225707245568.0,29958744064.0,13548889088.0,97005000000.0,70445666666.67,26559333333.33,10295666666.67,158661001237.33,155261589824.0,6912184106.67,3464088512.0
MCD,3.0,299642945536.0,240421355520.0,59221594112.0,5379416576.0,53921533333.33,58757333333.33,-4835800000.0,7623000000.0,245721412202.67,181664005802.67,64057398208.0,2243583253.33


In [17]:
print('Balance Sheet Predictions:', '\n')
bs_predicate_full_view.tail(100)

Balance Sheet Predictions: 



Unnamed: 0,Total Assets,Total Liabilities,Total Equity
2022-12-31,295644659712.0,240434610176.0,55210049536.0
2022-12-31,250191265792.0,229634424832.0,20556840960.0
2022-12-31,244393410560.0,185758236672.0,58635182080.0
2022-12-31,450404352000.0,185761906688.0,264642428928.0
2022-12-31,1085425188864.0,297162702848.0,788262486016.0
2022-12-31,198697189376.0,188189294592.0,10507894784.0
2022-12-31,223793692672.0,185758236672.0,38035456000.0
2022-12-31,328621555712.0,185758236672.0,142863302656.0
2022-12-31,478690476032.0,277218295808.0,201472180224.0
2023-03-31,1278732140544.0,187701805056.0,1091030286336.0


In [18]:
print('Full Predictions:', '\n')
pred_full_view.tail(100)

Full Predictions: 



Unnamed: 0,Total Assets,Total Liabilities,Total Equity,ticker,stmt_freq,pred_net_income,pred_resid,actual_Total Assets,actual_Total Liabilities,actual_Total Equity,actual_net_income,err_assets,err_liab,err_equity,err_net_income,abs_err_assets,abs_err_liab,abs_err_equity,abs_err_net_income
2022-12-31,295644659712.0,240434610176.0,55210049536.0,MCD,yearly,3273543680.0,0.0,50435600000.0,56439000000.0,-6003400000.0,6177000000.0,245209059712.0,183995610176.0,61213449536.0,-2903456320.0,245209059712.0,183995610176.0,61213449536.0,2903456320.0
2022-12-31,250191265792.0,229634424832.0,20556840960.0,KO,yearly,9225701376.0,0.0,92763000000.0,66937000000.0,25826000000.0,9542000000.0,157428265792.0,162697424832.0,-5269159040.0,-316298624.0,157428265792.0,162697424832.0,5269159040.0,316298624.0
2022-12-31,244393410560.0,185758236672.0,58635182080.0,NESN.SW,yearly,10402189312.0,0.0,135182000000.0,92390000000.0,42792000000.0,9270000000.0,109211410560.0,93368236672.0,15843182080.0,1132189312.0,109211410560.0,93368236672.0,15843182080.0,1132189312.0
2022-12-31,450404352000.0,185761906688.0,264642428928.0,GOOG,yearly,63798050816.0,0.0,365264000000.0,109120000000.0,256144000000.0,59972000000.0,85140352000.0,76641906688.0,8498428928.0,3826050816.0,85140352000.0,76641906688.0,8498428928.0,3826050816.0
2022-12-31,1085425188864.0,297162702848.0,788262486016.0,0700.HK,yearly,162303148032.0,0.0,1578131000000.0,795271000000.0,782860000000.0,188243000000.0,-492705811136.0,-498108297152.0,5402486016.0,-25939851968.0,492705811136.0,498108297152.0,5402486016.0,25939851968.0
2022-12-31,198697189376.0,188189294592.0,10507894784.0,9987.HK,yearly,-1458843648.0,0.0,11826000000.0,4666000000.0,7160000000.0,442000000.0,186871189376.0,183523294592.0,3347894784.0,-1900843648.0,186871189376.0,183523294592.0,3347894784.0,1900843648.0
2022-12-31,223793692672.0,185758236672.0,38035456000.0,TSLA,yearly,16450433024.0,0.0,82338000000.0,36440000000.0,45898000000.0,12583000000.0,141455692672.0,149318236672.0,-7862544000.0,3867433024.0,141455692672.0,149318236672.0,7862544000.0,3867433024.0
2022-12-31,328621555712.0,185758236672.0,142863302656.0,1810.HK,yearly,3626856448.0,0.0,273507211000.0,129584151000.0,143923060000.0,2474030000.0,55114344712.0,56174085672.0,-1059757344.0,1152826448.0,55114344712.0,56174085672.0,1059757344.0,1152826448.0
2022-12-31,478690476032.0,277218295808.0,201472180224.0,IBM,yearly,1867137024.0,0.0,127243000000.0,105222000000.0,22021000000.0,1640000000.0,351447476032.0,171996295808.0,179451180224.0,227137024.0,351447476032.0,171996295808.0,179451180224.0,227137024.0
2023-03-31,1278732140544.0,187701805056.0,1091030286336.0,9988.HK,yearly,74767540224.0,0.0,1753044000000.0,630123000000.0,1122921000000.0,72783000000.0,-474311859456.0,-442421194944.0,-31890713664.0,1984540224.0,474311859456.0,442421194944.0,31890713664.0,1984540224.0
