In [3]:
# Standard library
import os
import sys

# Make the project's src/ directory importable. The relative path is resolved
# against the current working directory (expected to be the notebooks directory),
# and this must run before the fund_pipeline imports below.
sys.path.insert(0, os.path.abspath("../src"))

# Third-party
import pandas as pd
import numpy as np

# Project modules, found through the sys.path entry added above
from fund_pipeline import data_handling as dh
from fund_pipeline import intermediary_builder as ib

## Clean & Manipulate Data

In [4]:
# Input and output roots, given relative to the notebook's working directory.
RAW = "../data"
OUT = "../outputs"

# Run each raw CSV through its cleaner in data_handling. Every cleaner also
# receives OUT; the recorded log and manifest show cleaned files under
# OUT/cleaned and dropped rows under OUT/dropped.
ref  = dh.clean_reference_data(f"{RAW}/ReferenceData.csv", OUT)
pos  = dh.clean_positions(f"{RAW}/PositionLevelPNLAndExposure.csv", OUT)
acct = dh.clean_accounts(f"{RAW}/AccountInformation.csv", OUT)
# idx and aum are not passed to integrate_internal and are not used again in
# the cells visible here.
idx  = dh.clean_index_returns(f"{RAW}/DailyIndexReturns.csv", OUT)
aum  = dh.clean_aum(f"{RAW}/AUM.csv", OUT)

# Presumably joins positions with reference and account attributes into one
# frame (the manifest lists it under OUT/integrated) — confirm in data_handling.
integrated = dh.integrate_internal(pos, ref, acct, OUT)
# The recorded output shows the manifest as a dict holding a generation
# timestamp and the paths of the artefacts written above.
# NOTE(review): the log reports that
# ../outputs/dropped/integrated_missing_reference.csv was written, but the
# recorded manifest has no entry for it — confirm whether build_manifest
# should list it.
manifest   = dh.build_manifest(OUT)
# A bare last expression renders the manifest as the cell output.
manifest

Dropped 3 singleton SEDOL(s) (3 rows) → ../outputs/dropped/positions_singleton_sedol.csv
Reason: these SEDOLs appear only once in positions and are likely incomplete/anomalous.
🧹 Dropped 9 rows missing all reference attributes → ../outputs/dropped/integrated_missing_reference.csv


{'generated_at': '2025-10-28T04:46:28.490543Z',
 'paths': {'clean_reference': '../outputs/cleaned/ReferenceData_clean.csv',
  'clean_positions': '../outputs/cleaned/PositionLevelPNLAndExposure_clean.csv',
  'clean_accounts': '../outputs/cleaned/AccountInformation_clean.csv',
  'clean_index_returns': '../outputs/cleaned/DailyIndexReturns_returns.csv',
  'clean_aum': '../outputs/cleaned/AUM_clean.csv',
  'integrated_internal': '../outputs/integrated/TooSharpe_InternalIntegrated.csv',
  'dropped_positions_singleton_sedol': '../outputs/dropped/positions_singleton_sedol.csv'}}

## Create Intermediary File

In [5]:
# Build the intermediary dataset from the integrated frame. With
# return_diagnostics=True the result is unpacked into the frame plus a
# diagnostics dict (row counts and output locations, per the recorded output).
df_intermediary, diag = ib.build_intermediary_from_integrated(
    integrated,
    flow_eps=1e-8,  # NOTE(review): tolerance semantics live in intermediary_builder — confirm
    widx_base=1.0,  # presumably the starting level of an index series — TODO confirm
    participation_rate=0.2,  # presumably the daily tradable share of ADV; recorded DAYS_TO_LIQUIDATE is consistent with ADV_COVER_GMV / 0.2 — confirm
    return_diagnostics=True,
)

# NOTE(review): the recorded diagnostics report paths under
# 'outputs/intermediary/...' (relative to the working directory), while the
# cleaning step writes under '../outputs' — confirm this location is intended.
# print() is needed for the dict because only the cell's last expression is
# rendered automatically; the head() preview below is that last expression.
print(diag)
df_intermediary.head()

{'collapsed_rows': 0, 'did_aggregate': False, 'rows': 711423, 'bucket': 'outputs/intermediary/20251028_004634', 'parquet': 'outputs/intermediary/20251028_004634/intermediary.parquet', 'latest': 'outputs/intermediary/latest'}


Unnamed: 0,Date,Account,SEDOL,PNL,GMV,NMV,USym,BasicProduct,BBYKIdentifier,Country,...,RC_NAV,PNL_CONTRIB_NAV,LONG_GMV,SHORT_GMV,LONG_NMV,SHORT_NMV,SIGN_NMV,ADV_20D,ADV_COVER_GMV,DAYS_TO_LIQUIDATE
0,2022-01-03,ASTATISTICAL,2000019,2212.5,102242.702637,102242.702637,AMZN-USAA,COMMON,AMZN US Equity,United States,...,,,102242.702637,0.0,102242.702637,0.0,1.0,11600000000.0,9e-06,4.4e-05
1,2022-01-03,ASTATISTICAL,2001119,-301.560055,13954.499626,-13954.499626,AIR-USAA,COMMON,AIR US Equity,United States,...,,-0.002949,0.0,13954.499626,0.0,13954.499626,-1.0,15300000.0,0.000912,0.00456
2,2022-01-03,ASTATISTICAL,2002059,1359.379898,27801.099472,-27801.099472,CLFD-USAA,COMMON,CLFD US Equity,United States,...,,-0.097415,0.0,27801.099472,0.0,27801.099472,-1.0,20200000.0,0.001376,0.006881
3,2022-01-03,ASTATISTICAL,2002305,-1220.608765,101360.155106,101360.155106,ABT-USAA,COMMON,ABT US Equity,United States,...,,0.043905,101360.155106,0.0,101360.155106,0.0,1.0,992000000.0,0.000102,0.000511
4,2022-01-03,ASTATISTICAL,2002479,499.839523,151619.214781,151619.214781,AES-USAA,COMMON,AES US Equity,United States,...,,0.004931,151619.214781,0.0,151619.214781,0.0,1.0,135000000.0,0.001123,0.005616
