In [1]:
import pandas as pd
import akshare as ak

# ETF tickers whose daily history is downloaded through akshare.
etfs = ['561300', '159726', '515100', '513500', '161119', '518880', '164824', '159985', '513330', '513100', '513030', '513520']

# Last date (YYYYMMDD) requested from the akshare endpoint.
end_date = '20250731'

# Collect one frame per ETF and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration and starts from an empty frame.
frames = []
for symbol in etfs:
    # adjust="hfq" asks akshare for adjusted prices (back-adjusted, per akshare's docs — confirm).
    etf_df = ak.fund_etf_hist_em(symbol=symbol, period="daily", start_date="20220701", end_date=end_date, adjust="hfq")
    # Expose the source columns '收盘' and '日期' under the English names 'close' and 'date'.
    etf_df['close'] = etf_df['收盘'].astype(float)
    etf_df['date'] = etf_df['日期']
    etf_df['symbol'] = symbol
    # Forward-fill gaps within this ETF only, then let pandas re-infer object dtypes.
    etf_df = etf_df.ffill()
    etf_df = etf_df.infer_objects(copy=False)
    frames.append(etf_df)

# Long-format frame with every ETF stacked; the per-ETF row index is kept as-is.
df = pd.concat(frames) if frames else pd.DataFrame()

  import pkg_resources


  0%|          | 0/12 [00:00<?, ?it/s]

In [4]:
import pandas as pd
import duckdb

# ETF symbols to load.
etfs = ['561300', '159726', '515100', '513500', '161119', '518880', '164824', '159985', '513330', '513100', '513030', '513520']

# Date range of the query; SQL BETWEEN includes both endpoints.
start_date = '2022-07-01'
end_date = '2025-08-13'

# The parquet files are read through a throw-away in-memory DuckDB connection
# (the connection itself is not read-only; nothing is written to it).
path = '../dataset/data/etf_prices/**/data.parquet'
con = duckdb.connect(database=':memory:', read_only=False)

# Bind the symbols and dates as query parameters instead of pasting Python
# reprs into the SQL text: interpolating a tuple produces "('x',)" for a
# single symbol and "()" for none, and leaves quoting to Python's repr.
symbols_tuple = tuple(etfs)
symbol_placeholders = ', '.join('?' for _ in symbols_tuple)

query = f"""
SELECT
    date,
    close,
    symbol
FROM read_parquet('{path}')
WHERE
    symbol IN ({symbol_placeholders})
    AND date BETWEEN ? AND ?
ORDER BY
    symbol, date
"""

# Run the query and load the result into a pandas DataFrame; always release
# the connection, even if the query fails.
try:
    df = con.execute(query, [*symbols_tuple, start_date, end_date]).fetchdf()
finally:
    con.close()

# Normalise dtypes: float prices and datetime dates.
df['close'] = df['close'].astype(float)
df['date'] = pd.to_datetime(df['date'])

# Forward-fill missing values within each symbol. GroupBy.ffill on the
# non-key columns gives the same result as groupby(...).apply(ffill) without
# the warning pandas raises when apply operates on the grouping column.
value_cols = ['date', 'close']
df[value_cols] = df.groupby('symbol')[value_cols].ffill()

# Re-infer object dtypes, mirroring the akshare-based loading cell.
df = df.infer_objects(copy=False)

# Preview the first rows.
print(df.head())

        date  close  symbol
0 2024-08-15  0.920  159726
1 2024-08-16  0.932  159726
2 2024-08-19  0.932  159726
3 2024-08-20  0.920  159726
4 2024-08-21  0.915  159726


  df = df.groupby('symbol', group_keys=False).apply(lambda x: x.ffill())


In [5]:
# Reshape the long price table into a date x symbol matrix of closing prices
# and keep only the dates on which every symbol has a value.
df_pivot = (
    df.pivot(index='date', columns='symbol', values='close')
    .dropna()
)
df_pivot

symbol,159726,159985,161119,164824,513030,513100,513330,513500,513520,515100,518880,561300
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2024-08-15,0.920,1.868,1.705,1.603,1.352,7.110,0.339,3.896,1.440,1.758,5.443,0.708
2024-08-16,0.932,1.859,1.705,1.634,1.373,7.345,0.348,3.950,1.495,1.754,5.453,0.710
2024-08-19,0.932,1.848,1.705,1.625,1.373,7.210,0.352,3.884,1.463,1.765,5.513,0.713
2024-08-20,0.920,1.864,1.705,1.660,1.396,7.340,0.350,3.932,1.497,1.753,5.513,0.710
2024-08-21,0.915,1.877,1.705,1.664,1.390,7.345,0.342,3.934,1.498,1.743,5.537,0.706
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-08-07,1.298,1.975,1.762,1.471,1.867,8.420,0.532,4.348,1.547,1.978,7.484,0.882
2025-08-08,1.307,1.984,1.762,1.474,1.881,8.455,0.524,4.348,1.574,1.981,7.508,0.882
2025-08-11,1.300,1.969,1.761,1.474,1.889,8.540,0.526,4.384,1.590,1.979,7.429,0.884
2025-08-12,1.314,1.984,1.759,1.481,1.875,8.515,0.516,4.374,1.593,1.984,7.399,0.888


In [6]:
from sklearn import set_config
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    train_test_split
)
from sklearn.pipeline import Pipeline
from scipy.stats import loguniform
import matplotlib.pyplot as plt

from skfolio import RatioMeasure, RiskMeasure
from skfolio.datasets import load_factors_dataset, load_sp500_dataset
from skfolio.distance import KendallDistance
from skfolio.model_selection import (
    CombinatorialPurgedCV,
    WalkForward,
    cross_val_predict,
)
from skfolio.moments import (
    DenoiseCovariance,
    DetoneCovariance,
    EWMu,
    GerberCovariance,
    ShrunkMu,
    ShrunkCovariance
)
from skfolio.optimization import (
    MeanRisk,
    NestedClustersOptimization,
    ObjectiveFunction,
    RiskBudgeting,
    HierarchicalRiskParity,
    DistributionallyRobustCVaR,
    StackingOptimization,
    MaximumDiversification,
    HierarchicalEqualRiskContribution,
    InverseVolatility,
    ConvexOptimization
)
from skfolio.pre_selection import SelectKExtremes
from skfolio.preprocessing import prices_to_returns
from skfolio.prior import BlackLitterman, EmpiricalPrior, FactorModel
from skfolio.uncertainty_set import BootstrapMuUncertaintySet
from skfolio.portfolio import MultiPeriodPortfolio
from skfolio.cluster import HierarchicalClustering, LinkageMethod

In [7]:
# --- Data: date-ordered price matrix without gaps, converted to returns ---
prices = df_pivot.copy().sort_values(by='date').dropna()

X = prices_to_returns(prices)
X.index = pd.to_datetime(X.index)

# --- Cross-validation: walk-forward splits with train_size=7 / test_size=1 ---
# NOTE(review): with freq="MS" these sizes are presumably counted in
# month-start periods rather than rows — confirm against the skfolio docs.
cv = WalkForward(test_size=1, train_size=7, freq="MS")

# --- First-level estimators ---
# Hierarchical risk parity on CVaR, fed by a heavily shrunk covariance estimate.
shrunk_prior = EmpiricalPrior(
    covariance_estimator=ShrunkCovariance(shrinkage=0.9)
)
model = HierarchicalRiskParity(
    risk_measure=RiskMeasure.CVAR,
    prior_estimator=shrunk_prior,
)

estimators = [
    ("model1", model),
    ("model2", MaximumDiversification()),
    ("model3", DistributionallyRobustCVaR(wasserstein_ball_radius=0.01)),
]

# --- Final estimator: EVaR risk budgeting over the first-level portfolios ---
# (an explicit solver='SCS' was tried here and is currently disabled)
model3 = RiskBudgeting(
    risk_measure=RiskMeasure.EVAR,
    portfolio_params={"name": "Risk Parity - Covariance Shrinkage"},
)

model_stacking = StackingOptimization(
    estimators=estimators,
    final_estimator=model3,
)

# --- Out-of-sample backtest across all walk-forward splits, using every core ---
pred_stacking = cross_val_predict(
    model_stacking,
    X,
    cv=cv,
    n_jobs=-1,
    portfolio_params={"name": "Stacking"},
)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


In [8]:
# Build the cumulative-returns plot for the cross-validated stacking
# prediction and render it with the 'iframe' renderer.
# NOTE(review): `.show(renderer=...)` looks like the Plotly figure API — confirm.
returns = pred_stacking.plot_cumulative_returns()
returns.show(renderer='iframe')

In [10]:
# Display the object returned by plot_cumulative_returns() as the cell output.
returns

In [20]:
# Build the composition plot for the cross-validated stacking prediction and
# render it with the 'iframe' renderer.
composition = pred_stacking.plot_composition()
composition.show(renderer='iframe')

In [21]:
# Show the summary statistics reported for the cross-validated stacking prediction.
pred_stacking.summary()

Mean                                     0.040%
Annualized Mean                          10.12%
Variance                               0.00077%
Annualized Variance                       0.19%
Semi-Variance                          0.00044%
Annualized Semi-Variance                  0.11%
Standard Deviation                        0.28%
Annualized Standard Deviation             4.40%
Semi-Deviation                            0.21%
Annualized Semi-Deviation                 3.32%
Mean Absolute Deviation                   0.19%
CVaR at 95%                               0.60%
EVaR at 95%                               1.53%
Worst Realization                         2.59%
CDaR at 95%                               2.13%
MAX Drawdown                              3.11%
Average Drawdown                          0.48%
EDaR at 95%                               2.37%
First Lower Partial Moment               0.096%
Ulcer Index                              0.0074
Gini Mean Difference                    

In [19]:
# Inspect the portfolio weights attached to each observation of the backtest.
pred_stacking.weights_per_observation

Unnamed: 0,159726,159985,161119,164824,513030,513100,513330,513500,513520,515100,518880,561300
2022-09-01,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
2022-09-02,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
2022-09-05,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
2022-09-06,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
2022-09-07,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-06-24,0.016205,0.058234,0.676137,0.030993,0.030080,0.009944,0.027933,0.016175,0.022993,0.042024,0.045668,0.023614
2025-06-25,0.016205,0.058234,0.676137,0.030993,0.030080,0.009944,0.027933,0.016175,0.022993,0.042024,0.045668,0.023614
2025-06-26,0.016205,0.058234,0.676137,0.030993,0.030080,0.009944,0.027933,0.016175,0.022993,0.042024,0.045668,0.023614
2025-06-27,0.016205,0.058234,0.676137,0.030993,0.030080,0.009944,0.027933,0.016175,0.022993,0.042024,0.045668,0.023614


In [20]:
# Refit the stacking model on a recent window of returns and predict the
# portfolio for the most recent month.
df_c = df_pivot.copy()
df_c = prices_to_returns(df_c)
df_c.index = pd.to_datetime(df_c.index)

# Anchor both windows on the last available return date.
end_date = df_c.index.max()

# Training covers [end - 7 months, end - 1 month]; prediction covers the rest.
# NOTE(review): this is a 6-month training span, while the walk-forward
# backtest uses train_size=7 — confirm which one is intended.
train_start_date = (end_date - pd.DateOffset(months=7))
train_end_date = (end_date - pd.DateOffset(months=1))

# .loc label slices include BOTH endpoints, so slicing the prediction window
# from train_end_date would put that day in the training set and in the
# prediction set at the same time. Start the prediction window strictly after it.
train_df = df_c.loc[train_start_date:train_end_date]
predict_df = df_c.loc[df_c.index > train_end_date]

model_stacking.fit(train_df)

pred_weight = model_stacking.predict(predict_df)

In [21]:
# Sanity check: show the first date of the training window.
train_start_date

Timestamp('2024-12-31 00:00:00')

In [22]:
# Weights for the last 6 observations of the prediction window, displayed
# rounded to two decimals (weight_df itself keeps full precision).
weight_df = pred_weight.weights_per_observation.tail(6)
weight_df.round(2)

Unnamed: 0,159726,159985,161119,164824,513030,513100,513330,513500,513520,515100,518880,561300
2025-07-24,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-25,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-28,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-29,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-30,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-31,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
