In [5]:
import pandas as pd
import akshare as ak

# ETF tickers to download.
etfs = ['561300', '159726', '515100', '513500', '161119', '518880', '164824', '159985', '513330', '513100', '513030', '513520']

# Last date requested from the data source (YYYYMMDD string, as passed to akshare below).
end_date = '20250731'

# Collect one frame per ETF and concatenate once after the loop. Growing a
# DataFrame with pd.concat inside the loop re-copies every earlier row on each
# iteration and concatenates against an empty frame on the first pass.
etf_frames = []
for i in etfs:
    etf_df = ak.fund_etf_hist_em(symbol=i, period="daily", start_date="20220701", end_date=end_date, adjust="hfq")
    # Normalised columns used by the rest of the notebook.
    etf_df['close'] = etf_df['收盘'].astype(float)
    etf_df['date'] = etf_df['日期']
    etf_df['symbol'] = i
    # Forward-fill gaps inside this ETF's own history only.
    etf_df = etf_df.ffill()
    etf_df = etf_df.infer_objects(copy=False)
    etf_frames.append(etf_df)

# pd.concat raises on an empty list, so fall back to an empty frame (the value
# `df` had before when no ETF was processed).
df = pd.concat(etf_frames) if etf_frames else pd.DataFrame()

In [26]:
import pandas as pd
import duckdb

# ETF tickers to query.
etfs = ['561300', '159726', '515100', '513500', '161119', '518880', '164824', '159985', '513330', '513100', '513030', '513520']

# Start and end of the queried date range (BETWEEN is inclusive on both ends).
start_date = '2022-07-01'
end_date = '2025-08-13'

# Bind the symbols as query parameters instead of interpolating the Python
# tuple repr into the SQL text: a one-element tuple renders as ('x',), which is
# not valid SQL, and bound parameters need no manual quoting.
symbols_tuple = tuple(etfs)
if not symbols_tuple:
    raise ValueError("etfs must contain at least one symbol")
symbol_placeholders = ", ".join("?" for _ in symbols_tuple)

query = f"""
SELECT
    日期 AS date,
    收盘 AS close,
    symbol
FROM
    etf_prices
WHERE
    symbol IN ({symbol_placeholders})
    AND date BETWEEN ? AND ?
ORDER BY
    symbol, date
"""
# Positional parameters, in the same order as the "?" placeholders above.
query_params = [*symbols_tuple, start_date, end_date]

# Open the DuckDB database read-only and always close it, even if the query fails.
con = duckdb.connect(database='../dataset/quant_data.duckdb', read_only=True)
try:
    df = con.execute(query, query_params).fetchdf()
finally:
    con.close()

# Normalise dtypes: float prices and datetime dates.
df['close'] = df['close'].astype(float)
df['date'] = pd.to_datetime(df['date'])

# Forward-fill within each symbol so values never leak from one ETF to the next.
# GroupBy.ffill keeps the original row index and leaves the `symbol` column
# untouched, unlike groupby(...).apply(...), whose handling of the grouping
# column is deprecated.
fill_columns = ['date', 'close']
df[fill_columns] = df.groupby('symbol')[fill_columns].ffill()

# Re-infer dtypes for any object columns.
df = df.infer_objects(copy=False)

# Show the first rows for a quick sanity check.
df.head()

        date  close  symbol
0 2022-07-01  1.007  159726
1 2022-07-04  1.008  159726
2 2022-07-05  1.006  159726
3 2022-07-06  0.992  159726
4 2022-07-07  0.997  159726






In [27]:
# Reshape the long (date, symbol, close) table into a wide one: one row per
# date, one column per symbol. Rows with a missing value in any column are
# dropped.
df_pivot = (
    df
    .pivot(index='date', columns='symbol', values='close')
    .dropna()
)
df_pivot

symbol,159726,159985,161119,164824,513030,513100,513330,513500,513520,515100,518880,561300
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-07-01,1.007,1.527,1.544,1.091,0.910,3.995,0.524,2.382,0.970,1.540,3.784,0.866
2022-07-04,1.008,1.510,1.544,1.093,0.912,4.010,0.533,2.412,0.972,1.541,3.801,0.872
2022-07-05,1.006,1.514,1.543,1.103,0.910,4.070,0.533,2.432,0.973,1.537,3.797,0.870
2022-07-06,0.992,1.499,1.543,1.095,0.874,4.095,0.520,2.420,0.969,1.518,3.725,0.858
2022-07-07,0.997,1.531,1.543,1.109,0.878,4.145,0.521,2.442,0.977,1.520,3.676,0.864
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-07-25,1.294,1.958,1.759,1.520,1.900,8.330,0.526,4.328,1.555,1.994,7.409,0.885
2025-07-28,1.289,1.943,1.759,1.515,1.917,8.420,0.526,4.370,1.534,1.986,7.388,0.887
2025-07-29,1.289,1.939,1.758,1.506,1.883,8.415,0.524,4.368,1.530,1.982,7.363,0.890
2025-07-30,1.302,1.958,1.759,1.507,1.888,8.400,0.519,4.360,1.531,1.988,7.377,0.890


In [16]:
from sklearn import set_config
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    train_test_split
)
from sklearn.pipeline import Pipeline
from scipy.stats import loguniform
import matplotlib.pyplot as plt

from skfolio import RatioMeasure, RiskMeasure
from skfolio.datasets import load_factors_dataset, load_sp500_dataset
from skfolio.distance import KendallDistance
from skfolio.model_selection import (
    CombinatorialPurgedCV,
    WalkForward,
    cross_val_predict,
)
from skfolio.moments import (
    DenoiseCovariance,
    DetoneCovariance,
    EWMu,
    GerberCovariance,
    ShrunkMu,
    ShrunkCovariance
)
from skfolio.optimization import (
    MeanRisk,
    NestedClustersOptimization,
    ObjectiveFunction,
    RiskBudgeting,
    HierarchicalRiskParity,
    DistributionallyRobustCVaR,
    StackingOptimization,
    MaximumDiversification,
    HierarchicalEqualRiskContribution,
    InverseVolatility,
    ConvexOptimization
)
from skfolio.pre_selection import SelectKExtremes
from skfolio.preprocessing import prices_to_returns
from skfolio.prior import BlackLitterman, EmpiricalPrior, FactorModel
from skfolio.uncertainty_set import BootstrapMuUncertaintySet
from skfolio.portfolio import MultiPeriodPortfolio
from skfolio.cluster import HierarchicalClustering, LinkageMethod

In [18]:
# Work on a copy of the wide price table, sorted by date, without missing rows.
prices = df_pivot.copy().sort_values(by='date').dropna()

# Convert prices to returns and make sure the index is a DatetimeIndex.
X = prices_to_returns(prices)
X.index = pd.to_datetime(X.index)

# Walk-forward splitter: train_size=7 and test_size=1 at freq="MS"
# (presumably 7 months of training followed by 1 month of testing -- confirm
# against the skfolio WalkForward documentation).
cv = WalkForward(test_size=1, train_size=7, freq="MS")

# First base estimator: hierarchical risk parity on CVaR, with an empirical
# prior whose covariance estimator uses shrinkage=0.9.
model = HierarchicalRiskParity(
    risk_measure=RiskMeasure.CVAR,
    prior_estimator=EmpiricalPrior(
        covariance_estimator=ShrunkCovariance(shrinkage=0.9),
    ),
)

# Final estimator of the stack: risk budgeting on EVaR.
# NOTE(review): the variable is called `model3`, but it is NOT the "model3"
# entry of `estimators` below; it is only used as `final_estimator`.
model3 = RiskBudgeting(
    risk_measure=RiskMeasure.EVAR,
    portfolio_params={"name": "Risk Parity - Covariance Shrinkage"},
)

# Named base estimators combined by the stacking model.
estimators = [
    ("model1", model),
    ("model2", MaximumDiversification()),
    ("model3", DistributionallyRobustCVaR(wasserstein_ball_radius=0.01)),
]

model_stacking = StackingOptimization(estimators=estimators, final_estimator=model3)

# Out-of-sample predictions over the walk-forward folds, using all CPU cores.
pred_stacking = cross_val_predict(
    model_stacking,
    X,
    cv=cv,
    n_jobs=-1,
    portfolio_params={"name": "Stacking"},
)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


In [19]:
# Plot the cumulative returns of the walk-forward stacking prediction and
# display the figure with the 'iframe' renderer.
returns = pred_stacking.plot_cumulative_returns()
returns.show(renderer='iframe')

In [20]:
# Plot the composition of the walk-forward stacking prediction and display
# the figure with the 'iframe' renderer.
composition = pred_stacking.plot_composition()
composition.show(renderer='iframe')

In [21]:
# Display the summary statistics of the walk-forward stacking prediction.
pred_stacking.summary()

Mean                                     0.040%
Annualized Mean                          10.12%
Variance                               0.00077%
Annualized Variance                       0.19%
Semi-Variance                          0.00044%
Annualized Semi-Variance                  0.11%
Standard Deviation                        0.28%
Annualized Standard Deviation             4.40%
Semi-Deviation                            0.21%
Annualized Semi-Deviation                 3.32%
Mean Absolute Deviation                   0.19%
CVaR at 95%                               0.60%
EVaR at 95%                               1.53%
Worst Realization                         2.59%
CDaR at 95%                               2.13%
MAX Drawdown                              3.11%
Average Drawdown                          0.48%
EDaR at 95%                               2.37%
First Lower Partial Moment               0.096%
Ulcer Index                              0.0074
Gini Mean Difference                    

In [19]:
# Display the weights attached to each observation of the walk-forward prediction.
pred_stacking.weights_per_observation

Unnamed: 0,159726,159985,161119,164824,513030,513100,513330,513500,513520,515100,518880,561300
2022-09-01,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
2022-09-02,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
2022-09-05,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
2022-09-06,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
2022-09-07,0.018662,0.028534,0.670745,0.053054,0.021103,0.021200,0.013135,0.017874,0.019541,0.017503,0.101688,0.016961
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-06-24,0.016205,0.058234,0.676137,0.030993,0.030080,0.009944,0.027933,0.016175,0.022993,0.042024,0.045668,0.023614
2025-06-25,0.016205,0.058234,0.676137,0.030993,0.030080,0.009944,0.027933,0.016175,0.022993,0.042024,0.045668,0.023614
2025-06-26,0.016205,0.058234,0.676137,0.030993,0.030080,0.009944,0.027933,0.016175,0.022993,0.042024,0.045668,0.023614
2025-06-27,0.016205,0.058234,0.676137,0.030993,0.030080,0.009944,0.027933,0.016175,0.022993,0.042024,0.045668,0.023614


In [20]:
# Returns computed from a copy of the wide price table, indexed by datetime.
df_c = df_pivot.copy()
df_c = prices_to_returns(df_c)
df_c.index = pd.to_datetime(df_c.index)

# Latest date available in the return series.
# NOTE(review): this rebinds `end_date` to a Timestamp; the data-loading cell
# uses the same name for a date string.
end_date = df_c.index.max()

# Training window: from 7 months before the latest date to 1 month before it.
# NOTE(review): that is roughly 6 months of data, while the walk-forward
# backtest uses train_size=7 -- confirm which window is intended.
train_start_date = (end_date - pd.DateOffset(months=7))
train_end_date = (end_date - pd.DateOffset(months=1))

train_df = df_c.loc[train_start_date:train_end_date]
# .loc label slices include BOTH endpoints, so `df_c.loc[train_end_date:]`
# would put the train_end_date row in the training set and in the prediction
# set. Select strictly later rows to keep the two windows disjoint.
predict_df = df_c.loc[df_c.index > train_end_date]

# Fit the stacking model on the training window and apply it to the latest window.
model_stacking.fit(train_df)

pred_weight = model_stacking.predict(predict_df)

In [21]:
# Display the first date of the training window as a sanity check.
train_start_date

Timestamp('2024-12-31 00:00:00')

In [22]:
# Keep the weights of the last 6 observations of the prediction window and
# display them rounded to 2 decimals (the rounding is for display only;
# `weight_df` itself keeps full precision).
weight_df = pred_weight.weights_per_observation.tail(6)
weight_df.round(2)

Unnamed: 0,159726,159985,161119,164824,513030,513100,513330,513500,513520,515100,518880,561300
2025-07-24,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-25,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-28,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-29,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-30,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
2025-07-31,0.02,0.06,0.67,0.03,0.03,0.01,0.03,0.02,0.02,0.04,0.04,0.03
