In [1]:
## 数据集转换
import os

import numpy as np
import pandas as pd
# Location of the cached CSV export that the feature-engineering cell reads.
data_path = "../data/cleaned/csi300_stock_feats.csv"
if not os.path.exists(data_path):
    import qlib
    from qlib.data import D
    # Initialise qlib against the local raw data store.
    qlib.init(provider_uri = "../data/raw/qlib_data/cn_data")
    fields = ['$open', '$high', '$low', '$close', '$volume', '$amount', '$vwap']
    df = D.features(D.instruments(market='csi300'), fields, start_time='20160101', end_time='20201231', freq='day')
    # D.features names its columns after the field expressions ("$open", ...),
    # while the feature cell selects "open", "high", ... - strip the "$" so a
    # freshly generated export is readable by that cell.
    df = df.rename(columns=lambda name: name.lstrip('$'))
    # Make sure the target directory exists before writing the cache.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    df.to_csv(data_path)
    print(df)

In [2]:
## 数据读取+特征转换
import polars as pl
from polars_ta.prefix.tdx import *
from polars_ta.prefix.wq import *

# Column expressions shared by every feature formula below.
OPEN, HIGH, LOW, CLOSE, VOLUME, AMOUNT, VWAP = map(
    pl.col, ('open', 'high', 'low', 'close', 'volume', 'amount', 'vwap')
)

def fast_linregress(x, y):
    """Fit ``y = slope * x + intercept`` by ordinary least squares.

    Args:
        x: 1-D array-like of regressor values.
        y: 1-D array-like of observations with the same length as ``x``.

    Returns:
        Tuple ``(slope, intercept, r2, resd)``:
        ``slope`` and ``intercept`` are the fitted coefficients, ``r2`` is
        ``1 - ss_residual / ss_total`` (``ss_total`` is padded with 1e-12 so a
        constant ``y`` does not divide by zero), and ``resd`` is the residual
        of the last observation, ``y[-1] - y_pred[-1]``.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x_mean = np.mean(x)
    y_mean = np.mean(y)
    x_dev = x - x_mean
    y_dev = y - y_mean
    slope = np.dot(x_dev, y_dev) / np.dot(x_dev, x_dev)
    intercept = y_mean - slope * x_mean
    y_pred = slope * x + intercept
    ss_total = np.sum(y_dev ** 2) + 1e-12
    ss_residual = np.sum((y - y_pred) ** 2)
    r2 = 1 - (ss_residual / ss_total)
    # The residuals of a least-squares fit with an intercept always sum to
    # zero, so their sum carries no information. Report the residual of the
    # most recent point in the window instead.
    resd = y[-1] - y_pred[-1]
    return slope, intercept, r2, resd

def func_ts_date(df: pl.DataFrame) -> pl.DataFrame:
    """Add candlestick, rolling-window and rolling-regression feature columns.

    The frame is sorted by ``datetime`` and the features are computed over the
    rows in that order, so the caller is expected to pass the rows of a single
    instrument (the first ``instrument`` value is printed as a progress marker).

    Args:
        df: Frame with ``instrument``, ``datetime``, ``open``, ``high``,
            ``low``, ``close``, ``volume``, ``amount`` and ``vwap`` columns.

    Returns:
        The sorted frame with the upper-case feature columns appended.
    """
    # Progress marker; indexes row 0, so an empty frame raises here.
    print(df['instrument'][0])
    df = df.sort(by=['datetime'])
    # Single-bar (candlestick shape) features. The 1e-12 added to HIGH - LOW
    # avoids a division by zero on bars where high equals low.
    df = df.with_columns([
        ((CLOSE - OPEN) / OPEN).alias('KMID'),
        ((HIGH - LOW) / OPEN).alias("KLEN"),
        ((CLOSE - OPEN) / (HIGH - LOW + 1e-12)).alias("KMID2"),
        ((HIGH - max_(OPEN, CLOSE)) / OPEN).alias("KUP"),
        ((HIGH - max_(OPEN, CLOSE)) / (HIGH - LOW + 1e-12)).alias("KUP2"),
        ((min_(OPEN, CLOSE) - LOW) / OPEN).alias("KLOW"),
        ((min_(OPEN, CLOSE) - LOW) / (HIGH - LOW + 1e-12)).alias("KLOW2"),
        ((2 * CLOSE - HIGH - LOW) / OPEN).alias("KSFT"),
        ((2 * CLOSE - HIGH - LOW) / (HIGH - LOW + 1e-12)).alias("KSFT2"),
        # Prices relative to close; only a delay of 0 is generated.
        *[(ts_delay(OPEN, i) / CLOSE).alias(f'OPEN{i}') for i in [0]],
        *[(ts_delay(HIGH, i) / CLOSE).alias(f'HIGH{i}') for i in [0]],
        *[(ts_delay(LOW, i) / CLOSE).alias(f'LOW{i}') for i in [0]],
        *[(ts_delay(VWAP, i) / CLOSE).alias(f'VWAP{i}') for i in [0]],
    ])
    # Rolling features, one set of columns per window length i.
    for i in [5,10,20,30,60]:
        # The ts_* / max_ / abs_ / log1p helpers come from the polars_ta star
        # imports; their exact semantics are presumably the usual rolling
        # operators - confirm against polars_ta when changing a formula.
        df = df.with_columns([
            (ts_delay(CLOSE, i) / CLOSE).alias(f'ROC{i}'),
            (ts_mean(CLOSE, i) / CLOSE).alias(f'MA{i}'),
            (CLOSE.rolling_std(i) / CLOSE).alias(f'STD{i}'),
            (CLOSE.rolling_max(i) / CLOSE).alias(f'MAX{i}'),
            (CLOSE.rolling_min(i) / CLOSE).alias(f'MIN{i}'),
            (CLOSE.rolling_quantile(0.8, interpolation='linear', window_size=i) / CLOSE).alias(f'QTLU{i}'),
            (CLOSE.rolling_quantile(0.2, interpolation='linear', window_size=i) / CLOSE).alias(f'QTLD{i}'),
            (ts_rank(CLOSE, i)).alias(f'RANK{i}'),
            (ts_RSV(HIGH, LOW, CLOSE, i)).alias(f'RSV{i}'),
            (1 - ts_arg_max(HIGH, i) / i).alias(f'IMAX{i}'),
            (1 - ts_arg_min(LOW, i) / i).alias(f'IMIN{i}'),
            (ts_corr(CLOSE, log1p(VOLUME), i)).alias(f'CORR{i}'),
            (ts_corr(CLOSE / ts_delay(CLOSE, 1), log1p(VOLUME / ts_delay(VOLUME, 1)), i)).alias(f'CORD{i}'),
            (ts_mean(CLOSE > ts_delay(CLOSE, 1), i)).alias(f'CNTP{i}'),
            (ts_mean(CLOSE < ts_delay(CLOSE, 1), i)).alias(f'CNTN{i}'),
            # Share of the absolute close change that came from gains / losses.
            (ts_sum(max_(CLOSE - ts_delay(CLOSE, 1), 0), i) / (ts_sum(abs_(CLOSE - ts_delay(CLOSE, 1)), i) + 1e-12)).alias(f'SUMP{i}'),
            (ts_sum(max_(ts_delay(CLOSE, 1) - CLOSE, 0), i) / (ts_sum(abs_(CLOSE - ts_delay(CLOSE, 1)), i) + 1e-12)).alias(f'SUMN{i}'),
            (ts_mean(VOLUME, i) / (VOLUME + 1e-12)).alias(f'VMA{i}'),
            (VOLUME.rolling_std(i) / (VOLUME + 1e-12)).alias(f'VSTD{i}'),
            ((abs_(ts_returns(CLOSE, 1)) * VOLUME).rolling_std(i) / (ts_mean(abs_(ts_returns(CLOSE, 1)) * VOLUME, i) + 1e-12)).alias(f'WVMA{i}'),
            # Same split as SUMP / SUMN, applied to volume changes.
            (ts_sum(max_(VOLUME - ts_delay(VOLUME, 1), 0), i) / (ts_sum(abs_(VOLUME - ts_delay(VOLUME, 1)), i) + 1e-12)).alias(f'VSUMP{i}'),
            (ts_sum(max_(ts_delay(VOLUME, 1) - VOLUME, 0), i) / (ts_sum(abs_(VOLUME - ts_delay(VOLUME, 1)), i) + 1e-12)).alias(f'VSUMN{i}')
        ])
        # Differences of the paired columns created just above.
        df = df.with_columns([
            (pl.col(f"IMAX{i}") -pl.col(f"IMIN{i}")).alias(f"IMXD{i}"),
            (pl.col(f"CNTP{i}") - pl.col(f"CNTN{i}")).alias(f'CNTD{i}'),
            (pl.col(f"SUMP{i}") - pl.col(f"SUMN{i}")).alias(f'SUMD{i}'),
            (pl.col(f"VSUMP{i}") - pl.col(f"VSUMN{i}")).alias(f'VSUMD{i}'),
        ])

        # Rolling regression of close on 0..i-1: one fit per window of i
        # consecutive rows, starting at row idx. The list is empty when the
        # frame has fewer than i rows.
        reg = [fast_linregress(x = np.arange(i), y = df["close"][idx: idx + i].to_numpy()) for idx in range(len(df) - i + 1)]
        # Left-pad with i - 1 nulls so each fit lines up with the last row of
        # its window. The `if item` filter never drops anything because every
        # item is a non-empty tuple.
        beta = [None] * (i - 1) + [item[0] for item in reg if item]
        rsqr = [None] * (i - 1) + [item[2] for item in reg if item]
        resi = [None] * (i - 1) + [item[3] for item in reg if item]
        row_n = len(df)
        # Slicing to row_n trims the padding when the frame is shorter than
        # the window.
        df = df.with_columns([
            pl.Series(f'BETA{i}', beta[:row_n]),
            pl.Series(f'RSQR{i}', rsqr[:row_n]),
            pl.Series(f'RESI{i}', resi[:row_n]),
        ])
    return df

# Build features for a single instrument only.
# To process every instrument instead, group first:
#   pl.read_csv(...).group_by('instrument').map_groups(func_ts_date)
pldf = func_ts_date(
    pl.read_csv("../data/cleaned/csi300_stock_feats.csv")
    .filter(pl.col('instrument') == 'SH600009')
)

SH600009


In [3]:
## 数据装载
from qlib.data.dataset.loader import StaticDataLoader
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.dataset import DatasetH
from qlib.data.dataset.processor import DropnaProcessor, CSZScoreNorm, ProcessInf, Fillna

# Convert the polars feature table to a pandas frame indexed by
# (instrument, datetime); later cells look rows up through this index.
df = pldf.to_pandas()
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.set_index(['instrument', 'datetime'])
# Engineered features are the upper-case columns; raw inputs are lower-case.
features = df[[col for col in df.columns if col.isupper()]]
# NOTE(review): the label is the same-day close, not a forward return -
# confirm this is the intended prediction target.
labels = df[['close']]

## Data loader
dl = StaticDataLoader(config=dict(feature=features,label=labels))
## Data processors
# NOTE(review): CSZScoreNorm presumably normalises across instruments for each
# date; pldf currently holds a single instrument, so the normalised label is
# degenerate (the recorded training log reports an l2 of 0) - confirm.
LEARN_PROCESSORS = [DropnaProcessor(),CSZScoreNorm(fields_group='label')]
INFER_PROCESSORS = []
# Each list goes to the argument of the same name (they were swapped before).
dh = DataHandlerLP(data_loader=dl,infer_processors=INFER_PROCESSORS,learn_processors=LEARN_PROCESSORS)
## Dataset with train / valid / test date segments
ds = DatasetH(handler=dh, segments={"train": ('2016-01-01', '2017-12-31'), "valid": ('2018-01-01', '2018-12-31'),"test": ('2019-01-01', '2019-12-31')})

In [4]:
# Flip to True to replace the hand-built dataset with qlib's stock Alpha158
# handler over the csi300 universe.
use_initial = False
if use_initial:
    import qlib
    import pandas as pd
    from qlib.contrib.data.handler import Alpha158
    from qlib.data.dataset import DatasetH

    # Load the data through qlib itself.
    qlib.init(provider_uri="../data/raw/qlib_data/cn_data")
    dh = Alpha158(
        instruments="csi300",
        start_time='20160101',
        end_time='20201231',
        freq="day",
    )
    ds = DatasetH(
        handler=dh,
        segments=dict(
            train=('20160101', '20171231'),
            valid=('20180101', '20181231'),
            test=('20190101', '20191231'),
        ),
    )

In [5]:
from qlib.contrib.model.gbdt import LGBModel
# LightGBM hyper-parameters, forwarded to LGBModel as keyword arguments.
lgb_params = {
    "colsample_bytree": 0.8879,
    "learning_rate": 0.0421,
    "subsample": 0.8789,
    "lambda_l1": 205.6999,
    "lambda_l2": 580.9768,
    "max_depth": 8,
    "num_leaves": 210,
    "num_threads": 20,
}
model = LGBModel(loss="mse", **lgb_params)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
## 自定义qlib初始化
from qlib.utils import register_wrapper
from qlib.workflow import R
from qlib.data.data import D, Cal, DatasetD, BaseProvider
class MyR:
    """Stand-in recorder: exposes ``exp_manager`` and discards logged metrics."""

    def __init__(self):
        # No experiment is active in this stub.
        self.exp_manager = dict(active_experiment=None)

    def log_metrics(self, **metrics):
        """Accept any keyword metrics and do nothing with them."""
        return None
class MyDD():
    """Dataset provider that serves rows from the notebook-level ``df``."""

    def fetch(self, instruments, start_time, end_time, fields):
        """Return ``fields`` for ``instruments`` between the two times (label-based slice)."""
        date_window = slice(start_time, end_time)
        row_key = (instruments, date_window)
        return df.loc[row_key, fields]
# class MyD:
#     def features(self, instruments, fields, start_time, end_time, freq='day', disk_cache='False'):
#         return df.loc[(instruments, slice(start_time, end_time)), fields]
class MyCal:
    """Calendar provider derived from the notebook-level ``df`` index."""

    def load_calendar(self,freq='day',future=False):
        """Return the distinct dates found in ``df``, in ascending order.

        ``freq`` and ``future`` are accepted for signature compatibility and
        are not used.
        """
        dates = df.index.get_level_values('datetime')
        # The index level repeats every date once per instrument; a calendar
        # must list each date once and in order.
        return dates.unique().sort_values()

    def calendar(self, start_time=None, end_time=None,freq='day',future=False):
        """Return the calendar restricted to ``[start_time, end_time]``.

        Args:
            start_time: Inclusive lower bound; skipped when falsy.
            end_time: Inclusive upper bound; skipped when falsy.
            freq: Forwarded to ``load_calendar``.
            future: When true, only dates from the current time onwards are kept.
        """
        calendar = self.load_calendar(freq=freq, future=future)
        if future:
            calendar = calendar[calendar >= pd.Timestamp.today()]
        if start_time:
            start_time = pd.Timestamp(start_time)
            calendar = calendar[calendar >= start_time]
        if end_time:
            end_time = pd.Timestamp(end_time)
            calendar = calendar[calendar <= end_time]
        return calendar

# Point qlib's global wrappers at the in-notebook stand-ins, in this order.
for wrapper, provider in (
    (R, MyR),
    (DatasetD, MyDD),
    (D, BaseProvider),
    (Cal, MyCal),
):
    register_wrapper(wrapper, provider)

In [7]:
# Train the model on the dataset built above.
model.fit(ds)

Training until validation scores don't improve for 50 rounds
[20]	train's l2: 0	valid's l2: 0
[40]	train's l2: 0	valid's l2: 0
Early stopping, best iteration is:
[1]	train's l2: 0	valid's l2: 0


In [8]:
from qlib.contrib.evaluate import backtest_daily
from qlib.contrib.evaluate import risk_analysis
from qlib.contrib.strategy import TopkDropoutStrategy
from qlib.backtest.executor import BaseExecutor

# Top-k strategy driven by the trained model's predictions on `ds`.
strategy_obj = TopkDropoutStrategy(model=model,dataset=ds,topk=50,n_drop=5)
# NOTE(review): executor_obj is created here but never passed to
# backtest_daily below, so it has no effect on the backtest.
executor_obj = BaseExecutor(time_per_step='day')
# NOTE(review): the recorded run of this cell ended with an AssertionError;
# presumably the backtest needs more qlib state than the stand-in wrappers
# registered earlier provide - confirm before relying on the results.
report_normal, positions_normal = backtest_daily(
    start_time="2019-01-01", end_time="2019-12-31", strategy=strategy_obj
)

AssertionError: 

In [None]:
# Analyse the backtest: excess return over the benchmark, before and after cost.
excess_return = report_normal["return"] - report_normal["bench"]
analysis = {
    "excess_return_without_cost": risk_analysis(excess_return),
    "excess_return_with_cost": risk_analysis(excess_return - report_normal["cost"]),
}
analysis_df = pd.concat(analysis)  # type: pd.DataFrame
print(analysis_df)