In [12]:
## 特征转换
import polars as pl
from polars_ta.prefix.tdx import *
from polars_ta.prefix.wq import *

df = pl.read_csv("../data/cleaned/csi300_stock_feats.csv")

OPEN, HIGH, LOW, CLOSE, VOLUME, AMOUNT, VWAP = [pl.col(col) for col in ['open', 'high', 'low', 'close', 'volume', 'amount', 'vwap']]

def fast_linregress(x, y):
            x_mean = np.mean(x)
            y_mean = np.mean(y)
            slope = np.dot(x - x_mean, y - y_mean) / np.dot(x - x_mean, x - x_mean)
            intercept = y_mean - slope * x_mean
            y_pred = slope * x + intercept
            ss_total = np.sum((y - np.mean(y)) ** 2) + 1e-12
            ss_residual = np.sum((y - y_pred) ** 2)
            r2 = 1 - (ss_residual / ss_total)
            resd = np.sum(y - y_pred)
            return slope, intercept, r2, resd

def func_ts_date(df: pl.DataFrame) -> pl.DataFrame:
    print(df['instrument'][0])
    df = df.sort(by=['datetime'])
    df = df.with_columns([
        ((CLOSE - OPEN) / OPEN).alias('KMID'),
        ((HIGH - LOW) / OPEN).alias("KLEN"),
        ((CLOSE - OPEN) / (HIGH - LOW + 1e-12)).alias("KMID2"),
        ((HIGH - max_(OPEN, CLOSE)) / OPEN).alias("KUP"),
        ((HIGH - max_(OPEN, CLOSE)) / (HIGH - LOW + 1e-12)).alias("KUP2"),
        ((min_(OPEN, CLOSE) - LOW) / OPEN).alias("KLOW"),
        ((min_(OPEN, CLOSE) - LOW) / (HIGH - LOW + 1e-12)).alias("KLOW2"),
        ((2 * CLOSE - HIGH - LOW) / OPEN).alias("KSFT"),
        ((2 * CLOSE - HIGH - LOW) / (HIGH - LOW + 1e-12)).alias("KSFT2"),
        *[(ts_delay(OPEN, i) / CLOSE).alias(f'OPEN{i}') for i in [0]],
        *[(ts_delay(HIGH, i) / CLOSE).alias(f'HIGH{i}') for i in [0]],
        *[(ts_delay(LOW, i) / CLOSE).alias(f'LOW{i}') for i in [0]],
        *[(ts_delay(VWAP, i) / CLOSE).alias(f'VWAP{i}') for i in [0]],
    ])
    for i in [5,10,20,30,60]:
        df = df.with_columns([
            (ts_delay(CLOSE, i) / CLOSE).alias(f'ROC{i}'),
            (ts_mean(CLOSE, i) / CLOSE).alias(f'MA{i}'),
            (CLOSE.rolling_std(i) / CLOSE).alias(f'STD{i}'),
            (CLOSE.rolling_max(i) / CLOSE).alias(f'MAX{i}'),
            (CLOSE.rolling_min(i) / CLOSE).alias(f'MIN{i}'),
            (CLOSE.rolling_quantile(0.8, interpolation='linear', window_size=i) / CLOSE).alias(f'QTLU{i}'),
            (CLOSE.rolling_quantile(0.2, interpolation='linear', window_size=i) / CLOSE).alias(f'QTLD{i}'),
            (ts_rank(CLOSE, i)).alias(f'RANK{i}'),
            (ts_RSV(HIGH, LOW, CLOSE, i)).alias(f'RSV{i}'),
            (1 - ts_arg_max(HIGH, i) / i).alias(f'IMAX{i}'),
            (1 - ts_arg_min(LOW, i) / i).alias(f'IMIN{i}'),
            (ts_corr(CLOSE, log1p(VOLUME), i)).alias(f'CORR{i}'),
            (ts_corr(CLOSE / ts_delay(CLOSE, 1), log1p(VOLUME / ts_delay(VOLUME, 1)), i)).alias(f'CORD{i}'),
            (ts_mean(CLOSE > ts_delay(CLOSE, 1), i)).alias(f'CNTP{i}'),
            (ts_mean(CLOSE < ts_delay(CLOSE, 1), i)).alias(f'CNTN{i}'),
            (ts_sum(max_(CLOSE - ts_delay(CLOSE, 1), 0), i) / (ts_sum(abs_(CLOSE - ts_delay(CLOSE, 1)), i) + 1e-12)).alias(f'SUMP{i}'),
            (ts_sum(max_(ts_delay(CLOSE, 1) - CLOSE, 0), i) / (ts_sum(abs_(CLOSE - ts_delay(CLOSE, 1)), i) + 1e-12)).alias(f'SUMN{i}'),
            (ts_mean(VOLUME, i) / (VOLUME + 1e-12)).alias(f'VMA{i}'),
            (VOLUME.rolling_std(i) / (VOLUME + 1e-12)).alias(f'VSTD{i}'),
            ((abs_(ts_returns(CLOSE, 1)) * VOLUME).rolling_std(i) / (ts_mean(abs_(ts_returns(CLOSE, 1)) * VOLUME, i) + 1e-12)).alias(f'WVMA{i}'),
            (ts_sum(max_(VOLUME - ts_delay(VOLUME, 1), 0), i) / (ts_sum(abs_(VOLUME - ts_delay(VOLUME, 1)), i) + 1e-12)).alias(f'VSUMP{i}'),
            (ts_sum(max_(ts_delay(VOLUME, 1) - VOLUME, 0), i) / (ts_sum(abs_(VOLUME - ts_delay(VOLUME, 1)), i) + 1e-12)).alias(f'VSUMN{i}')
        ])
        df = df.with_columns([
            (pl.col(f"IMAX{i}") -pl.col(f"IMIN{i}")).alias(f"IMXD{i}"),
            (pl.col(f"CNTP{i}") - pl.col(f"CNTN{i}")).alias(f'CNTD{i}'),
            (pl.col(f"SUMP{i}") - pl.col(f"SUMN{i}")).alias(f'SUMD{i}'),
            (pl.col(f"VSUMP{i}") - pl.col(f"VSUMN{i}")).alias(f'VSUMD{i}'),
        ])

        reg = [fast_linregress(x = np.arange(i), y = df["close"][idx: idx + i].to_numpy()) for idx in range(len(df) - i + 1)]
        beta = [None] * (i - 1) + [item[0] for item in reg if item]
        rsqr = [None] * (i - 1) + [item[2] for item in reg if item]
        resi = [None] * (i - 1) + [item[3] for item in reg if item]
        row_n = len(df)
        df = df.with_columns([
            pl.Series(f'BETA{i}', beta[:row_n]),
            pl.Series(f'RSQR{i}', rsqr[:row_n]),
            pl.Series(f'RESI{i}', resi[:row_n]),
        ])
    return df

# df = df.group_by('instrument').map_groups(func_ts_date)
# print(df)

df = func_ts_date(df.filter(pl.col('instrument') == 'SH600005'))


SH600005


In [13]:
df[['datetime','MAX5','MIN5', 'RSQR5','close', 'RESI5', 'BETA5']].head(10)
# df

datetime,MAX5,MIN5,RSQR5,close,RESI5,BETA5
str,f64,f64,f64,f64,f64,f64
"""2016-01-04""",,,,1.5577798,,
"""2016-01-05""",,,,1.6060054,,
"""2016-01-06""",,,,1.7652411,,
"""2016-01-07""",,,,1.5914468,,
"""2016-01-08""",1.008316,0.889813,0.37053,1.7506825,4.4409e-16,0.037125
"""2016-01-11""",1.049216,0.945917,0.074777,1.6824386,8.8818e-16,0.013831
"""2016-01-12""",1.086226,0.979283,0.155217,1.6251137,-6.6613e-16,-0.018926
"""2016-01-13""",1.055403,0.959408,0.000566,1.6587807,-4.4409e-16,0.00091
"""2016-01-14""",1.025586,0.952026,0.135739,1.7070063,2.2204e-16,-0.011101
"""2016-01-15""",1.059887,1.0,0.060533,1.610555,2.2204e-16,-0.006187


In [6]:
df.select([col for col in df.columns if not col.islower()])

KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,MA5,STD5,MAX5,MIN5,QTLU5,QTLD5,RANK5,RSV5,IMAX5,IMIN5,CORR5,CORD5,CNTP5,CNTN5,SUMP5,SUMN5,VMA5,VSTD5,WVMA5,VSUMP5,VSUMN5,IMXD5,CNTD5,…,VSUMN30,IMXD30,CNTD30,SUMD30,VSUMD30,BETA30,RSQR30,RESI30,ROC60,MA60,STD60,MAX60,MIN60,QTLU60,QTLD60,RANK60,RSV60,IMAX60,IMIN60,CORR60,CORD60,CNTP60,CNTN60,SUMP60,SUMN60,VMA60,VSTD60,WVMA60,VSUMP60,VSUMN60,IMXD60,CNTD60,SUMD60,VSUMD60,BETA60,RSQR60,RESI60
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-0.098022,0.107481,-0.912,0.008598,0.08,0.00086,0.008,-0.105761,-0.984,1.108675,1.118208,0.999047,1.058824,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.021611,0.072692,0.297296,0.02554,0.351352,0.02554,0.351352,0.021611,0.297296,0.978846,1.025,0.953846,0.99057,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0.018234,0.03071,0.593749,0.001919,0.062499,0.010557,0.343752,0.026872,0.875002,0.982092,1.001885,0.971725,0.988487,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
-0.071291,0.081888,-0.870589,0.006744,0.082353,0.003854,0.047058,-0.074181,-0.905883,1.076764,1.084025,0.995851,1.042044,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
-0.00597,0.068657,-0.086956,0.00995,0.144928,0.052736,0.768116,0.036816,0.536231,1.006006,1.016016,0.946947,0.994787,,1.023635,0.040233,1.062059,0.965,1.052459,0.993,0.4,0.23347,0.2,1.0,0.643622,,,,,,0.704077,0.334692,,,,-0.8,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.021739,0.04837,0.449436,0.015761,0.325844,0.01087,0.22472,0.016848,0.348312,0.978723,1.015426,0.968085,0.998748,1.035128,1.021232,0.027967,1.05374,0.98671,1.044166,0.997342,0.4,0.372463,0.6,0.8,0.006196,-0.378944,0.4,0.6,0.355224,0.644776,1.576293,0.462574,1.045704,0.524215,0.475785,-0.2,-0.2,…,0.492089,-0.333333,-0.266667,-0.206089,0.015822,-0.002162,0.002664,-5.3291e-15,0.846005,1.011128,0.075916,1.146286,0.838844,1.060018,0.963724,0.433333,0.517697,0.4,0.016667,0.100587,0.168408,0.466667,0.533333,0.570247,0.429753,1.454835,0.511551,1.554582,0.488577,0.511423,0.383333,-0.066667,0.140494,-0.022846,0.011422,0.07439,-4.2633e-14
0.010907,0.05294,0.206029,0.026071,0.492462,0.015962,0.301509,0.000798,0.015076,0.989211,1.025789,0.973421,1.003694,1.01315,1.007845,0.027978,1.042642,0.976317,1.033168,0.986837,0.6,0.450937,0.4,0.6,0.071555,-0.301117,0.6,0.4,0.444979,0.555021,1.244147,0.37534,1.053301,0.488206,0.511794,-0.2,0.2,…,0.488797,-0.3,-0.2,-0.16129,0.022405,0.004212,0.01286,4.0856e-14,0.830009,1.003311,0.071706,1.134212,0.843956,1.048853,0.96025,0.466667,0.545588,0.383333,0.016667,0.059874,0.163412,0.483333,0.516667,0.578121,0.421879,1.16127,0.404143,1.549226,0.50571,0.49429,0.366667,-0.033333,0.156243,0.01142,0.008601,0.046292,-3.1974e-14
-0.11292,0.116279,-0.971111,0.0,0.0,0.003359,0.028889,-0.109561,-0.942223,1.127294,1.127294,0.996213,1.034069,1.154107,1.08477,0.052355,1.141,1.0,1.113726,1.064554,0.2,0.019695,0.2,1.0,-0.785023,-0.887945,0.4,0.6,0.126925,0.873075,0.502988,0.300787,1.466709,0.733153,0.266847,-0.8,-0.2,…,0.41683,-0.966667,-0.2,-0.26504,0.166339,0.003764,0.009916,-7.6383e-14,0.934464,1.111665,0.077332,1.255467,0.934181,1.160982,1.062908,0.1,0.223957,0.366667,0.066667,-0.083811,-0.060003,0.466667,0.533333,0.525294,0.474706,0.375286,0.148687,1.753673,0.556162,0.443838,0.3,-0.066667,0.050588,0.112325,0.004381,0.012653,-1.4744e-13
-0.002055,0.055768,-0.036842,0.021133,0.378947,0.03258,0.584211,0.009392,0.168421,1.002059,1.023235,0.967353,0.990434,1.152071,1.064881,0.055701,1.117647,1.0,1.10823,1.007762,0.2,0.173165,0.2,1.0,-0.916085,-0.88806,0.4,0.6,0.129111,0.870889,0.682663,0.380883,1.408148,0.677801,0.322199,-0.8,-0.2,…,0.461566,-0.166667,-0.2,-0.200497,0.076869,-0.000557,0.000176,-4.9738e-14,0.961189,1.123098,0.076875,1.267649,0.943246,1.172247,1.073221,0.083333,0.196281,0.35,0.05,-0.175281,-0.064784,0.45,0.55,0.514926,0.485074,0.439819,0.184333,1.742771,0.541783,0.458217,0.3,-0.1,0.029852,0.083566,0.000251,0.000043,-1.5987e-13


In [11]:
df['instrument'].n_unique()

509