In [2]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis, skew, rankdata, mstats
from sklearn.linear_model import LinearRegression

# 基本运算符

In [3]:
def at_add(sigA, sigB):
    """Element-wise sum of two signals.

    Positions where either input is NaN are explicitly set to NaN in the
    returned object.
    """
    missing = np.isnan(sigA) | np.isnan(sigB)
    total = np.add(sigA, sigB)
    total[missing] = np.nan
    return total

def at_div(sigA, sigB):
    """Element-wise quotient ``sigA / sigB``.

    The result is NaN wherever either input is NaN or the divisor is zero;
    numpy divide/invalid warnings are suppressed while computing.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        quotient = np.divide(sigA, sigB)
        undefined = np.isnan(sigA) | np.isnan(sigB) | (sigB == 0)
        quotient[undefined] = np.nan
    return quotient

def at_log(sig):
    """Natural logarithm of a signal.

    Entries that are NaN or not strictly positive yield NaN; numpy warnings
    for those entries are suppressed.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        logged = np.log(sig)
        out_of_domain = np.isnan(sig) | (sig <= 0)
        logged[out_of_domain] = np.nan
    return logged

def at_mask(sig, mask):
    """Keep ``sig`` where ``mask`` is truthy and put NaN everywhere else.

    The result comes straight from ``np.where``.
    """
    return np.where(mask, sig, np.nan)


def at_mul(sigA, sigB):
    """Element-wise product of two signals.

    Positions where either input is NaN are NaN in the result. Infinite
    inputs follow ordinary floating-point rules, so the sign of an infinite
    product is preserved (``-inf * 2 == -inf``) and ``inf * 0`` is NaN.

    The previous implementation overwrote every position touching an
    infinity with ``+inf``, which discarded the sign and hid NaN inputs.
    """
    # inf * 0 is an "invalid" operation for numpy; the NaN it yields is the
    # desired outcome, so only the warning is silenced.
    with np.errstate(invalid='ignore'):
        result = np.multiply(sigA, sigB)
    result[np.isnan(sigA) | np.isnan(sigB)] = np.nan
    return result

def at_signsqrt(sig):
    """Sign-preserving square root: ``sign(sig) * sqrt(|sig|)``.

    NaN inputs give NaN. Infinite inputs keep their sign (``-inf -> -inf``);
    the previous implementation forced every infinite input to ``+inf``.
    """
    with np.errstate(invalid='ignore'):
        result = np.sign(sig) * np.sqrt(np.abs(sig))
        result[np.isnan(sig)] = np.nan
    return result

def at_sub(sigA, sigB):
    """Element-wise difference ``sigA - sigB``.

    Positions where either input is NaN are explicitly set to NaN.
    """
    missing = np.isnan(sigA) | np.isnan(sigB)
    difference = np.subtract(sigA, sigB)
    difference[missing] = np.nan
    return difference

def at_sign(sig):
    """Sign of each element, with NaN inputs kept as NaN.

    Entries whose sign compares equal to zero are rewritten as 0.
    """
    signs = np.sign(sig)
    signs[np.isnan(sig)] = np.nan
    is_zero = signs == 0
    signs[is_zero] = 0
    return signs

def at_signlog(sig):
    """Signed logarithm: ``sign(sig) * log(|sig|)``.

    NaN inputs give NaN and exact zeros give 0. numpy divide/invalid warnings
    are suppressed while computing.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        log_magnitude = np.log(np.abs(sig))
        signed = np.sign(sig) * log_magnitude
        signed[np.isnan(sig)] = np.nan
        signed[sig == 0] = 0
    return signed

def at_signpower(sig, power):
    """Sign-preserving power: ``sign(sig) * |sig| ** power``.

    NaN inputs give NaN. Infinite inputs follow floating-point rules, so the
    sign is kept and a negative ``power`` maps them to zero. The previous
    implementation forced every infinite input to ``+inf`` regardless of its
    sign or of ``power``.
    """
    with np.errstate(invalid='ignore'):
        result = np.sign(sig) * np.power(np.abs(sig), power)
        result[np.isnan(sig)] = np.nan
    return result

def at_sigmoid(sig):
    """Logistic sigmoid ``1 / (1 + exp(-sig))``, with NaN inputs kept as NaN."""
    denominator = 1 + np.exp(-sig)
    squashed = 1 / denominator
    squashed[np.isnan(sig)] = np.nan
    return squashed

def at_inf2zero(sig):
    """Replace +/-inf entries with 0 and leave all other entries unchanged."""
    infinite = np.isinf(sig)
    return np.where(infinite, 0, sig)

def at_inf2nan(sig):
    """Replace +/-inf entries with NaN and leave all other entries unchanged."""
    infinite = np.isinf(sig)
    return np.where(infinite, np.nan, sig)

# 截面算子

In [4]:
def cs_rank(sig):
    """Percentile rank of ``sig`` shifted by one.

    Computes ``sig.rank(pct=True) * (2 - 1) + 1``, i.e. the percentile rank
    plus 1.
    """
    lower, upper = 1, 2
    pct_rank = sig.rank(pct=True)
    return pct_rank * (upper - lower) + lower

def cs_scale(sig):
    """Min-max scale ``sig`` so its minimum maps to 1 and its maximum to 2."""
    lowest = sig.min()
    highest = sig.max()
    span = highest - lowest
    return (sig - lowest) / span * (2 - 1) + 1

def cs_zscore(sig):
    """Standardize ``sig``: subtract ``sig.mean()`` and divide by ``sig.std()``."""
    centered = sig - sig.mean()
    return centered / sig.std()

def cs_poslimit(preA, pos_limit):
    """Rescale ``preA`` by ``pos_limit / preA.sum()`` and cap it at ``pos_limit``.

    Only an upper cap is applied; no lower bound is enforced.
    """
    scale = pos_limit / preA.sum()
    scaled = preA * scale
    return np.clip(scaled, None, pos_limit)

def cs_harmonic_mean(sig):
    """Harmonic mean: ``len(sig)`` divided by the sum of reciprocals of ``sig``."""
    count = len(sig)
    reciprocal_total = np.sum(1.0 / sig)
    return count / reciprocal_total

def cs_ind_avg(sig, industry):
    """Group mean of ``sig`` by ``industry``, broadcast back to each element."""
    by_industry = sig.groupby(industry)
    return by_industry.transform('mean')

def cs_indneut(sig, industry):
    """Industry-neutralize ``sig`` by subtracting the ``cs_ind_avg`` group mean."""
    return sig - cs_ind_avg(sig, industry)

def cs_norm(sig, scaling=1):
    """Standardize ``sig`` (mean removed, divided by ``sig.std()``) times ``scaling``."""
    standardized = (sig - sig.mean()) / sig.std()
    return standardized * scaling

def cs_norm_spread(data1, data2):
    """Normalized spread: ``(data2 - data1) / (|data1| + |data2|)``."""
    spread = data2 - data1
    magnitude = np.abs(data1) + np.abs(data2)
    return spread / magnitude

def cs_edge_flip(sig, percent):
    """Negate the elements of ``sig`` that lie farthest from its median.

    An element is flipped when its absolute distance from the median strictly
    exceeds the ``percent * 100``-th percentile of those distances.
    """
    center = np.median(sig)
    distance = np.abs(sig - center)
    cutoff = np.percentile(distance, percent * 100)
    is_edge = distance > cutoff
    return np.where(is_edge, -sig, sig)

def cs_remove_middle(sig, filter_percentile):
    """Set the middle of the distribution to NaN and keep both tails.

    Values strictly between the ``filter_percentile`` and
    ``1 - filter_percentile`` percentiles of ``sig`` become NaN.
    """
    lower_cut = np.percentile(sig, filter_percentile * 100)
    upper_cut = np.percentile(sig, (1 - filter_percentile) * 100)
    in_middle = (sig > lower_cut) & (sig < upper_cut)
    return np.where(in_middle, np.nan, sig)

def cs_winsor(sig, filter_percentile):
    """Winsorize ``sig`` with the same fraction applied to both tails.

    Delegates to ``scipy.stats.mstats.winsorize`` and returns its result.
    """
    tail_limits = (filter_percentile, filter_percentile)
    return mstats.winsorize(sig, limits=tail_limits)

def cs_multi_regress(sig, factor, intercept=False):
    """Least-squares regression of ``sig`` on one or more factors.

    Args:
        sig: Dependent variable.
        factor: Sequence of factor arrays, stacked as columns of the design
            matrix.
        intercept: When true, a column of ones is prepended to the design
            matrix. The model itself is always fit with
            ``fit_intercept=False``.

    Returns:
        Tuple ``(y_hat, residual, beta, X)``: fitted values, ``sig - y_hat``,
        the fitted coefficients and the design matrix used.
    """
    design = np.column_stack(factor)
    if intercept:
        ones = np.ones(len(sig))
        design = np.column_stack((ones, design))
    regression = LinearRegression(fit_intercept=False)
    regression.fit(design, sig)
    fitted = regression.predict(design)
    return fitted, sig - fitted, regression.coef_, design

def cs_multi_regress_predict(sig, factor, intercept=True, lookback=20, shift=1):
    """Rolling-window regression returning the last fitted value per window.

    For each ``i`` in ``range(lookback, len(sig))`` the window
    ``[i - lookback, i)`` is fit with ``cs_multi_regress`` and the final
    fitted value of that window is kept. The result is left-padded with
    ``lookback + shift - 1`` NaNs.

    NOTE(review): the output length is ``len(sig) + shift - 1``, so it only
    matches ``len(sig)`` when ``shift == 1``.
    """
    fitted_tail = []
    for end in range(lookback, len(sig)):
        start = end - lookback
        window_factors = [f[start:end] for f in factor]
        y_hat = cs_multi_regress(sig[start:end], window_factors, intercept)[0]
        fitted_tail.append(y_hat[-1])
    padding = [np.nan] * (lookback + shift - 1)
    return np.array(padding + fitted_tail)

def cs_neut_with_sector(sig, factor):
    """Residual of ``sig`` after regressing it on one factor.

    Uses ``cs_multi_regress`` with its default ``intercept`` setting.
    """
    fitted = cs_multi_regress(sig, [factor])[0]
    return sig - fitted

def cs_demean(sig):
    """Subtract ``sig.mean()`` from every element of ``sig``."""
    average = sig.mean()
    return sig - average

def cs_cut(sigA, sigB):
    """``sigB`` multiplied by the sign of ``sigA`` minus the demeaned ``sigB``."""
    direction = np.sign(sigA - cs_demean(sigB))
    return direction * sigB

def cs_umr(sigA, sigB):
    """``sigB`` multiplied by the gap between ``sigA`` and the demeaned ``sigB``."""
    gap = sigA - cs_demean(sigB)
    return gap * sigB

# 时序算子

In [49]:
def ts_correlation(sigA, sigB, lookback):
    """Rolling correlation between ``sigA`` and ``sigB`` over ``lookback`` rows."""
    window = sigA.rolling(window=lookback)
    return window.corr(sigB)

def ts_delay(sig, days):
    """Lag ``sig`` by ``days`` rows using ``shift``."""
    return sig.shift(periods=days)

def ts_delta(sig, lookback):
    """Difference between ``sig`` and its value ``lookback`` rows earlier."""
    lagged = sig.shift(lookback)
    return sig - lagged

def ts_detrend(sig, lookback):
    """Fit a straight line to the last ``lookback`` observations of ``sig``.

    The regressor is ``0, 1, ..., lookback - 1``.

    Returns:
        Tuple ``(beta, alpha, y_hat, error, rsq)``: slope, intercept, fitted
        values, residuals and R-squared. If fewer than ``lookback``
        observations are available, a tuple of five NaNs is returned.
    """
    tail = sig[-lookback:]
    if len(tail) < lookback:
        return (np.nan,) * 5
    steps = np.arange(lookback)
    design = np.column_stack((steps, np.ones(lookback)))
    slope, intercept = np.linalg.lstsq(design, tail, rcond=None)[0]
    fitted = intercept + slope * steps
    residual = tail - fitted
    total_ss = np.sum((tail - np.mean(tail)) ** 2)
    r_squared = 1 - np.sum(residual ** 2) / total_ss
    return slope, intercept, fitted, residual, r_squared

def ts_fill(sig, lookback):
    """Forward-fill missing values, carrying a value over at most ``lookback`` rows.

    Uses ``Series.ffill`` rather than ``fillna(method='ffill')``, which is
    deprecated in pandas 2.x and removed in pandas 3.0.
    """
    return sig.ffill(limit=lookback)

def ts_harmonic_mean(sig, lookback):
    """Rolling harmonic mean of ``sig`` over ``lookback`` rows.

    Computed as ``lookback`` divided by the rolling sum of reciprocals. The
    previous implementation divided by a ``Rolling`` object directly, which
    raises ``TypeError``.

    Windows with fewer than ``lookback`` valid observations yield NaN.
    """
    reciprocal_sum = (1.0 / sig).rolling(window=lookback).sum()
    return lookback / reciprocal_sum

def ts_ir(sig, lookback):
    """Rolling mean of ``sig`` divided by its rolling standard deviation."""
    rolling_mean = sig.rolling(window=lookback).mean()
    rolling_std = sig.rolling(window=lookback).std()
    return rolling_mean / rolling_std

def ts_kurtosis(sig, lookback):
    """Rolling kurtosis of ``sig`` over ``lookback`` rows.

    Each window is passed to ``scipy.stats.kurtosis`` with
    ``nan_policy='omit'``.
    """
    def _window_kurtosis(window):
        return kurtosis(window, nan_policy='omit')

    return sig.rolling(window=lookback).apply(_window_kurtosis)

def ts_skewness(sig, lookback):
    """Rolling skewness of ``sig`` over ``lookback`` rows.

    Each window is passed to ``scipy.stats.skew`` with ``nan_policy='omit'``.
    """
    def _window_skew(window):
        return skew(window, nan_policy='omit')

    return sig.rolling(window=lookback).apply(_window_skew)

def ts_mean(sig, lookback):
    """Rolling mean of ``sig`` over ``lookback`` rows."""
    window = sig.rolling(window=lookback)
    return window.mean()

def ts_sum(sig, lookback):
    """Rolling sum of ``sig`` over ``lookback`` rows."""
    window = sig.rolling(window=lookback)
    return window.sum()

def ts_std(sig, lookback):
    """Rolling standard deviation of ``sig`` over ``lookback`` rows."""
    window = sig.rolling(window=lookback)
    return window.std()

def ts_cov(sig, lookback):
    """Rolling covariance of ``sig`` over ``lookback`` rows.

    ``cov()`` is called without an ``other`` argument.
    NOTE(review): for a Series this presumably reduces to the rolling
    variance of ``sig`` with itself; confirm that is intended.
    """
    window = sig.rolling(window=lookback)
    return window.cov()

def ts_median(sig, lookback):
    """Rolling median of ``sig`` over ``lookback`` rows."""
    window = sig.rolling(window=lookback)
    return window.median()

def ts_wmean(sig, lookback):
    """Rolling linearly weighted mean of ``sig``.

    Weights are ``1, 2, ..., lookback``, so the last observation in each
    window carries the largest weight.
    """
    weights = np.arange(1, lookback + 1)

    def _weighted_average(values):
        return np.dot(values, weights) / weights.sum()

    return sig.rolling(window=lookback).apply(_weighted_average, raw=True)

def ts_ewm(sig, lookback):
    """Exponentially weighted mean of ``sig`` with ``span=lookback``, ``adjust=False``."""
    smoother = sig.ewm(span=lookback, adjust=False)
    return smoother.mean()

def ts_mon(sig, lookback):
    """Sum of the absolute ``lookback``-row differences of ``sig``.

    This is a single aggregate over the whole input, not a rolling series.
    """
    changes = sig.diff(lookback)
    return changes.abs().sum()

def ts_to_max(sig, lookback):
    """Ratio of ``sig`` to its rolling maximum over ``lookback`` rows."""
    rolling_max = sig.rolling(window=lookback).max()
    return sig / rolling_max

def ts_to_min(sig, lookback):
    """Ratio of ``sig`` to its rolling minimum over ``lookback`` rows."""
    rolling_min = sig.rolling(window=lookback).min()
    return sig / rolling_min

def ts_to_maxmin_norm(sig, lookback):
    """Position of ``sig`` within its rolling range: ``(sig - min) / (max - min)``."""
    window = sig.rolling(window=lookback)
    floor = window.min()
    ceiling = window.max()
    return (sig - floor) / (ceiling - floor)

def ts_max_to_min(sig, lookback):
    """Rolling range of ``sig``: rolling maximum minus rolling minimum."""
    window = sig.rolling(window=lookback)
    return window.max() - window.min()

def ts_to_mean(sig, lookback):
    """Ratio of ``sig`` to its rolling mean over ``lookback`` rows."""
    rolling_mean = sig.rolling(window=lookback).mean()
    return sig / rolling_mean

def ts_to_ewm(sig, lookback):
    """Ratio of ``sig`` to its exponentially weighted mean.

    The mean uses ``span=lookback`` and ``adjust=False``.
    """
    smoothed = sig.ewm(span=lookback, adjust=False).mean()
    return sig / smoothed

def ts_to_wm(sig, lookback):
    """Ratio of ``sig`` to its rolling linearly weighted mean.

    Weights are ``1, 2, ..., lookback``, so the last observation in each
    window carries the largest weight. The previous body returned the
    weighted mean itself and never divided ``sig`` by it.
    """
    weights = np.arange(1, lookback + 1)
    weight_total = weights.sum()
    weighted_mean = sig.rolling(window=lookback).apply(
        lambda x: np.dot(x, weights) / weight_total, raw=True
    )
    return sig / weighted_mean

def ts_pctchg_abs(sig, lookback):
    """Absolute ``lookback``-row change divided by the lagged value.

    Only the numerator is made absolute; the lagged base keeps its sign.
    """
    absolute_change = sig.diff(lookback).abs()
    base = sig.shift(lookback)
    return absolute_change / base

def ts_pctchg(sig, lookback):
    """Fractional change of ``sig`` relative to its value ``lookback`` rows earlier."""
    change = sig.diff(lookback)
    base = sig.shift(lookback)
    return change / base

def ts_meanrank(sig, lookback):
    """Rolling mean of the within-window ranks of ``sig``.

    NOTE(review): the ranks of ``n`` values always average ``(n + 1) / 2``,
    so every full window yields the same number; confirm the intended
    definition.
    """
    def _mean_of_ranks(window):
        return rankdata(window).mean()

    return sig.rolling(window=lookback).apply(_mean_of_ranks)

def ts_rankcorr(sigA, sigB, lookback):
    """Rolling Spearman (rank) correlation between ``sigA`` and ``sigB``.

    Within each window of ``lookback`` rows both series are ranked and the
    Pearson correlation of the ranks is returned. The previous body was
    identical to ``ts_correlation`` and applied no ranking.

    ``sigB`` is looked up by the index labels of each ``sigA`` window, so the
    two series must share a unique index. Pairs with a NaN on either side
    are dropped before ranking.
    """
    def _spearman(window_a):
        window_b = sigB.loc[window_a.index]
        # Rank only complete pairs so a missing value on one side does not
        # distort the ranks on the other.
        complete = window_a.notna().to_numpy() & window_b.notna().to_numpy()
        ranks_a = window_a[complete].rank()
        ranks_b = window_b[complete].rank()
        return ranks_a.corr(ranks_b)

    return sigA.rolling(window=lookback).apply(_spearman, raw=False)

def ts_log_pctchg(sig, lookback):
    """Log of the ratio of ``sig`` to its value ``lookback`` rows earlier."""
    ratio = sig / sig.shift(lookback)
    return np.log(ratio)

def ts_regress(x, y, lookback):
    """Least-squares line of ``y`` on ``x`` over the last ``lookback`` observations.

    Returns:
        Tuple ``(beta, alpha)``: slope and intercept. If either input has
        fewer than ``lookback`` observations, ``(nan, nan)`` is returned so
        callers can always unpack two values. The previous implementation
        returned a single scalar NaN in that case.
    """
    x = x[-lookback:]
    y = y[-lookback:]
    if len(x) < lookback or len(y) < lookback:
        return np.nan, np.nan
    # Design matrix [x, 1] so lstsq returns (slope, intercept).
    A = np.vstack([x, np.ones(len(x))]).T
    beta, alpha = np.linalg.lstsq(A, y, rcond=None)[0]
    return beta, alpha

def ts_product(sig, lookback):
    """Rolling product of ``sig`` over ``lookback`` rows, via ``np.prod`` per window."""
    window = sig.rolling(window=lookback)
    return window.apply(np.prod)

def ts_ret(sig, lookback):
    """Change over ``lookback`` rows divided by the absolute lagged value."""
    lagged = sig.shift(lookback)
    return (sig - lagged) / np.abs(lagged)

def ts_std_norm(sig, lookback):
    """``sig`` divided by its rolling standard deviation over ``lookback`` rows."""
    rolling_std = sig.rolling(window=lookback).std()
    return sig / rolling_std

def ts_argmax(sig, lookback):
    """1-based position of the maximum within each rolling window."""
    # np.argmax gives the 0-based position inside the window; shift it to 1-based.
    positions = sig.rolling(window=lookback).apply(np.argmax)
    return positions + 1

def ts_argmin(sig, lookback):
    """1-based position of the minimum within each rolling window."""
    # np.argmin gives the 0-based position inside the window; shift it to 1-based.
    positions = sig.rolling(window=lookback).apply(np.argmin)
    return positions + 1

def ts_max(sig, lookback):
    """Rolling maximum of ``sig`` over ``lookback`` rows."""
    window = sig.rolling(window=lookback)
    return window.max()

def ts_min(sig, lookback):
    """Rolling minimum of ``sig`` over ``lookback`` rows."""
    window = sig.rolling(window=lookback)
    return window.min()

def ts_rank(sig, lookback):
    """Rank of the latest observation within each rolling window.

    Ranks come from ``scipy.stats.rankdata`` applied to the window; the rank
    of the window's last element is returned.
    """
    def _rank_of_last(window):
        return rankdata(window)[-1]

    return sig.rolling(window=lookback).apply(_rank_of_last)

def ts_cokurt(sigA, sigB, lookback):
    """Rolling symmetric cokurtosis of ``sigA`` and ``sigB``.

    For each window this returns the standardized fourth co-moment
    ``E[(a - mean_a)^2 (b - mean_b)^2] / (var_a * var_b)`` using population
    (biased) moments. When both inputs are the same series it equals the
    non-excess kurtosis of the window.

    The previous body called ``kurtosis(x, y, ...)``, which passes ``y`` as
    scipy's ``axis`` argument instead of computing a joint statistic.

    ``sigB`` is looked up by the index labels of each ``sigA`` window, so the
    two series must share a unique index. Pairs with a NaN on either side
    are dropped; a window with no valid pair or zero variance yields NaN.
    """
    def cokurt_aggregate(x):
        a = np.asarray(x, dtype=float)
        b = np.asarray(sigB.loc[x.index], dtype=float)
        complete = ~(np.isnan(a) | np.isnan(b))
        a, b = a[complete], b[complete]
        if a.size == 0:
            return np.nan
        dev_a = a - a.mean()
        dev_b = b - b.mean()
        scale = np.mean(dev_a ** 2) * np.mean(dev_b ** 2)
        if scale == 0:
            return np.nan
        return np.mean(dev_a ** 2 * dev_b ** 2) / scale

    return sigA.rolling(window=lookback).apply(cokurt_aggregate, raw=False)

def ts_coskew(sigA, sigB, lookback):
    """Rolling coskewness of ``sigA`` and ``sigB``.

    For each window this returns the standardized third co-moment
    ``E[(a - mean_a)^2 (b - mean_b)] / (var_a * std_b)`` using population
    (biased) moments. When both inputs are the same series it equals the
    biased skewness of the window.

    The previous body called ``skew(x, y, ...)``, which passes ``y`` as
    scipy's ``axis`` argument instead of computing a joint statistic.

    ``sigB`` is looked up by the index labels of each ``sigA`` window, so the
    two series must share a unique index. Pairs with a NaN on either side
    are dropped; a window with no valid pair or zero variance yields NaN.
    """
    def coskew_aggregate(x):
        a = np.asarray(x, dtype=float)
        b = np.asarray(sigB.loc[x.index], dtype=float)
        complete = ~(np.isnan(a) | np.isnan(b))
        a, b = a[complete], b[complete]
        if a.size == 0:
            return np.nan
        dev_a = a - a.mean()
        dev_b = b - b.mean()
        scale = np.mean(dev_a ** 2) * np.sqrt(np.mean(dev_b ** 2))
        if scale == 0:
            return np.nan
        return np.mean(dev_a ** 2 * dev_b) / scale

    return sigA.rolling(window=lookback).apply(coskew_aggregate, raw=False)

def ts_transform_frequency(sig, freq=20, start=0):
    """Down-sample ``sig`` by taking every ``freq``-th element from position ``start``."""
    stride = slice(start, None, freq)
    return sig[stride]

def ts_quantile(sig, lookback, lamba):
    """Rolling ``lamba``-quantile of ``sig`` over ``lookback`` rows."""
    window = sig.rolling(window=lookback)
    return window.quantile(lamba)


def ts_split(sigA, sigB, lookback=20, method='mean', lamba=0.2):
    """Aggregate ``sigB`` over the rows where ``sigA`` is largest in each window.

    For every rolling window of ``lookback`` rows, the top ``lamba`` fraction
    of rows ranked by ``sigA`` is selected and the matching ``sigB`` values
    are reduced with ``method``.

    Args:
        sigA: Series that ranks the rows inside each window.
        sigB: Series that is aggregated; looked up by the index labels of each
            ``sigA`` window, so both must share a unique index.
        lookback: Rolling window length.
        method: One of ``'mean'``, ``'max'`` or ``'std'`` (population
            standard deviation, ``ddof=0``).
        lamba: Fraction of each window to keep. At least one row is always
            kept; previously a fraction rounding down to zero rows selected
            the entire window, because ``[-0:]`` slices everything.

    Raises:
        ValueError: If ``method`` is not supported. This is now checked up
            front rather than inside the first evaluated window.
    """
    reducers = {
        'mean': lambda selected: selected.mean(),
        'max': lambda selected: selected.max(),
        'std': lambda selected: selected.std(ddof=0),
    }
    if method not in reducers:
        raise ValueError("Unsupported method")
    reduce_selected = reducers[method]

    def split_aggregate(x):
        y = sigB.loc[x.index]
        top_count = max(1, int(len(x) * lamba))
        # argsort is ascending, so the last `top_count` positions are the
        # rows with the largest sigA values.
        top_positions = np.argsort(x.to_numpy())[-top_count:]
        return reduce_selected(y.iloc[top_positions])

    return sigA.rolling(window=lookback).apply(split_aggregate, raw=False)


# 因子构造

In [6]:
# Load the training data from a path relative to the notebook's working
# directory, using the pyarrow CSV engine.
ORIGINAL_TRAIN = pd.read_csv('./input/train.csv',engine = 'pyarrow')
# Bare last expression: the notebook renders the frame as this cell's output.
ORIGINAL_TRAIN

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.50,1.000026,8493.03,1.000000,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.000660,20605.09,1.000000,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.00,1.000298,18995.00,1.000000,-8.389950,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.90,1.000214,479032.40,1.000000,-4.010200,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.10,1.000000,-7.349849,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,1.000434,319862.40,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.40,1.000900,93393.07,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,125631.72,0.999210,669893.00,0.999008,-1.540184,26454,480_540_198


In [51]:
# Column groups that the (currently commented-out) feature loop iterates over.
# NOTE(review): 'mid_price' is not among the train.csv columns shown in the
# frame displayed earlier in this notebook; confirm it is created before the
# loop below is re-enabled.
prices = ['reference_price', 'far_price', 'mid_price',
                  'near_price', 'ask_price', 'bid_price', 'wap']
sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

# Work on a copy so ORIGINAL_TRAIN itself is left unmodified by feature columns.
df = ORIGINAL_TRAIN.copy()
# df['price_spread'] = at_sub(df['ask_price'], df['bid_price'])
# df['size_ratio'] = at_div(df['ask_size'], df['bid_size'])
# df['price_impact'] = at_sub(df['far_price'], df['near_price'])
# df['ask_volume'] = at_mul(df['ask_size'], df['ask_price'])
# df['bid_volume'] = at_mul(df['bid_size'], df['bid_price'])


# Window length passed to the ts_* operators in the feature loop.
lookback = 5
# Group rows by (stock_id, date_id) for the per-group operators below.
grouped = df.groupby(['stock_id', 'date_id'])
# for price in (prices + sizes):
#     df[f'{price}_ewm'] = grouped[price].apply(lambda x: ts_ewm(x, lookback))
#     df[f'{price}_mon'] = grouped[price].apply(lambda x: ts_mon(x, lookback))
#     df[f'{price}_max'] = grouped[price].apply(lambda x: ts_to_max(x, lookback))
#     df[f'{price}_min'] = grouped[price].apply(lambda x: ts_to_min(x, lookback))
#     df[f'{price}_maxmin_norm'] = grouped[price].apply(lambda x: ts_to_maxmin_norm(x, lookback))
#     df[f'{price}_max_to_min'] = grouped[price].apply(lambda x: ts_max_to_min(x, lookback))
#     df[f'{price}_to_mean'] = grouped[price].apply(lambda x: ts_to_mean(x, lookback))
#     df[f'{price}_to_ewm'] = grouped[price].apply(lambda x: ts_to_ewm(x, lookback))
#     df[f'{price}_to_wm'] = grouped[price].apply(lambda x: ts_to_wm(x, lookback))
#     df[f'{price}_pctchg_abs'] = grouped[price].apply(lambda x: ts_pctchg_abs(x, lookback))
#     df[f'{price}_pctchg'] = grouped[price].apply(lambda x: ts_pctchg(x, lookback))
#     df[f'{price}_meanrank'] = grouped[price].apply(lambda x: ts_meanrank(x, lookback))
#     df[f'{price}_rankcorr'] = grouped[price].apply(lambda x: ts_rankcorr(x, x, lookback))
#     df[f'{price}_log_pctchg'] = grouped[price].apply(lambda x: ts_log_pctchg(x, lookback))
#     df[f'{price}_product'] = grouped[price].apply(lambda x: ts_product(x, lookback))
