In [1]:
import scipy as sp
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from pathlib import Path

from tqdm import *
import KDCJ_003_

import warnings

warnings.filterwarnings("ignore")

# KDCJ_ALPHA 构建

In [2]:
DATA_DIR = Path("./database")

start_date = "2020-01-02"
end_date = "2022-01-01"
n = 5  # 调仓周期


df = pd.read_pickle(DATA_DIR / "20140101_20221214_全A_日级别.pkl")
whole_trade_datetime = sorted(list(set(df.index.get_level_values(1))))


def get_pre_trade_day(date, n):
    return whole_trade_datetime[whole_trade_datetime.index(pd.Timestamp(date)) - n]


df = (
    df.reset_index()
    .set_index(["date"])
    .sort_index()
    .loc[get_pre_trade_day(start_date, 5) : end_date]
    .reset_index()
    .set_index(["order_book_id", "date"])
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,volume,high,num_trades,prev_close,limit_down,limit_up,total_turnover,close,open,low
order_book_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
000881.XSHE,2019-12-25,1307570.00,6.8015,1862.0,6.83,6.0887,7.4351,8.944289e+06,6.7817,6.7619,6.7025
603718.XSHG,2019-12-25,3476938.00,12.9041,5009.0,12.77,11.4847,14.0435,4.425453e+07,12.6742,12.7841,12.5642
000066.XSHE,2019-12-25,63652544.00,16.0046,49662.0,15.82,14.0077,17.1162,1.021046e+09,15.7980,15.4734,15.4439
300569.XSHE,2019-12-25,6922977.66,4.7927,4122.0,14.75,4.2716,5.2205,3.537368e+07,4.7315,4.7927,4.7090
600666.XSHG,2019-12-25,29482622.00,2.1600,5802.0,2.12,2.0100,2.2300,6.082612e+07,2.0200,2.1000,2.0100
...,...,...,...,...,...,...,...,...,...,...,...
600355.XSHG,2021-12-31,28588200.00,3.3700,14799.0,3.30,2.9700,3.6300,9.413848e+07,3.2900,3.3500,3.2600
600098.XSHG,2021-12-31,9419275.00,8.1478,9033.0,8.05,7.1429,8.7291,7.674863e+07,7.9607,7.9508,7.9212
300052.XSHE,2021-12-31,52702059.00,34.1800,94030.0,31.41,25.1300,37.6900,1.674599e+09,32.6600,31.3500,29.8300
300047.XSHE,2021-12-31,22417211.00,7.8895,26268.0,7.79,6.2138,9.3257,1.755550e+08,7.8396,7.7897,7.6900


In [3]:
open = df.open.unstack("order_book_id")
high = df.high.unstack("order_book_id")
low = df.low.unstack("order_book_id")
close = df.close.unstack("order_book_id")
prev_close = df.prev_close.unstack("order_book_id")
volume = df.volume.unstack("order_book_id")
amount = df.total_turnover.unstack("order_book_id")
avg_price = amount.div(volume, fill_value=0)

In [4]:
alpha_names = ["alpha_{}".format(str(i).rjust(3, "0")) for i in range(1, 2)]
print(alpha_names)

['alpha_001']


In [5]:
date_list = sorted(open.index.tolist())
factor_get = []

for k in alpha_names:
    alpha_xxx_temp = pd.DataFrame()
    try:
        for i in tqdm(range(0, len(open.index) - n)):
            KDCJ_alpha = KDCJ_003_.KDCJ_003(
                open.iloc[i : i + n],
                high.iloc[i : i + n],
                low.iloc[i : i + n],
                close.iloc[i : i + n],
                prev_close.iloc[i : i + n],
                volume.iloc[i : i + n],
                amount.iloc[i : i + n],
                avg_price.iloc[i : i + n],
            )
            alpha_xxx_temp = pd.concat(
                [
                    alpha_xxx_temp,
                    pd.DataFrame(
                        eval("KDCJ_alpha.{}()".format(k)),
                        columns=[date_list[i + n - 1]],
                    ),
                ],
                axis=1,
            )
        alpha_xxx_temp.T.to_pickle("{}{}.pkl".format("alpha", k))
        factor_get.append(k)
    except:
        print("Error:{}".format(k))

  0%|          | 0/486 [00:00<?, ?it/s]

100%|██████████| 486/486 [00:03<00:00, 138.81it/s]


In [6]:
alpha001 = alpha_xxx_temp.T
alpha001

order_book_id,000001.XSHE,000002.XSHE,000004.XSHE,000005.XSHE,000006.XSHE,000007.XSHE,000008.XSHE,000009.XSHE,000010.XSHE,000011.XSHE,...,600927.XSHG,001296.XSHE,301190.XSHE,603230.XSHG,688167.XSHG,688210.XSHG,301166.XSHE,301189.XSHE,688206.XSHG,688265.XSHG
2019-12-31,-0.954802,-0.456834,-0.426963,-0.473601,-0.691409,-0.626237,-0.615699,0.012899,0.210884,-0.704172,...,,,,,,,,,,
2020-01-02,-0.948989,-0.367020,0.329907,0.480550,0.234563,-0.627711,-0.904517,0.248376,0.507199,-0.493646,...,,,,,,,,,,
2020-01-03,-0.114227,0.051937,0.701938,-0.125823,-0.357070,0.094934,-0.642734,-0.201459,0.593036,-0.995865,...,,,,,,,,,,
2020-01-06,-0.057578,-0.169076,0.863307,-0.650650,-0.416217,-0.860734,-0.622761,-0.100654,0.450534,-0.825727,...,,,,,,,,,,
2020-01-07,0.070051,-0.481087,0.790417,-0.952367,-0.324018,-0.220279,-0.423454,0.864921,-0.458158,-0.955000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-24,-0.465651,-0.988065,-0.685277,-0.164253,-0.882409,-0.827780,0.676005,0.858182,-0.730938,-0.850289,...,,,,,,,,,,
2021-12-27,0.388486,-0.863486,-0.220270,-0.342094,-0.454617,-0.671960,0.644243,0.217459,0.292714,0.896469,...,-1.000000,,,,,,,,,
2021-12-28,-0.127794,-0.869226,-0.500873,-0.154246,-0.586817,-0.865592,-0.129398,-0.457436,0.121017,0.196260,...,-0.156556,-1.000000,1.000000,-1.000000,-1.000000,,,,,
2021-12-29,0.008089,-0.911615,-0.637070,-0.176441,-0.460045,-0.693674,-0.446153,-0.332817,0.514183,-0.804636,...,-0.282043,-0.963121,-0.628732,0.218586,-0.902763,1.000000,,,,


In [8]:
factor_get

[]

# 因子检验

In [None]:
path = ".\\database\\"

change_day = 5

# 暴露度获取
market_cap = pd.DataFrame(
    pd.read_pickle("{}market_cap.pkl".format(path)), columns=["market_cap"]
).loc[start_date:end_date]
industry_exposure = pd.read_pickle("{}industry_exposure.pkl".format(path)).loc[
    start_date:end_date
]

# 新股 涨停 ST 停牌筛选
new_stock_filter = pd.read_pickle("{}new_stock_filter.pkl".format(path))
st_filter = pd.read_pickle("{}st_filter.pkl".format(path))
suspended_filter = pd.read_pickle("{}suspended_filter.pkl".format(path))
limit_up_down_filter = pd.read_pickle("{}limit_up_down_filter.pkl".format(path))
for k in [new_stock_filter, st_filter, suspended_filter, limit_up_down_filter]:
    k.columns.names = ["order_book_id"]

# 价格数据
close = pd.read_pickle("{}20140101_20221214_全A_日级别.pkl".format(path)).close.unstack(
    "order_book_id"
)


# 离群值处理
def filter_extreme_MAD(series, n):
    median = series.median()
    new_median = ((series - median).abs()).median()
    if new_median == 0:
        new_median = np.inf
    return series.clip(median - n * new_median, median + n * new_median)


# 中性化处理
def neutralization(factor, market_cap=market_cap, industry_exposure=industry_exposure):
    factor_resid = pd.DataFrame()
    factor_ols = pd.concat(
        [factor.stack(), market_cap, industry_exposure], axis=1
    ).dropna()
    datetime_list = sorted(list(set(market_cap.index.get_level_values(0))))
    for i in datetime_list:
        try:
            factor_ols_temp = factor_ols.loc[i]  # 截面数据做回归
            x = factor_ols_temp.iloc[:, 1:]  # 市值/行业
            y = factor_ols_temp.iloc[:, 0]  # 因子值
            factor_ols_resid_temp = pd.DataFrame(
                sm.OLS(y.astype(float), x.astype(float), hasconst=False, missing="drop")
                .fit()
                .resid,
                columns=["{}".format(i)],
            )
            factor_resid = pd.concat([factor_resid, factor_ols_resid_temp], axis=1)
        except:
            factor_resid = pd.concat([factor_resid, pd.DataFrame()], axis=1)
    factor_resid = factor_resid.T
    factor_resid.index = pd.to_datetime(factor_resid.index)
    return factor_resid


# 单因子检测
def Factor_Return_N_IC(factor, n=change_day, Rank_IC=True, close=close):

    date_list_whole = sorted(list(set(factor.index.get_level_values(0))))
    start_date = date_list_whole[0]
    end_date = date_list_whole[-1]
    stock_list = sorted(list(set(factor.index.get_level_values(1))))
    close = close.pct_change(n).shift(-n).stack()
    close = pd.concat([close, factor], axis=1).dropna().reset_index()
    close.columns = ["date", "stock", "change_days", "factor"]
    if Rank_IC == True:
        rank_ic = (
            close.groupby("date")["change_days", "factor"]
            .corr(method="spearman")
            .reset_index()
            .set_index(["date"])
        )
        return rank_ic[rank_ic.level_1 == "factor"][["change_days"]]


# ic_ir检测
def ic_ir(x, name):
    t_stat, p_value = stats.ttest_1samp(x, 0)
    IC = {
        "name": name,
        "IC mean": round(x.mean()[0], 4),
        "IC std": round(x.std()[0], 4),
        "IR": round(x.mean()[0] / x.std()[0], 4),
        "t_stat": round(t_stat[0], 4),
        "p_value": round(p_value[0], 4),
        "IC>0": round(len(x[x > 0].dropna()) / len(x), 4),
        "ABS_IC>2%": round((len(x[abs(x) > 0.02].dropna()) / len(x)), 4),
    }
    return pd.DataFrame([IC])


factor_name = factor_get

In [None]:
ic_summary = pd.DataFrame()
for j in factor_name:

    print(j)

    # 因子获取
    factor_alpha = (
        pd.read_pickle("{}{}.pkl".format(path, j))
        .dropna(axis=1, how="all")
        .loc[start_date:end_date]
    )

    # 券池 新股 涨停 ST 停牌筛选
    factor_alpha = (
        factor_alpha.mask(new_stock_filter)
        .mask(st_filter)
        .mask(suspended_filter)
        .mask(limit_up_down_filter)
        .dropna(axis=1, how="all")
    )

    # 离群值处理
    factor_alpha = factor_alpha.apply(lambda x: filter_extreme_MAD(x, 3), axis=1)

    # 标准化处理
    factor_alpha = factor_alpha.sub(factor_alpha.mean(axis=1), axis=0).div(
        factor_alpha.std(axis=1), axis=0
    )

    # 中性化处理
    factor_alpha = neutralization(factor_alpha)

    # 单因子检验
    Result = Factor_Return_N_IC(factor_alpha.stack())

    # ICIR
    ic_summary = pd.concat([ic_summary, ic_ir(Result, j)], axis=0)

ic_summary

alpha_001
alpha_002
alpha_003


Unnamed: 0,name,IC mean,IC std,IR,t_stat,p_value,IC>0,ABS_IC>2%
0,alpha_001,0.0201,0.034,0.5917,13.0303,0.0,0.7443,0.5876
0,alpha_002,-0.0059,0.0387,-0.1534,-3.3774,0.0008,0.4577,0.5835
0,alpha_003,-0.0411,0.0827,-0.4971,-10.9471,0.0,0.2928,0.8041
