In [1]:
import scipy as sp
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from pathlib import Path

from tqdm import *
import KDCJ_003_

import warnings

warnings.filterwarnings("ignore")

# KDCJ_ALPHA 构建

In [None]:
path = ".\\database\\"

start_date = "2020-01-02"
end_date = "2022-01-01"
n = 5  # 调仓周期

df = pd.read_pickle("{}20140101_20221214_全A_日级别.pkl".format(path))

whole_trade_datetime = sorted(list(set(df.index.get_level_values(1))))


def get_pre_trade_day(date, n):
    return whole_trade_datetime[whole_trade_datetime.index(pd.Timestamp(date)) - n]


df = (
    df.reset_index()
    .set_index(["date"])
    .sort_index()
    .loc[get_pre_trade_day(start_date, 5) : end_date]
    .reset_index()
    .set_index(["order_book_id", "date"])
)

open = df.open.unstack("order_book_id")
high = df.high.unstack("order_book_id")
low = df.low.unstack("order_book_id")
close = df.close.unstack("order_book_id")
prev_close = df.prev_close.unstack("order_book_id")
volume = df.volume.unstack("order_book_id")
amount = df.total_turnover.unstack("order_book_id")
avg_price = amount.div(volume, fill_value=0)

In [3]:
open

order_book_id,000001.XSHE,000002.XSHE,000004.XSHE,000005.XSHE,000006.XSHE,000007.XSHE,000008.XSHE,000009.XSHE,000010.XSHE,000011.XSHE,...,688787.XSHG,688788.XSHG,688789.XSHG,688793.XSHG,688798.XSHG,688799.XSHG,688800.XSHG,688819.XSHG,688981.XSHG,689009.XSHG
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-25,15.7668,26.1110,21.69,3.05,4.7913,9.78,3.6081,4.8214,3.42,8.1311,...,,,,,,,,,,
2019-12-26,15.6613,26.1969,22.00,3.03,4.7731,9.60,3.6181,5.4477,3.41,8.0959,...,,,,,,,,,,
2019-12-27,15.8434,26.8239,23.73,3.10,4.8095,9.50,3.6380,6.1635,3.39,8.2104,...,,,,,,,,,,
2019-12-30,15.7763,26.9270,22.03,3.06,4.7731,9.84,3.6081,6.2629,3.43,8.1575,...,,,,,,,,,,
2019-12-31,15.8818,26.9270,22.69,3.09,4.8550,9.72,3.6081,6.2331,3.38,8.2809,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,17.0263,18.6365,19.03,2.24,4.2595,5.87,2.8800,14.0870,3.73,11.1477,...,85.5971,92.4479,273.7359,91.2697,210.7623,37.2796,139.7000,41.2687,52.53,65.00
2021-12-28,16.9182,18.3822,18.95,2.26,4.3082,5.81,2.8300,13.9473,3.77,11.0814,...,87.4577,91.0966,277.5881,97.2208,211.6674,39.1377,137.8255,41.6816,52.83,66.60
2021-12-29,16.8593,18.5046,19.57,2.30,4.2595,5.75,2.6900,14.2167,3.79,10.9772,...,85.7961,92.0679,267.7635,99.2045,217.7048,38.7269,144.7651,41.8782,53.13,68.50
2021-12-30,16.4663,18.2221,19.38,2.29,4.2693,5.81,2.6300,14.2167,3.75,10.8540,...,85.3185,93.4685,260.7857,101.1882,223.5732,39.0203,145.4830,41.7602,52.75,67.99


In [None]:
alpha_names = ["alpha_{}".format(str(i).rjust(3, "0")) for i in range(1, 4)]
print(alpha_names)

['alpha_001', 'alpha_002', 'alpha_003']


In [None]:
date_list = sorted(open.index.tolist())
factor_get = []

for k in alpha_names:
    alpha_xxx_temp = pd.DataFrame()
    try:
        for i in tqdm(range(0, len(open.index) - n)):
            KDCJ_alpha = KDCJ_003_.KDCJ_003(
                open.iloc[i : i + n],
                high.iloc[i : i + n],
                low.iloc[i : i + n],
                close.iloc[i : i + n],
                prev_close.iloc[i : i + n],
                volume.iloc[i : i + n],
                amount.iloc[i : i + n],
                avg_price.iloc[i : i + n],
            )
            alpha_xxx_temp = pd.concat(
                [
                    alpha_xxx_temp,
                    pd.DataFrame(
                        eval("KDCJ_alpha.{}()".format(k)),
                        columns=[date_list[i + n - 1]],
                    ),
                ],
                axis=1,
            )
        alpha_xxx_temp.T.to_pickle("{}{}.pkl".format(path, k))
        factor_get.append(k)
    except:
        print("Error:{}".format(k))

100%|██████████| 486/486 [00:10<00:00, 46.69it/s]
100%|██████████| 486/486 [00:09<00:00, 49.04it/s] 
100%|██████████| 486/486 [00:12<00:00, 37.54it/s]


In [10]:
factor_get

['alpha_001', 'alpha_002', 'alpha_003']

# 因子检验

In [None]:
path = ".\\database\\"

change_day = 5

# 暴露度获取
market_cap = pd.DataFrame(
    pd.read_pickle("{}market_cap.pkl".format(path)), columns=["market_cap"]
).loc[start_date:end_date]
industry_exposure = pd.read_pickle("{}industry_exposure.pkl".format(path)).loc[
    start_date:end_date
]

# 新股 涨停 ST 停牌筛选
new_stock_filter = pd.read_pickle("{}new_stock_filter.pkl".format(path))
st_filter = pd.read_pickle("{}st_filter.pkl".format(path))
suspended_filter = pd.read_pickle("{}suspended_filter.pkl".format(path))
limit_up_down_filter = pd.read_pickle("{}limit_up_down_filter.pkl".format(path))
for k in [new_stock_filter, st_filter, suspended_filter, limit_up_down_filter]:
    k.columns.names = ["order_book_id"]

# 价格数据
close = pd.read_pickle("{}20140101_20221214_全A_日级别.pkl".format(path)).close.unstack(
    "order_book_id"
)


# 离群值处理
def filter_extreme_MAD(series, n):
    median = series.median()
    new_median = ((series - median).abs()).median()
    if new_median == 0:
        new_median = np.inf
    return series.clip(median - n * new_median, median + n * new_median)


# 中性化处理
def neutralization(factor, market_cap=market_cap, industry_exposure=industry_exposure):
    factor_resid = pd.DataFrame()
    factor_ols = pd.concat(
        [factor.stack(), market_cap, industry_exposure], axis=1
    ).dropna()
    datetime_list = sorted(list(set(market_cap.index.get_level_values(0))))
    for i in datetime_list:
        try:
            factor_ols_temp = factor_ols.loc[i]  # 截面数据做回归
            x = factor_ols_temp.iloc[:, 1:]  # 市值/行业
            y = factor_ols_temp.iloc[:, 0]  # 因子值
            factor_ols_resid_temp = pd.DataFrame(
                sm.OLS(y.astype(float), x.astype(float), hasconst=False, missing="drop")
                .fit()
                .resid,
                columns=["{}".format(i)],
            )
            factor_resid = pd.concat([factor_resid, factor_ols_resid_temp], axis=1)
        except:
            factor_resid = pd.concat([factor_resid, pd.DataFrame()], axis=1)
    factor_resid = factor_resid.T
    factor_resid.index = pd.to_datetime(factor_resid.index)
    return factor_resid


# 单因子检测
def Factor_Return_N_IC(factor, n=change_day, Rank_IC=True, close=close):

    date_list_whole = sorted(list(set(factor.index.get_level_values(0))))
    start_date = date_list_whole[0]
    end_date = date_list_whole[-1]
    stock_list = sorted(list(set(factor.index.get_level_values(1))))
    close = close.pct_change(n).shift(-n).stack()
    close = pd.concat([close, factor], axis=1).dropna().reset_index()
    close.columns = ["date", "stock", "change_days", "factor"]
    if Rank_IC == True:
        rank_ic = (
            close.groupby("date")["change_days", "factor"]
            .corr(method="spearman")
            .reset_index()
            .set_index(["date"])
        )
        return rank_ic[rank_ic.level_1 == "factor"][["change_days"]]


# ic_ir检测
def ic_ir(x, name):
    t_stat, p_value = stats.ttest_1samp(x, 0)
    IC = {
        "name": name,
        "IC mean": round(x.mean()[0], 4),
        "IC std": round(x.std()[0], 4),
        "IR": round(x.mean()[0] / x.std()[0], 4),
        "t_stat": round(t_stat[0], 4),
        "p_value": round(p_value[0], 4),
        "IC>0": round(len(x[x > 0].dropna()) / len(x), 4),
        "ABS_IC>2%": round((len(x[abs(x) > 0.02].dropna()) / len(x)), 4),
    }
    return pd.DataFrame([IC])


factor_name = factor_get

In [None]:
ic_summary = pd.DataFrame()
for j in factor_name:

    print(j)

    # 因子获取
    factor_alpha = (
        pd.read_pickle("{}{}.pkl".format(path, j))
        .dropna(axis=1, how="all")
        .loc[start_date:end_date]
    )

    # 券池 新股 涨停 ST 停牌筛选
    factor_alpha = (
        factor_alpha.mask(new_stock_filter)
        .mask(st_filter)
        .mask(suspended_filter)
        .mask(limit_up_down_filter)
        .dropna(axis=1, how="all")
    )

    # 离群值处理
    factor_alpha = factor_alpha.apply(lambda x: filter_extreme_MAD(x, 3), axis=1)

    # 标准化处理
    factor_alpha = factor_alpha.sub(factor_alpha.mean(axis=1), axis=0).div(
        factor_alpha.std(axis=1), axis=0
    )

    # 中性化处理
    factor_alpha = neutralization(factor_alpha)

    # 单因子检验
    Result = Factor_Return_N_IC(factor_alpha.stack())

    # ICIR
    ic_summary = pd.concat([ic_summary, ic_ir(Result, j)], axis=0)

ic_summary

alpha_001
alpha_002
alpha_003


Unnamed: 0,name,IC mean,IC std,IR,t_stat,p_value,IC>0,ABS_IC>2%
0,alpha_001,0.0201,0.034,0.5917,13.0303,0.0,0.7443,0.5876
0,alpha_002,-0.0059,0.0387,-0.1534,-3.3774,0.0008,0.4577,0.5835
0,alpha_003,-0.0411,0.0827,-0.4971,-10.9471,0.0,0.2928,0.8041
