In [50]:
import pandas as pd
import numpy as np
import h5py


def mad(s, n=3):
    median = s.quantile(0.5)
    diff_median = ((s - median).abs()).quantile(0.5)
    max_range = median + 1.4826 * n * diff_median
    min_range = median - 1.4826 * n * diff_median
    return np.clip(s, min_range, max_range)


f = h5py.File("factors.h5", "r")
asset = np.array(f["factors"]).T

asset = pd.DataFrame(asset)

col_name = np.array(f["col_name"])
col_name = [i.decode('UTF-8') for i in col_name]
asset.columns = col_name

f.close()

asset['time'] = asset['time'].astype(int)
asset['id'] = asset['id'].astype(int)
asset['id'] = 'id'+asset['id'].astype(str)
asset.sort_values(['time', 'id'], inplace=True)
asset.head()


Unnamed: 0,time,id,ret_1,fwd_ret_3,mmt_120,mmt_overnight_15,mmt_smooth,mmt_high_252,turnover_mean,turnover_std,mmt_relative_15,mmt_amplitude_adjust_120,mmt_time_rank_5
0,20130104,id159915,,0.034482,,,,,,,,,
1,20130104,id162411,,0.005744,,,,,,,,,
2,20130104,id510050,,-0.009261,,,,,,,,,
3,20130104,id510300,,0.001712,,,,,,,,,
4,20130104,id510410,,0.007868,,,,,,,,,


In [51]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm


reb = 3  # 每n天调仓
estimate_dates = 10  # 每10天重新估计一次模型


full_sample_dates = asset['time'].unique()
full_sample_dates = np.sort(full_sample_dates)
dates = full_sample_dates[252:]  # 保证第一次调仓前最少有一年的训练样本
n_dates = len(dates)
is_rebalance = np.linspace(1, n_dates, n_dates)
is_rebalance = (np.mod(is_rebalance, reb) == 0)
is_rebalance[0] = 1  # 第一天总是调仓

X_names = ['mmt_120', 'mmt_overnight_15',
           'mmt_smooth', 'mmt_high_252', 'turnover_mean', 'turnover_std',
           'mmt_relative_15', 'mmt_amplitude_adjust_120', 'mmt_time_rank_5']

y_names = ['target']
asset['target'] = (asset[f'fwd_ret_{reb}'] > (reb * 0.003)).astype(int)

model1 = XGBClassifier(learning_rate=0.1, objective='binary:logistic', n_estimators=1000,max_depth=8, use_label_encoder=False, colsample_bytree=0.8)
model2 = LGBMClassifier(learning_rate=0.07, n_estimators=1000, max_depth=4, objective='binary', colsample_bytree=0.8)
models = [model1, model2]

In [52]:
def fit_ensemble(first_models, train_data, reb_dates, full_sample_dates, X_names, y_names):
    s = np.where(full_sample_dates == reb_dates)
    train_val_date_split = int(0.7*s[0][0])
    train = train_data[train_data['time'] <= full_sample_dates[train_val_date_split]]
    X_train, y_train = train[X_names].values, train[y_names].values.ravel()
    del train
    val = train_data[train_data['time'] > full_sample_dates[train_val_date_split]]
    X_val, y_val = val[X_names].values, val[y_names].values.ravel()
    del val

    val_features = np.zeros((X_val.shape[0], len(first_models)))
    for i, model in enumerate(first_models):
        model.fit(X_train, y_train, early_stopping_rounds=80, eval_metric="auc", eval_set=[(X_val, y_val)], verbose=False)
        y_hat = model.predict_proba(X_val)[:, 1]
        val_features[:, i] = y_hat

    sec_model = LogisticRegression()
    sec_model.fit(val_features, y_val)

    return sec_model


def predict_ensemble(first_models, sec_model, test_data):
    predict_features = np.zeros((test_data.shape[0], len(first_models)))
    for i, model in enumerate(first_models):
        y_predict = model.predict_proba(test_data)[:, 1]
        predict_features[:, i] = y_predict

    return sec_model.predict_proba(predict_features)[:, 1]


predict_data_list = []
for idx, is_reb in enumerate(tqdm(is_rebalance)):
    if is_reb:
        if idx == 0 or (np.mod(idx+1, estimate_dates) == 0):
            train_data = asset[asset['time'] <
                               dates[idx]].copy()  # 再平衡日的数据不能参与训练
            # 剔除预期收益率为缺失值的行,主要适用于大于1的再平衡周期
            train_data.dropna(subset=[f'fwd_ret_{reb}'], inplace=True)
            sec_model = fit_ensemble(
                models, train_data, dates[idx], full_sample_dates, X_names, y_names)

        test_data = asset[asset['time'] == dates[idx]]
        X_test = test_data[X_names].values
        y_predict = predict_ensemble(models, sec_model, X_test)

        time_id = test_data[['time', 'id']]
        predict_data = time_id.assign(predict_ret=y_predict)
        predict_data_list.append(predict_data)

finall_predict_data = pd.concat(predict_data_list)
asset = pd.merge(asset, finall_predict_data, on=['time', 'id'], how='left')
result = asset[['time', 'id', 'predict_ret']]
result.reset_index(drop=True)
#result.to_hdf('result.h5', 'res')
result.to_csv('res.csv',index=False)

100%|██████████| 2125/2125 [31:48<00:00,  1.11it/s] 


ImportError: Missing optional dependency 'pytables'.  Use pip or conda to install pytables.