In [1]:
import numpy as np
import pandas as pd

import sqlalchemy

import matplotlib as mpl
import matplotlib.pyplot as plt

from common.funcs import read_ticker_from_sql, concat_shift, batch_concat_shift

In [2]:
df = read_ticker_from_sql('MA2209.ZCE', '20220719')
df.shape

(7978, 20)

In [3]:
# 制作数据集 x, 所有行, 指定列
raw_x = df[['open', 'high', 'low', 'close', 'volume', 'average', 
            'ask1_price', 'ask1_volume', 'bid1_price', 'bid1_volume']]
raw_x.shape

(7978, 10)

In [4]:
# 制作特征集 x, 添加 前1, 3, 5秒的 close, ask1_volume 增量, bid1_volume 增量
columns = ['close', 'ask1_price', 'ask1_volume', 'bid1_price', 'bid1_volume']
data_x = raw_x[0:2000].copy()
periods = 120
data_x = batch_concat_shift(data_x, periods, columns)

In [5]:
data_x.shape

(2000, 610)

In [6]:
# 制作标注集 y -- 如果 close > close_60 则为1(上涨），如果 close_10 <= 0 则为0(没上涨)
data_y = (data_x['close'] >= data_x[f"close_{periods}"]).apply(lambda x: 1 if x else 0)
data_y
data_y.shape

(2000,)

In [7]:
data_y.value_counts()

0    1200
1     800
dtype: int64

In [8]:
# 移除close列
data_x.drop('close', axis=1, inplace=True)
data_x.columns

Index(['open', 'high', 'low', 'volume', 'average', 'ask1_price', 'ask1_volume',
       'bid1_price', 'bid1_volume', 'close_1',
       ...
       'close_119', 'ask1_price_119', 'ask1_volume_119', 'bid1_price_119',
       'bid1_volume_119', 'close_120', 'ask1_price_120', 'ask1_volume_120',
       'bid1_price_120', 'bid1_volume_120'],
      dtype='object', length=609)

In [9]:
data_y.head()

id
1    0
2    1
3    1
4    0
5    0
dtype: int64

In [10]:
# xgb 分类模型
# 训练集和测试集拆分
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=1/5)

In [11]:
X_train.shape, y_train.shape

((1600, 609), (1600,))

In [12]:
import xgboost as xgb

xgb_train = xgb.DMatrix(X_train, y_train)
xgb_test = xgb.DMatrix(X_test, y_test)

In [13]:
# 模型训练
params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eta": 1.0,
    "gamma": 1.0,
    "min_child_weight": 1,
    "max_depth": 10
}

num_round = 50
watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]

model = xgb.train(params, xgb_train, num_round, evals=watchlist)

[0]	train-logloss:0.17519	test-logloss:0.27161
[1]	train-logloss:0.08226	test-logloss:0.19053
[2]	train-logloss:0.04750	test-logloss:0.18143
[3]	train-logloss:0.03153	test-logloss:0.17111
[4]	train-logloss:0.02727	test-logloss:0.17349
[5]	train-logloss:0.02336	test-logloss:0.17520
[6]	train-logloss:0.02121	test-logloss:0.17854
[7]	train-logloss:0.01951	test-logloss:0.17841
[8]	train-logloss:0.01914	test-logloss:0.17874
[9]	train-logloss:0.01700	test-logloss:0.17141
[10]	train-logloss:0.01699	test-logloss:0.17066
[11]	train-logloss:0.01699	test-logloss:0.17063
[12]	train-logloss:0.01699	test-logloss:0.17063
[13]	train-logloss:0.01699	test-logloss:0.17063
[14]	train-logloss:0.01699	test-logloss:0.17063
[15]	train-logloss:0.01699	test-logloss:0.17063
[16]	train-logloss:0.01699	test-logloss:0.17063
[17]	train-logloss:0.01699	test-logloss:0.17063
[18]	train-logloss:0.01699	test-logloss:0.17063
[19]	train-logloss:0.01699	test-logloss:0.17063
[20]	train-logloss:0.01699	test-logloss:0.17063
[2

In [14]:
model.save_model('./mod/ma2209.mod')

In [15]:
# 加载模型进行预测
bst = xgb.Booster()
bst.load_model('./mod/ma2209.mod')
pred = bst.predict(xgb_test)
print(pred, pred.shape)

[9.89755929e-01 9.93606806e-01 9.87131953e-01 3.86833446e-03
 9.99496222e-01 5.10101579e-03 1.74574123e-03 2.23082118e-03
 3.54262628e-03 1.07391493e-03 9.56718028e-01 2.11576819e-01
 3.10020521e-03 7.25596845e-02 9.95850682e-01 2.31318757e-01
 9.75320756e-01 3.31238960e-03 7.69287464e-04 9.96107638e-01
 9.95400965e-01 1.44052377e-03 9.31752741e-01 8.88393901e-04
 5.67201614e-01 9.98275042e-01 5.95149770e-03 5.19456249e-03
 9.87520099e-01 9.96869862e-01 9.94231462e-01 3.73623846e-03
 9.95348394e-01 9.97308373e-01 1.35852327e-03 9.97641087e-01
 4.96015593e-04 9.89883006e-01 9.99496222e-01 4.62979227e-02
 2.31700251e-03 8.87245119e-01 5.68495272e-03 2.04071659e-03
 9.75731667e-03 9.98238683e-01 9.96128082e-01 3.46459192e-03
 5.20664151e-04 2.05190224e-03 5.27490955e-03 9.92722571e-01
 9.70169809e-03 3.85868567e-04 4.35594261e-01 6.71140617e-04
 7.60595500e-01 9.84454751e-01 1.36050244e-03 7.96937663e-03
 1.13851335e-02 1.15623008e-02 3.02705646e-01 5.20664151e-04
 3.05841560e-03 2.249019

In [16]:
# 模型评估
from sklearn import metrics
from sklearn.metrics import accuracy_score

y_pred = np.where(pred > 0.5, 1, 0)

print('AUC: %.4f' % metrics.roc_auc_score(y_test, pred))
print('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
print('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
print('F1-Score: %.4f' % metrics.f1_score(y_test, y_pred))
print('Precesion: %.4f' % metrics.precision_score(y_test, y_pred))

AUC: 0.9854
ACC: 0.9300
Recall: 0.9437
F1-Score: 0.9152
Precesion: 0.8882


In [17]:
y_pred

array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [18]:
from xgboost import plot_importance

# 特征重要性
# plot_importance(model)
# plt.show()

In [19]:
data_v_x = raw_x[2000:1000].copy()
data_v_x = batch_concat_shift(data_v_x, periods, columns)

data_v_y = (data_v_x['close'] >= data_v_x[f"close_{periods}"]).apply(lambda x: 1 if x else 0)

data_v_x.drop('close', axis=1, inplace=True)
print(data_v_x.shape, data_v_y.shape)


xgb_data_v = xgb.DMatrix(data_v_x, data_v_y)

v_pred = bst.predict(xgb_data_v)

y_v_pred = np.where(v_pred > 0.5, 1, 0)
y_v_pred

(0, 609) (0,)


array([], shape=(0, 0), dtype=int64)

In [20]:
print('AUC: %.4f' % metrics.roc_auc_score(data_v_y, v_pred))
print('ACC: %.4f' % metrics.accuracy_score(data_v_y, y_v_pred))
print('Recall: %.4f' % metrics.recall_score(data_v_y, y_v_pred))
print('F1-Score: %.4f' % metrics.f1_score(data_v_y, y_v_pred))
print('Precesion: %.4f' % metrics.precision_score(data_v_y, y_v_pred))

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [21]:
importance_eval_list = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
for i, importance_type in enumerate(importance_eval_list):
    feat_importance = model.get_score(importance_type=importance_type)
    feat_importance = pd.DataFrame.from_dict(feat_importance, orient='index')
    feat_importance.columns = [importance_type]
    if i == 0:
        df_temp = feat_importance
    else:
        df_temp = pd.merge(df_temp, feat_importance, how='outer', left_index=True, right_index=True)

df_temp.sort_values(by='weight', ascending=False, inplace=True)
print('特征重要性结果为:\n',df_temp)

特征重要性结果为:
                 weight        gain       cover  total_gain  total_cover
close_120         13.0   15.479531   48.087631  201.233902   625.139221
volume             5.0    3.454446   34.928825   17.272232   174.644135
bid1_price         4.0   71.566200   69.971565  286.264801   279.886261
ask1_price_120     4.0   13.467570   55.993008   53.870281   223.972031
ask1_price         4.0   77.235832  134.141891  308.943329   536.567566
...                ...         ...         ...         ...          ...
ask1_volume_35     1.0    2.816666   15.000000    2.816666    15.000000
bid1_volume_32     1.0    1.278661   21.236250    1.278661    21.236250
ask1_volume_32     1.0    1.059927   13.331123    1.059927    13.331123
ask1_volume_28     1.0    0.787644   10.557967    0.787644    10.557967
bid1_price_120     1.0  163.395874  228.250000  163.395874   228.250000

[88 rows x 5 columns]
