In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from machine_learning_class import machine_learning_class
import machine_learning_class as ml
from datetime import datetime
import scipy.stats as sts
import matplotlib.pyplot as plt


In [17]:
### model train: data, parameters, timer
# Source HDF5 file and the name of the regression target to learn.
train_data_path = 'feature_matrix.h5'
target_name = 'Target_60_Seconds_MidReturn60'

# Project helper that reads the 'clean_feature' key and exposes the split as
# X_train / y_train / X_test / y_test (presumably filled in by get_X_Y_train()
# -- confirm in machine_learning_class).
ml_obj = machine_learning_class(train_data_path, target_name, select_key='clean_feature')
ml_obj.get_X_Y_train()

# LightGBM datasets; the evaluation set references the training set.
data_train = lgb.Dataset(data=ml_obj.X_train, label=ml_obj.y_train, silent=True)
data_eval = lgb.Dataset(data=ml_obj.X_test, label=ml_obj.y_test, reference=data_train)

# Training parameters -- these drive model quality, tune with care.
# Note: lgb.train in this cell is also given a per-iteration learning-rate
# schedule, so 'learning_rate' here is not the only place the rate is set.
model_1_params = dict(
    application='regression',
    boosting='gbdt',
    num_iterations=500,
    learning_rate=0.1,
    max_depth=15,
    num_leaves=2000,
    verbose=2,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=5,
    min_data_in_leaf=500,
    lambda_l2=1,
    num_threads=15,
    early_stopping_round=10,
    metric='l2',
)

# Wall-clock reference for the training-time report printed after training.
start = datetime.now()
# Minimal alternative without a custom metric:
# lgb_model = lgb.train(model_1_params, data_train, valid_sets=data_eval)

########
def corr_metric(y_hat, data):
    """Custom LightGBM evaluation metric: Pearson correlation with the labels.

    Parameters
    ----------
    y_hat : array-like
        Predictions for the dataset being evaluated.
    data : object with a ``get_label()`` method (e.g. ``lgb.Dataset``)
        Dataset whose labels are compared against ``y_hat``.

    Returns
    -------
    tuple
        ``('Correlation', value, True)`` -- metric name, metric value, and a
        flag telling LightGBM that higher values are better.

    Notes
    -----
    When either input is constant the correlation is undefined and
    ``np.corrcoef`` yields NaN (with a RuntimeWarning). A NaN metric makes
    every "is this better?" comparison false, so 0.0 is reported instead.
    """
    y_real = data.get_label()
    # Silence the 0/0 warning raised for zero-variance inputs; the non-finite
    # result is handled explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.corrcoef(y_hat, y_real)[0, 1]
    if not np.isfinite(corr):
        corr = 0.0
    return 'Correlation', float(corr), True

# Train while reporting the custom metric (feval) next to l2 on both the
# validation and the training set.
#
# The learning-rate decay and the metric history are wired in as callbacks
# rather than the `learning_rates=` / `evals_result=` keyword arguments:
#   * lgb.reset_parameter sets learning_rate = 0.05 * 0.999 ** iteration before
#     each boosting round, so the rate shrinks as the iteration count grows;
#   * lgb.record_evaluation fills `evals_result` with the per-iteration metric
#     values, keeping the history available after training instead of
#     discarding it in a throwaway dict.
evals_result = {}
lgb_model = lgb.train(
    model_1_params,
    data_train,
    valid_sets=[data_eval, data_train],
    valid_names=['val', 'train'],
    feval=corr_metric,
    callbacks=[
        lgb.reset_parameter(learning_rate=lambda iteration: 0.05 * (0.999 ** iteration)),
        lgb.record_evaluation(evals_result),
    ],
)
########


end = datetime.now()
# total_seconds() covers days, seconds and microseconds in one value.
total_minute = (end - start).total_seconds() / 60
print('total training time is {} minutes'.format(total_minute))
lgb_model.save_model('lgbmodel3' + '.txt')



(36847668, 45) (36847668,)
X_train:shape (29478134, 45)
y_train:shape (29478134,)
X_test:shape (7369534, 45)
y_test:shape (7369534,)




[1]	train's l2: 3.09512e-06	train's Correlation: 0.313008	val's l2: 3.09389e-06	val's Correlation: 0.307407
Training until validation scores don't improve for 10 rounds.
[2]	train's l2: 3.06787e-06	train's Correlation: 0.331793	val's l2: 3.06722e-06	val's Correlation: 0.32566
[3]	train's l2: 3.04539e-06	train's Correlation: 0.339903	val's l2: 3.04533e-06	val's Correlation: 0.333305
[4]	train's l2: 3.02207e-06	train's Correlation: 0.343907	val's l2: 3.02256e-06	val's Correlation: 0.337193
[5]	train's l2: 3.00025e-06	train's Correlation: 0.346373	val's l2: 3.0013e-06	val's Correlation: 0.339502
[6]	train's l2: 2.98096e-06	train's Correlation: 0.349811	val's l2: 2.98251e-06	val's Correlation: 0.342862
[7]	train's l2: 2.9617e-06	train's Correlation: 0.351214	val's l2: 2.96378e-06	val's Correlation: 0.344203
[8]	train's l2: 2.94549e-06	train's Correlation: 0.352726	val's l2: 2.94812e-06	val's Correlation: 0.345537
[9]	train's l2: 2.92937e-06	train's Correlation: 0.353662	val's l2: 2.93256e-

[76]	train's l2: 2.64873e-06	train's Correlation: 0.396267	val's l2: 2.67865e-06	val's Correlation: 0.381048
[77]	train's l2: 2.64733e-06	train's Correlation: 0.396758	val's l2: 2.67758e-06	val's Correlation: 0.381426
[78]	train's l2: 2.64588e-06	train's Correlation: 0.397273	val's l2: 2.67644e-06	val's Correlation: 0.381835
[79]	train's l2: 2.64477e-06	train's Correlation: 0.397638	val's l2: 2.6756e-06	val's Correlation: 0.382107
[80]	train's l2: 2.64337e-06	train's Correlation: 0.398135	val's l2: 2.67457e-06	val's Correlation: 0.382469
[81]	train's l2: 2.64179e-06	train's Correlation: 0.39873	val's l2: 2.67333e-06	val's Correlation: 0.382942
[82]	train's l2: 2.64042e-06	train's Correlation: 0.399233	val's l2: 2.67226e-06	val's Correlation: 0.38334
[83]	train's l2: 2.63935e-06	train's Correlation: 0.399602	val's l2: 2.67144e-06	val's Correlation: 0.383621
[84]	train's l2: 2.63797e-06	train's Correlation: 0.400108	val's l2: 2.67042e-06	val's Correlation: 0.383995
[85]	train's l2: 2.636

[151]	train's l2: 2.57705e-06	train's Correlation: 0.423106	val's l2: 2.62763e-06	val's Correlation: 0.400275
[152]	train's l2: 2.57629e-06	train's Correlation: 0.423394	val's l2: 2.62712e-06	val's Correlation: 0.400472
[153]	train's l2: 2.57557e-06	train's Correlation: 0.423667	val's l2: 2.6266e-06	val's Correlation: 0.40067
[154]	train's l2: 2.57498e-06	train's Correlation: 0.423891	val's l2: 2.62623e-06	val's Correlation: 0.400812
[155]	train's l2: 2.57416e-06	train's Correlation: 0.424211	val's l2: 2.62568e-06	val's Correlation: 0.401033
[156]	train's l2: 2.57355e-06	train's Correlation: 0.424445	val's l2: 2.62527e-06	val's Correlation: 0.401192
[157]	train's l2: 2.57282e-06	train's Correlation: 0.424728	val's l2: 2.62478e-06	val's Correlation: 0.401386
[158]	train's l2: 2.57207e-06	train's Correlation: 0.425028	val's l2: 2.62429e-06	val's Correlation: 0.401587
[159]	train's l2: 2.57147e-06	train's Correlation: 0.425257	val's l2: 2.62393e-06	val's Correlation: 0.401724
[160]	train'

[226]	train's l2: 2.53204e-06	train's Correlation: 0.440662	val's l2: 2.5989e-06	val's Correlation: 0.411653
[227]	train's l2: 2.53144e-06	train's Correlation: 0.440897	val's l2: 2.59854e-06	val's Correlation: 0.411796
[228]	train's l2: 2.53096e-06	train's Correlation: 0.441084	val's l2: 2.59826e-06	val's Correlation: 0.411904
[229]	train's l2: 2.53059e-06	train's Correlation: 0.441229	val's l2: 2.59802e-06	val's Correlation: 0.411999
[230]	train's l2: 2.53009e-06	train's Correlation: 0.441423	val's l2: 2.59772e-06	val's Correlation: 0.412121
[231]	train's l2: 2.52957e-06	train's Correlation: 0.441631	val's l2: 2.59737e-06	val's Correlation: 0.412262
[232]	train's l2: 2.52907e-06	train's Correlation: 0.441828	val's l2: 2.59705e-06	val's Correlation: 0.412392
[233]	train's l2: 2.52871e-06	train's Correlation: 0.441972	val's l2: 2.59683e-06	val's Correlation: 0.412478
[234]	train's l2: 2.52828e-06	train's Correlation: 0.442139	val's l2: 2.59657e-06	val's Correlation: 0.412586
[235]	train

[301]	train's l2: 2.49907e-06	train's Correlation: 0.453486	val's l2: 2.57907e-06	val's Correlation: 0.419508
[302]	train's l2: 2.49874e-06	train's Correlation: 0.45361	val's l2: 2.57888e-06	val's Correlation: 0.419581
[303]	train's l2: 2.49827e-06	train's Correlation: 0.453793	val's l2: 2.57859e-06	val's Correlation: 0.419697
[304]	train's l2: 2.49783e-06	train's Correlation: 0.453961	val's l2: 2.57835e-06	val's Correlation: 0.41979
[305]	train's l2: 2.49739e-06	train's Correlation: 0.454137	val's l2: 2.57809e-06	val's Correlation: 0.419895
[306]	train's l2: 2.49696e-06	train's Correlation: 0.454305	val's l2: 2.57784e-06	val's Correlation: 0.419992
[307]	train's l2: 2.49663e-06	train's Correlation: 0.454432	val's l2: 2.57763e-06	val's Correlation: 0.420075
[308]	train's l2: 2.49627e-06	train's Correlation: 0.454572	val's l2: 2.57741e-06	val's Correlation: 0.420163
[309]	train's l2: 2.49583e-06	train's Correlation: 0.454743	val's l2: 2.57715e-06	val's Correlation: 0.420268
[310]	train'

[376]	train's l2: 2.47245e-06	train's Correlation: 0.463783	val's l2: 2.56356e-06	val's Correlation: 0.425642
[377]	train's l2: 2.47208e-06	train's Correlation: 0.463926	val's l2: 2.56335e-06	val's Correlation: 0.425721
[378]	train's l2: 2.47171e-06	train's Correlation: 0.464073	val's l2: 2.56315e-06	val's Correlation: 0.425802
[379]	train's l2: 2.47143e-06	train's Correlation: 0.464176	val's l2: 2.56299e-06	val's Correlation: 0.425863
[380]	train's l2: 2.4711e-06	train's Correlation: 0.464304	val's l2: 2.56283e-06	val's Correlation: 0.425929
[381]	train's l2: 2.47077e-06	train's Correlation: 0.464426	val's l2: 2.56264e-06	val's Correlation: 0.425999
[382]	train's l2: 2.47044e-06	train's Correlation: 0.464547	val's l2: 2.56244e-06	val's Correlation: 0.426078
[383]	train's l2: 2.47014e-06	train's Correlation: 0.464663	val's l2: 2.5623e-06	val's Correlation: 0.42613
[384]	train's l2: 2.46975e-06	train's Correlation: 0.464811	val's l2: 2.56206e-06	val's Correlation: 0.426223
[385]	train's

[451]	train's l2: 2.44962e-06	train's Correlation: 0.472542	val's l2: 2.55065e-06	val's Correlation: 0.430723
[452]	train's l2: 2.44923e-06	train's Correlation: 0.472689	val's l2: 2.55041e-06	val's Correlation: 0.430814
[453]	train's l2: 2.44883e-06	train's Correlation: 0.472838	val's l2: 2.55016e-06	val's Correlation: 0.43091
[454]	train's l2: 2.44858e-06	train's Correlation: 0.472933	val's l2: 2.55003e-06	val's Correlation: 0.430962
[455]	train's l2: 2.44834e-06	train's Correlation: 0.473029	val's l2: 2.54991e-06	val's Correlation: 0.431013
[456]	train's l2: 2.44816e-06	train's Correlation: 0.473101	val's l2: 2.54982e-06	val's Correlation: 0.431049
[457]	train's l2: 2.44781e-06	train's Correlation: 0.473234	val's l2: 2.54962e-06	val's Correlation: 0.431128
[458]	train's l2: 2.44764e-06	train's Correlation: 0.473307	val's l2: 2.54952e-06	val's Correlation: 0.43117
[459]	train's l2: 2.44742e-06	train's Correlation: 0.473391	val's l2: 2.54936e-06	val's Correlation: 0.431233
[460]	train'

<lightgbm.basic.Booster at 0x1aef0a3278>

In [18]:
# Score both splits with the trained booster. The iteration count is passed
# explicitly (as the model-predict cell does) so the result does not depend on
# the library's default for `num_iteration`.
predict_train = lgb_model.predict(ml_obj.X_train, num_iteration=lgb_model.best_iteration)
predict_eval = lgb_model.predict(ml_obj.X_test, num_iteration=lgb_model.best_iteration)

In [19]:
# Pearson correlation matrix of training-set predictions vs. training targets;
# the off-diagonal entry is the correlation itself.
np.corrcoef(predict_train, ml_obj.y_train)

array([[1.        , 0.47775274],
       [0.47775274, 1.        ]])

In [20]:
# Spearman rank correlation (and its p-value) of training-set predictions vs.
# training targets.
sts.spearmanr(predict_train, ml_obj.y_train)

SpearmanrResult(correlation=0.42270311395294513, pvalue=0.0)

In [21]:
# Pearson correlation matrix of test-set predictions vs. test targets;
# the off-diagonal entry is the correlation itself.
np.corrcoef(predict_eval, ml_obj.y_test)

array([[1.        , 0.43380716],
       [0.43380716, 1.        ]])

In [22]:
# Spearman rank correlation (and its p-value) of test-set predictions vs.
# test targets.
sts.spearmanr(predict_eval, ml_obj.y_test)

SpearmanrResult(correlation=0.4005097759485668, pvalue=0.0)

In [23]:
# Standard deviation of the predictions on the training and test splits.
predict_train.std(), predict_eval.std()

(0.0006894407366038288, 0.0006819168794399814)

In [24]:
# Standard deviation of the training targets, for comparison with the spread
# of the predictions.
ml_obj.y_train.std()

0.0017678113370546134

In [25]:
# Standard deviation of the test targets, for comparison with the spread of
# the predictions.
ml_obj.y_test.std()

0.0017673056175202674

In [26]:
# Read the cleaned feature table in a single cell and close the store as soon
# as the read finishes. Previously open / read / close lived in three cells
# that had to be run in the right order, leaving a read-only handle open on a
# file this notebook writes to later.
with pd.HDFStore(train_data_path, 'r') as hd:
    d = hd['clean_feature']

In [31]:
# Column labels of the feature table as a Series (default integer index).
names = pd.Series(d.columns.values)

In [32]:
# Display the column-name Series built in the previous cell.
names

0                             Transaction_ACD_nperiod:3
1                        Transaction_OLD_UOS_nperiod:12
2                             Transaction_UOS_nperiod:9
3                              Transaction_VR_nperiod:3
4                          Transaction_OLD_VR_nperiod:9
5                             Transaction_WAD_nperiod:9
6                             Transaction_KDJ_nperiod:3
7                             Transaction_RSV_nperiod:3
8                      Tran_Price_Change_Vol_nperiod:20
9                          Mid_Change_Origin_nperiod:60
10                                      VRSI_nperiod:15
11                                    RSI_TA_nperiod:15
12                                      BIAS_nperiod:15
13                           Transaction_CYM_nperiod:15
14                                Ask_Bid_CYS_nperiod:3
15                               Ask_Bid_CYS_nperiod:15
16                            Transaction_CHO_nperiod:6
17                              Ask_Bid_1_New_np

In [36]:
# Store the column names next to the features, under their own key. The path
# comes from train_data_path so it cannot drift from the file that was read.
names.to_hdf(train_data_path, key='columns_names')

In [None]:

### model predict
# Load a sample frame, separate the 'Target' column from the remaining
# columns, and score those columns with a booster restored from a model file.
print('use the model to predict...')
data_ = pd.read_hdf('data_sample.h5', key='X')

# Boolean mask over the columns: True only where the label is 'Target'.
target_mask = data_.columns == 'Target'
data_input = data_.loc[:, ~target_mask]
data_input_target = data_.loc[:, target_mask].values

# NOTE: this rebinds lgb_model to the booster loaded from disk.
lgb_model = lgb.Booster(model_file='sample_model.txt')
feature_values = data_input.values
y_testset_predict = lgb_model.predict(feature_values, num_iteration=lgb_model.best_iteration)
# Optional scatter of predictions against targets:
# plt.plot(y_testset_predict, data_input_target, '.')
# plt.show()