In [1]:
#! /usr/bin/python
from sklearn import preprocessing
from sklearn.utils import shuffle
import numpy as np
import xgboost as xgb
import time



In [2]:
# load data
data = np.loadtxt('../data/stock_train_data_20170923.csv',delimiter=',',skiprows=1)
print('data shape:', data.shape)

# data preprocessing
np.random.shuffle(data)
data_X = data[:,1:89]
data_Y = data[:,90]
weight_samples = data[:,89]
group_samples = data[:,91]
era_samples = data[:,92]
scaler = preprocessing.StandardScaler().fit(data_X)
data_X = scaler.transform(data_X)
data_cv = xgb.DMatrix(data_X, data_Y, weight = weight_samples)
print('data_X shape:', data_X.shape)
print('data_Y shape:', data_Y.shape)

('data shape:', (296104, 93))
('data_X shape:', (296104, 88))
('data_Y shape:', (296104,))


In [3]:

# setup parameters for xgboost
param = {}
# logistic regression for binary classification. Output probability.
param['objective'] = 'binary:logistic' 
param['metrics'] = {'logloss'}
param['eta'] = 0.1          # step size of each boosting step
param['max_depth'] = 30       # maximum depth of the tree
param['silent'] = 1
param['nthread'] = 7
param['seed'] = 0
param['nrounds'] = 2
param['lambda'] = 10
param['alpha'] = 5
#param['eval_metric'] = "auc"
# https://rdrr.io/cran/xgboost/man/xgb.train.html
# https://www.cnblogs.com/haobang008/p/5909207.html

start = time.time();
print ('running cross validation')
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is standard deviation of the metric
res = xgb.cv(param, data_cv,      
             nfold=10, metrics='auc',
             callbacks=[xgb.callback.print_evaluation(show_stdv=True),
                        xgb.callback.early_stop(3)])
end = time.time();
print (res)
print('time elapse:', end- start);

running cross validation
[0]	train-auc:0.959516+0.00112376	test-auc:0.63857+0.00543886
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 3 rounds.
[1]	train-auc:0.978562+0.00182015	test-auc:0.661235+0.00507074
[2]	train-auc:0.986188+0.00110891	test-auc:0.676576+0.00420345
[3]	train-auc:0.989892+0.000862911	test-auc:0.687171+0.00403417
[4]	train-auc:0.992222+0.000621927	test-auc:0.695638+0.00502428
[5]	train-auc:0.99374+0.000440746	test-auc:0.700944+0.00511168
[6]	train-auc:0.994918+0.000361786	test-auc:0.706333+0.00494038
[7]	train-auc:0.995782+0.0002484	test-auc:0.711167+0.00473141
[8]	train-auc:0.996445+0.000207139	test-auc:0.715027+0.00471442
[9]	train-auc:0.996965+0.000168862	test-auc:0.718522+0.00441242
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.638570      0.005439        0.959516       0.001124
1       0.661235      0.005071        0.978562       0.001820
2       0.676576

In [4]:
# model training

per_train = 0.9
# data preprocessing
#np.random.shuffle(data)
# rebuild the data with era (time)

data_X = data[:,1:89]
data_Y = data[:,90]
weight_samples = data[:,89]
group_samples = data[:,91].reshape(-1,1)
era_samples = data[:,92]

#data_X = np.append(data_X, group_samples, axis= 1)
scaler = preprocessing.StandardScaler().fit(data_X)
data_X = scaler.transform(data_X)
data_cv = xgb.DMatrix(data_X, data_Y, weight = weight_samples)
print('data_X shape:', data_X.shape)
print('data_Y shape:', data_Y.shape)

# more work needed for traing set selection...
test_X = data_X[int(data_X.shape[0] * per_train):]
test_Y = data_Y[int(data_X.shape[0] * per_train):]
weight_test = weight_samples[int(data_X.shape[0] * per_train):]
train_X = data_X[0:int(data_X.shape[0] * per_train)]
train_Y = data_Y[0:int(data_X.shape[0] * per_train)]
weight_train = weight_samples[0:int(data_X.shape[0] * per_train)]
print("train_X",train_X.shape)
print("test_X",test_X.shape)

xg_train = xgb.DMatrix( train_X, label=train_Y, weight = weight_train)
xg_test = xgb.DMatrix(test_X, label=test_Y, weight = weight_test)
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]

num_round = 10
start = time.time();
bst = xgb.train(param, xg_train, num_round, watchlist);
end = time.time();
print('time elapse train:', end- start);
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
start = time.time();
y_pred = bst.predict( xg_test )
end = time.time();
print('time elapse predict:', end- start);
print(y_pred)
print('error:', np.sum(test_Y == (y_pred > 0)).astype(float) / test_Y.shape[0])


('data_X shape:', (296104, 88))
('data_Y shape:', (296104,))
('train_X', (266493, 88))
('test_X', (29611, 88))
[0]	train-error:0.072963	test-error:0.415483
[1]	train-error:0.051789	test-error:0.378795
[2]	train-error:0.041339	test-error:0.370312
[3]	train-error:0.035421	test-error:0.364416
[4]	train-error:0.031281	test-error:0.356286
[5]	train-error:0.028635	test-error:0.355504
[6]	train-error:0.025786	test-error:0.351767
[7]	train-error:0.023528	test-error:0.346358
[8]	train-error:0.021502	test-error:0.342586
[9]	train-error:0.019711	test-error:0.340713
('time elapse train:', 73.52350211143494)
('time elapse predict:', 0.0007841587066650391)
[ 0.51396251  0.32289192  0.46447942 ...,  0.62798262  0.58350259
  0.41744906]
('error:', 0.53125527675525985)


In [5]:
data_to_Predict = np.loadtxt('../data/stock_test_data_20170923.csv',delimiter=',',skiprows=1)
ids = data_to_Predict[:,0]
test_X = data_to_Predict[:,1:-1]
test_X = xgb.DMatrix(test_X)

start = time.time();
yprob = bst.predict( test_X )
end = time.time();
print('time elapse predict:', end- start);

data_pred = np.concatenate((data_to_Predict[:,0].reshape(yprob.shape[0],-1), yprob.reshape(yprob.shape[0],-1)), axis=1)

f = open('./data_pred.csv', 'w')
f.write('id,proba\n')
for i in range(data_pred.shape[0]):
    s = '%d,%.5f\n'%(data_pred[i,0], data_pred[i,1])
    f.write(s)
f.close()

('time elapse predict:', 0.3291018009185791)
