In [1]:
#! /usr/bin/python
from sklearn import preprocessing
from sklearn.utils import shuffle
import numpy as np
import xgboost as xgb
import time



In [2]:
# load data
data = np.loadtxt('../data/stock_train_data_20170916.csv',delimiter=',',skiprows=1)
print('data shape:', data.shape)

# data preprocessing
np.random.shuffle(data)
data_X = data[:,1:89]
data_Y = data[:,90]
weight_samples = data[:,89]
group_samples = data[:,91]
era_samples = data[:,92]
scaler = preprocessing.StandardScaler().fit(data_X)
data_X = scaler.transform(data_X)
data_cv = xgb.DMatrix(data_X, data_Y, weight = weight_samples)
print('data_X shape:', data_X.shape)
print('data_Y shape:', data_Y.shape)

('data shape:', (296104, 93))
('data_X shape:', (296104, 88))
('data_Y shape:', (296104,))


In [3]:

# setup parameters for xgboost
param = {}
# logistic regression for binary classification. Output probability.
param['objective'] = 'binary:logistic' 
param['metrics'] = {'logloss'}
param['eta'] = 0.01          # step size of each boosting step
param['max_depth'] = 5       # maximum depth of the tree
param['silent'] = 1
param['nthread'] = 7
param['seed'] = 0
param['nrounds'] = 10
#param['eval_metric'] = "auc"
# https://rdrr.io/cran/xgboost/man/xgb.train.html
# https://www.cnblogs.com/haobang008/p/5909207.html

start = time.time();
print ('running cross validation')
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is standard deviation of the metric
res = xgb.cv(param, data_cv,      
             nfold=10,
             callbacks=[xgb.callback.print_evaluation(show_stdv=True),
                        xgb.callback.early_stop(3)])
end = time.time();
print (res)
print('time elapse:', end- start);

running cross validation
[0]	train-error:0.417643+0.0032226	test-error:0.423281+0.00490464
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 3 rounds.
[1]	train-error:0.415854+0.00422586	test-error:0.421365+0.00641644
[2]	train-error:0.415787+0.00419464	test-error:0.421252+0.00642336
[3]	train-error:0.414837+0.00528925	test-error:0.420231+0.00727345
[4]	train-error:0.413141+0.00530789	test-error:0.418375+0.00619695
[5]	train-error:0.410271+0.00551199	test-error:0.415771+0.00555759
[6]	train-error:0.409962+0.00564219	test-error:0.41503+0.00599006
[7]	train-error:0.409179+0.00595321	test-error:0.414585+0.00578197
[8]	train-error:0.408106+0.00634751	test-error:0.413565+0.00620306
[9]	train-error:0.406519+0.0055898	test-error:0.412121+0.00586417
   test-error-mean  test-error-std  train-error-mean  train-error-std
0         0.423281        0.004905          0.417643         0.003223
1         0.421365      

In [17]:
# model training

# setup parameters for xgboost
param = {}
# logistic regression for binary classification. Output probability.
param['objective'] = 'binary:logistic' 
param['metrics'] = {'logloss'}
param['eta'] = 0.01          # step size of each boosting step
param['max_depth'] = 5       # maximum depth of the tree
param['silent'] = 1
param['nthread'] = 7
param['seed'] = 0
param['nrounds'] = 10

# data preprocessing
np.random.shuffle(data)
data_X = data[:,1:89]
data_Y = data[:,90]
weight_samples = data[:,89]
group_samples = data[:,91]
era_samples = data[:,92]
scaler = preprocessing.StandardScaler().fit(data_X)
data_X = scaler.transform(data_X)
data_cv = xgb.DMatrix(data_X, data_Y, weight = weight_samples)
print('data_X shape:', data_X.shape)
print('data_Y shape:', data_Y.shape)

# more work needed for traing set selection...
test_X = data_X[int(data_X.shape[0] * 0.8):]
test_Y = data_Y[int(data_X.shape[0] * 0.8):]
weight_test = weight_samples[int(data_X.shape[0] * 0.8):]
train_X = data_X[0:int(data_X.shape[0] * 0.8)]
train_Y = data_Y[0:int(data_X.shape[0] * 0.8)]
weight_train = weight_samples[0:int(data_X.shape[0] * 0.8)]
print("train_X",train_X.shape)
print("test_X",test_X.shape)

xg_train = xgb.DMatrix( train_X, label=train_Y, weight = weight_train)
xg_test = xgb.DMatrix(test_X, label=test_Y, weight = weight_test)
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]

num_round = 10
start = time.time();
bst = xgb.train(param, xg_train, num_round, watchlist);
end = time.time();
print('time elapse train:', end- start);
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
start = time.time();
y_pred = bst.predict( xg_test )
end = time.time();
print('time elapse predict:', end- start);
print(y_pred)
print('error:', np.sum(test_Y == (y_pred > 0)).astype(float) / test_Y.shape[0])

('data_X shape:', (296104, 88))
('data_Y shape:', (296104,))
('train_X', (236883, 88))
('test_X', (59221, 88))
[0]	train-error:0.418727	test-error:0.421406
[1]	train-error:0.418727	test-error:0.421406
[2]	train-error:0.418871	test-error:0.421525
[3]	train-error:0.409831	test-error:0.414867
[4]	train-error:0.409212	test-error:0.414363
[5]	train-error:0.404671	test-error:0.409087
[6]	train-error:0.404415	test-error:0.409939
[7]	train-error:0.404237	test-error:0.409295
[8]	train-error:0.403631	test-error:0.408824
[9]	train-error:0.40352	test-error:0.40762
('time elapse train:', 6.09224009513855)
('time elapse predict:', 0.0006649494171142578)
[ 0.49826735  0.50759786  0.50471961 ...,  0.49826735  0.49826735
  0.50290203]
('error:', 0.52660373853869402)


In [46]:
data_to_Predict = np.loadtxt('../data/stock_test_data_20170916.csv',delimiter=',',skiprows=1)
ids = data_to_Predict[:,0]
test_X = data_to_Predict[:,1:-1]
test_X = xgb.DMatrix(test_X)

start = time.time();
yprob = bst.predict( test_X )
end = time.time();
print('time elapse predict:', end- start);

data_pred = np.concatenate((data_to_Predict[:,0].reshape(yprob.shape[0],-1), yprob.reshape(yprob.shape[0],-1)), axis=1)

f = open('./data_pred.csv', 'w')
f.write('id,proba\n')
for i in range(data_pred.shape[0]):
    s = '%d,%.5f\n'%(data_pred[i,0], data_pred[i,1])
    f.write(s)
f.close()

('time elapse predict:', 0.02595996856689453)
