In [1]:
#! /usr/bin/python
from sklearn import preprocessing
from sklearn.utils import shuffle
import numpy as np
import xgboost as xgb
import time



In [2]:
# load data
data = np.loadtxt('../data/stock_train_data_20170916.csv',delimiter=',',skiprows=1)
print('data shape:', data.shape)

# data preprocessing
np.random.shuffle(data)
data_X = data[:,1:89]
data_Y = data[:,90]
weight_samples = data[:,89]
group_samples = data[:,91]
era_samples = data[:,92]
scaler = preprocessing.StandardScaler().fit(data_X)
data_X = scaler.transform(data_X)
data_cv = xgb.DMatrix(data_X, data_Y, weight = weight_samples)
print('data_X shape:', data_X.shape)
print('data_Y shape:', data_Y.shape)

('data shape:', (296104, 93))
('data_X shape:', (296104, 88))
('data_Y shape:', (296104,))


In [3]:

# setup parameters for xgboost
param = {}
# logistic regression for binary classification. Output probability.
param['objective'] = 'binary:logistic' 
param['metrics'] = {'logloss'}
param['eta'] = 0.1          # step size of each boosting step
param['max_depth'] = 6       # maximum depth of the tree
param['silent'] = 1
param['nthread'] = 7
param['seed'] = 0
param['nrounds'] = 10
#param['eval_metric'] = "auc"
# https://rdrr.io/cran/xgboost/man/xgb.train.html
# https://www.cnblogs.com/haobang008/p/5909207.html

start = time.time();
print ('running cross validation')
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is standard deviation of the metric
res = xgb.cv(param, data_cv,      
             nfold=10,
             callbacks=[xgb.callback.print_evaluation(show_stdv=True),
                        xgb.callback.early_stop(3)])
end = time.time();
print (res)
print('time elapse:', end- start);

running cross validation
[0]	train-error:0.403546+0.00148194	test-error:0.409174+0.00412982
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 3 rounds.
[1]	train-error:0.39445+0.00151339	test-error:0.40126+0.00429095
[2]	train-error:0.388199+0.00205108	test-error:0.395298+0.00442949
[3]	train-error:0.385784+0.000672063	test-error:0.393855+0.00359058
[4]	train-error:0.38257+0.000797252	test-error:0.390826+0.00385503
[5]	train-error:0.380341+0.000644967	test-error:0.389023+0.00353434
[6]	train-error:0.378363+0.00119459	test-error:0.38773+0.0030115
[7]	train-error:0.376655+0.00150333	test-error:0.386554+0.00331316
[8]	train-error:0.375058+0.00133812	test-error:0.38551+0.00305468
[9]	train-error:0.373618+0.00158427	test-error:0.384502+0.00370604
   test-error-mean  test-error-std  train-error-mean  train-error-std
0         0.409174        0.004130          0.403546         0.001482
1         0.401260      

In [4]:
# model training

per_train = 0.9
# data preprocessing
#np.random.shuffle(data)
# rebuild the data with era (time)

data_X = data[:,1:89]
data_Y = data[:,90]
weight_samples = data[:,89]
group_samples = data[:,91].reshape(-1,1)
era_samples = data[:,92]

#data_X = np.append(data_X, group_samples, axis= 1)
scaler = preprocessing.StandardScaler().fit(data_X)
data_X = scaler.transform(data_X)
data_cv = xgb.DMatrix(data_X, data_Y, weight = weight_samples)
print('data_X shape:', data_X.shape)
print('data_Y shape:', data_Y.shape)

# more work needed for traing set selection...
test_X = data_X[int(data_X.shape[0] * per_train):]
test_Y = data_Y[int(data_X.shape[0] * per_train):]
weight_test = weight_samples[int(data_X.shape[0] * per_train):]
train_X = data_X[0:int(data_X.shape[0] * per_train)]
train_Y = data_Y[0:int(data_X.shape[0] * per_train)]
weight_train = weight_samples[0:int(data_X.shape[0] * per_train)]
print("train_X",train_X.shape)
print("test_X",test_X.shape)

xg_train = xgb.DMatrix( train_X, label=train_Y, weight = weight_train)
xg_test = xgb.DMatrix(test_X, label=test_Y, weight = weight_test)
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]

num_round = 10
start = time.time();
bst = xgb.train(param, xg_train, num_round, watchlist);
end = time.time();
print('time elapse train:', end- start);
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
start = time.time();
y_pred = bst.predict( xg_test )
end = time.time();
print('time elapse predict:', end- start);
print(y_pred)
print('error:', np.sum(test_Y == (y_pred > 0)).astype(float) / test_Y.shape[0])


('data_X shape:', (296104, 88))
('data_Y shape:', (296104,))
('train_X', (266493, 88))
('test_X', (29611, 88))
[0]	train-error:0.402851	test-error:0.408325
[1]	train-error:0.39202	test-error:0.400858
[2]	train-error:0.389724	test-error:0.397337
[3]	train-error:0.383937	test-error:0.391529
[4]	train-error:0.380668	test-error:0.390403
[5]	train-error:0.378342	test-error:0.389599
[6]	train-error:0.376909	test-error:0.388194
[7]	train-error:0.374857	test-error:0.385298
[8]	train-error:0.372221	test-error:0.382285
[9]	train-error:0.370998	test-error:0.380321
('time elapse train:', 16.8576500415802)
('time elapse predict:', 0.0061647891998291016)
[ 0.54060709  0.58983809  0.53249854 ...,  0.59247029  0.57125658
  0.73460591]
('error:', 0.52521022592955324)


In [5]:
data_to_Predict = np.loadtxt('../data/stock_test_data_20170916.csv',delimiter=',',skiprows=1)
ids = data_to_Predict[:,0]
test_X = data_to_Predict[:,1:-1]
test_X = xgb.DMatrix(test_X)

start = time.time();
yprob = bst.predict( test_X )
end = time.time();
print('time elapse predict:', end- start);

data_pred = np.concatenate((data_to_Predict[:,0].reshape(yprob.shape[0],-1), yprob.reshape(yprob.shape[0],-1)), axis=1)

f = open('./data_pred.csv', 'w')
f.write('id,proba\n')
for i in range(data_pred.shape[0]):
    s = '%d,%.5f\n'%(data_pred[i,0], data_pred[i,1])
    f.write(s)
f.close()

('time elapse predict:', 0.048657894134521484)
