In [1]:
#! /usr/bin/python
from sklearn import preprocessing
from sklearn.utils import shuffle
import numpy as np
import xgboost as xgb
import time



In [2]:
# load data
data = np.loadtxt('../data/stock_train_data_20170916.csv',delimiter=',',skiprows=1)
print('data shape:', data.shape)

# data preprocessing
np.random.shuffle(data)
data_X = data[:,1:89]
data_Y = data[:,90]
weight_samples = data[:,89]
group_samples = data[:,91]
era_samples = data[:,92]
scaler = preprocessing.StandardScaler().fit(data_X)
data_X = scaler.transform(data_X)
data_cv = xgb.DMatrix(data_X, data_Y, weight = weight_samples)
print('data_X shape:', data_X.shape)
print('data_Y shape:', data_Y.shape)

('data shape:', (296104, 93))
('data_X shape:', (296104, 88))
('data_Y shape:', (296104,))


In [13]:

# setup parameters for xgboost
param = {}
# logistic regression for binary classification. Output probability.
param['objective'] = 'binary:logistic' 
param['metrics'] = {'logloss'}
param['eta'] = 0.1          # step size of each boosting step
param['max_depth'] = 10       # maximum depth of the tree
param['silent'] = 1
param['nthread'] = 7
param['seed'] = 0
param['nrounds'] = 2
param['lambda'] = 10
param['alpha'] = 5
#param['eval_metric'] = "auc"
# https://rdrr.io/cran/xgboost/man/xgb.train.html
# https://www.cnblogs.com/haobang008/p/5909207.html

start = time.time();
print ('running cross validation')
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is standard deviation of the metric
res = xgb.cv(param, data_cv,      
             nfold=10, metrics='auc',
             callbacks=[xgb.callback.print_evaluation(show_stdv=True),
                        xgb.callback.early_stop(3)])
end = time.time();
print (res)
print('time elapse:', end- start);

running cross validation
[0]	train-auc:0.698968+0.00140923	test-auc:0.642011+0.00446906
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 3 rounds.
[1]	train-auc:0.725536+0.00146908	test-auc:0.660836+0.0044523
[2]	train-auc:0.742484+0.00177473	test-auc:0.672159+0.00376296
[3]	train-auc:0.753752+0.00209469	test-auc:0.67932+0.0043276
[4]	train-auc:0.764688+0.00194107	test-auc:0.686178+0.00371929
[5]	train-auc:0.773822+0.00245907	test-auc:0.691127+0.00432757
[6]	train-auc:0.781692+0.00245859	test-auc:0.695888+0.00401362
[7]	train-auc:0.787792+0.00176858	test-auc:0.69915+0.00374629
[8]	train-auc:0.793665+0.00123664	test-auc:0.702292+0.00488445
[9]	train-auc:0.799251+0.00133687	test-auc:0.70526+0.00445594
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.642011      0.004469        0.698968       0.001409
1       0.660836      0.004452        0.725536       0.001469
2       0.672159      0.

In [14]:
# model training

per_train = 0.9
# data preprocessing
#np.random.shuffle(data)
# rebuild the data with era (time)

data_X = data[:,1:89]
data_Y = data[:,90]
weight_samples = data[:,89]
group_samples = data[:,91].reshape(-1,1)
era_samples = data[:,92]

#data_X = np.append(data_X, group_samples, axis= 1)
scaler = preprocessing.StandardScaler().fit(data_X)
data_X = scaler.transform(data_X)
data_cv = xgb.DMatrix(data_X, data_Y, weight = weight_samples)
print('data_X shape:', data_X.shape)
print('data_Y shape:', data_Y.shape)

# more work needed for traing set selection...
test_X = data_X[int(data_X.shape[0] * per_train):]
test_Y = data_Y[int(data_X.shape[0] * per_train):]
weight_test = weight_samples[int(data_X.shape[0] * per_train):]
train_X = data_X[0:int(data_X.shape[0] * per_train)]
train_Y = data_Y[0:int(data_X.shape[0] * per_train)]
weight_train = weight_samples[0:int(data_X.shape[0] * per_train)]
print("train_X",train_X.shape)
print("test_X",test_X.shape)

xg_train = xgb.DMatrix( train_X, label=train_Y, weight = weight_train)
xg_test = xgb.DMatrix(test_X, label=test_Y, weight = weight_test)
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]

num_round = 10
start = time.time();
bst = xgb.train(param, xg_train, num_round, watchlist);
end = time.time();
print('time elapse train:', end- start);
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
start = time.time();
y_pred = bst.predict( xg_test )
end = time.time();
print('time elapse predict:', end- start);
print(y_pred)
print('error:', np.sum(test_Y == (y_pred > 0)).astype(float) / test_Y.shape[0])


('data_X shape:', (296104, 88))
('data_Y shape:', (296104,))
('train_X', (266493, 88))
('test_X', (29611, 88))
[0]	train-error:0.352481	test-error:0.388728
[1]	train-error:0.333243	test-error:0.378545
[2]	train-error:0.324733	test-error:0.372168
[3]	train-error:0.315491	test-error:0.364853
[4]	train-error:0.306442	test-error:0.361842
[5]	train-error:0.301292	test-error:0.358779
[6]	train-error:0.295547	test-error:0.355481
[7]	train-error:0.289804	test-error:0.353532
[8]	train-error:0.286206	test-error:0.350234
[9]	train-error:0.282777	test-error:0.34853
('time elapse train:', 22.648571968078613)
('time elapse predict:', 0.0008800029754638672)
[ 0.6044513   0.38634267  0.67602718 ...,  0.64311343  0.62655663  0.500521  ]
('error:', 0.52659484650974298)


In [15]:
data_to_Predict = np.loadtxt('../data/stock_test_data_20170916.csv',delimiter=',',skiprows=1)
ids = data_to_Predict[:,0]
test_X = data_to_Predict[:,1:-1]
test_X = xgb.DMatrix(test_X)

start = time.time();
yprob = bst.predict( test_X )
end = time.time();
print('time elapse predict:', end- start);

data_pred = np.concatenate((data_to_Predict[:,0].reshape(yprob.shape[0],-1), yprob.reshape(yprob.shape[0],-1)), axis=1)

f = open('./data_pred.csv', 'w')
f.write('id,proba\n')
for i in range(data_pred.shape[0]):
    s = '%d,%.5f\n'%(data_pred[i,0], data_pred[i,1])
    f.write(s)
f.close()

('time elapse predict:', 0.04147982597351074)
