In [1]:
import pandas as pd

# Read the CSV file
data = pd.read_csv("pred2017l.csv")

# Display the data
data.head()

Unnamed: 0,Open,High,Low,Close,HOD,LOD,HOY,LOY,VPOC,VPOC-1,LinReg,LinRegSlope,CumDelta,Ticks
0,-0.02,-0.04,0.1,0.04,-0.01,0.025,-0.14,0.26,0.04,0.0,1.0,0.0,-134,0.06
1,0.2,-0.06,0.24,0.0,-0.015,0.075,-0.095,0.305,0.04,0.24,0.84,0.36,2567,-0.04
2,0.06,-0.06,0.06,0.0,-0.015,0.085,-0.085,0.315,-0.02,0.08,0.86,0.49,-186,0.0
3,0.02,0.0,0.1,0.04,-0.015,0.085,-0.085,0.315,0.04,0.06,0.76,0.62,-376,0.0
4,0.04,-0.06,0.06,0.0,-0.015,0.105,-0.065,0.335,-0.02,0.1,0.74,0.6,386,0.1


Train-valid-test split

In [2]:
from sklearn.model_selection import train_test_split

# train-valid-test split
X_temp = data.drop('Ticks',axis=1)
y_temp = data['Ticks']

X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.25, train_size=0.75)

X_test  = data.drop('Ticks',axis=1)
y_test  = data['Ticks']

print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)
print(X_test.shape)

(2136, 13)
(2136,)
(713, 13)
(713,)
(2849, 13)


Convert data into XGB format

In [3]:
import xgboost as xgb

d_train = xgb.DMatrix(X_train, y_train)
d_valid = xgb.DMatrix(X_valid, y_valid)
d_test  = xgb.DMatrix(X_test)

XGB parameters

In [4]:
xgb_params = {
    'n_trees': 75, 
    'eta': 0.03,
    'max_depth': 4,
    'subsample': 0.90,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    #'base_score': y_mean, # base prediction = mean(target)
    'silent': 0}

#
num_boost_rounds = 1000
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

Train the model

In [5]:
# Train the model!
mdl = xgb.train(dict(xgb_params, silent=0), d_train, num_boost_rounds, watchlist, early_stopping_rounds=500, maximize=False, verbose_eval=10)

[0]	train-rmse:0.09095	valid-rmse:0.09481
[10]	train-rmse:0.08963	valid-rmse:0.09469
[20]	train-rmse:0.08865	valid-rmse:0.09461
[30]	train-rmse:0.08769	valid-rmse:0.09471
[40]	train-rmse:0.08698	valid-rmse:0.09476


Parameters: { "n_trees", "silent" } are not used.



[50]	train-rmse:0.08626	valid-rmse:0.09481
[60]	train-rmse:0.08561	valid-rmse:0.09487
[70]	train-rmse:0.08512	valid-rmse:0.09496
[80]	train-rmse:0.08439	valid-rmse:0.09507
[90]	train-rmse:0.08383	valid-rmse:0.09503
[100]	train-rmse:0.08321	valid-rmse:0.09507
[110]	train-rmse:0.08255	valid-rmse:0.09522
[120]	train-rmse:0.08202	valid-rmse:0.09524
[130]	train-rmse:0.08147	valid-rmse:0.09523
[140]	train-rmse:0.08087	valid-rmse:0.09534
[150]	train-rmse:0.08034	valid-rmse:0.09537
[160]	train-rmse:0.07975	valid-rmse:0.09547
[170]	train-rmse:0.07925	valid-rmse:0.09552
[180]	train-rmse:0.07878	valid-rmse:0.09552
[190]	train-rmse:0.07831	valid-rmse:0.09554
[200]	train-rmse:0.07769	valid-rmse:0.09567
[210]	train-rmse:0.07723	valid-rmse:0.09570
[220]	train-rmse:0.07679	valid-rmse:0.09581
[230]	train-rmse:0.07624	valid-rmse:0.09593
[240]	train-rmse:0.07576	valid-rmse:0.09599
[250]	train-rmse:0.07529	valid-rmse:0.09608
[260]	train-rmse:0.07491	valid-rmse:0.09610
[270]	train-rmse:0.07443	valid-rmse:0

Predict the test data

In [6]:
y_pred = mdl.predict(d_test)

print(y_pred)

[ 0.05475697  0.01549684 -0.00066602 ...  0.01840273  0.00705392
  0.04463446]


test the quality of a model

In [7]:
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error
# Reshape y_pred to match the shape of y_valid
y_pred = y_pred[:len(y_valid)]

# Calculate the mean squared error
mse = mean_squared_error(y_valid, y_pred)
rmse = root_mean_squared_error(y_valid, y_pred)
mae= mean_absolute_error(y_valid, y_pred)

print('Mean Squared Error is:', mse)
print('Root Mean Squared Error is:', rmse)
print('Mean Absolute Error is:', mae)


Mean Squared Error is: 0.009798797830730425
Root Mean Squared Error is: 0.09898887730816239
Mean Absolute Error is: 0.07431107214148625
