In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb #package needed to implement the xgboost

from sklearn.model_selection import train_test_split #to split the data into training and testing parts
from sklearn.metrics import mean_squared_error #to check the mean squared error of the predictions made
from sklearn.preprocessing import MinMaxScaler #used to normalize the output to (-1, 1) 

In [2]:
stock = pd.read_csv("./data/data_stocks.csv", header = 0)

In [3]:
stock.head()

Unnamed: 0,DATE,SP500,NASDAQ.AAL,NASDAQ.AAPL,NASDAQ.ADBE,NASDAQ.ADI,NASDAQ.ADP,NASDAQ.ADSK,NASDAQ.AKAM,NASDAQ.ALXN,...,NYSE.WYN,NYSE.XEC,NYSE.XEL,NYSE.XL,NYSE.XOM,NYSE.XRX,NYSE.XYL,NYSE.YUM,NYSE.ZBH,NYSE.ZTS
0,1491226200,2363.6101,42.33,143.68,129.63,82.04,102.23,85.22,59.76,121.52,...,84.37,119.035,44.4,39.88,82.03,7.36,50.22,63.86,122.0,53.35
1,1491226260,2364.1001,42.36,143.7,130.32,82.08,102.14,85.65,59.84,121.48,...,84.37,119.035,44.11,39.88,82.03,7.38,50.22,63.74,121.77,53.35
2,1491226320,2362.6799,42.31,143.6901,130.225,82.03,102.2125,85.51,59.795,121.93,...,84.585,119.26,44.09,39.98,82.02,7.36,50.12,63.75,121.7,53.365
3,1491226380,2364.3101,42.37,143.64,130.0729,82.0,102.14,85.4872,59.62,121.44,...,84.46,119.26,44.25,39.99,82.02,7.35,50.16,63.88,121.7,53.38
4,1491226440,2364.8501,42.5378,143.66,129.88,82.035,102.06,85.7001,59.62,121.6,...,84.47,119.61,44.11,39.96,82.03,7.36,50.2,63.91,121.695,53.24


In [4]:
stock = stock.drop(columns = "DATE", axis = 1)

In [5]:
n = stock.shape[0]
p = stock.shape[1]

In [6]:
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(stock)
stock = scaler.transform(stock)

In [7]:
train_start = 0
train_end = int(np.floor(0.8*n))
test_start = train_end + 1
test_end = n
train = stock[np.arange(train_start, train_end), :]
test = stock[np.arange(test_start, test_end), :]

In [8]:
stock.shape

(41266, 501)

In [9]:
X = train[:, 1:]
Y = train[:, 0]
X_test = test[:, 1:]
y_test = test[:, 0] 

In [10]:
X_train, X_validate, y_train, y_validate = train_test_split(X, Y, random_state = 123, test_size = 0.2)

In [11]:
model = xgb.XGBRegressor(max_depth=5, learning_rate = 0.1, objective = 'reg:logistic', booster = 'gblinear')

In [12]:
model.fit(X_train, y_train, eval_set = [(X_train, y_train),(X_validate, y_validate)], 
        eval_metric = "rmse", verbose = True)

[0]	validation_0-rmse:0.188148	validation_1-rmse:0.186141
[1]	validation_0-rmse:0.1566	validation_1-rmse:0.154748
[2]	validation_0-rmse:0.13646	validation_1-rmse:0.13473
[3]	validation_0-rmse:0.123457	validation_1-rmse:0.12181
[4]	validation_0-rmse:0.114897	validation_1-rmse:0.113309
[5]	validation_0-rmse:0.109137	validation_1-rmse:0.107591
[6]	validation_0-rmse:0.10517	validation_1-rmse:0.103656
[7]	validation_0-rmse:0.10238	validation_1-rmse:0.100888
[8]	validation_0-rmse:0.100377	validation_1-rmse:0.098901
[9]	validation_0-rmse:0.098911	validation_1-rmse:0.097449
[10]	validation_0-rmse:0.097819	validation_1-rmse:0.096367
[11]	validation_0-rmse:0.096992	validation_1-rmse:0.095547
[12]	validation_0-rmse:0.096355	validation_1-rmse:0.094916
[13]	validation_0-rmse:0.095854	validation_1-rmse:0.094421
[14]	validation_0-rmse:0.095455	validation_1-rmse:0.094026
[15]	validation_0-rmse:0.095131	validation_1-rmse:0.093705
[16]	validation_0-rmse:0.094862	validation_1-rmse:0.09344
[17]	validation

XGBRegressor(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [13]:
model.coef_

[0.00927973,
 0.00559738,
 0.0119926,
 0.0011429,
 0.00470193,
 0.0118769,
 -0.01175,
 0.00183741,
 0.0121959,
 0.00152182,
 0.00799312,
 0.0114159,
 0.0122743,
 0.0134896,
 -0.0170848,
 0.00317594,
 0.00237848,
 0.00958704,
 0.00669969,
 0.00914344,
 -0.0150603,
 0.00692296,
 0.00620638,
 0.00996386,
 0.00657892,
 -0.0106818,
 -0.00795323,
 0.00419697,
 0.0119914,
 0.0128579,
 -0.0069758,
 -0.00944511,
 -0.0105303,
 0.00328327,
 -0.0109811,
 0.0122742,
 0.00861314,
 0.00979854,
 -0.00868553,
 0.0138961,
 0.0075939,
 0.0113665,
 -0.00800637,
 0.0130313,
 -0.0109956,
 0.0138211,
 0.00513087,
 0.00665235,
 -0.00884017,
 -0.00917578,
 0.00660856,
 0.0113807,
 0.0111204,
 0.00439412,
 -0.00420437,
 0.00948935,
 0.0032903,
 0.00106553,
 0.0125817,
 0.00612277,
 -0.000264001,
 -0.00283362,
 -0.00484397,
 -0.00444177,
 0.0125458,
 0.0106904,
 0.000132245,
 -0.00804294,
 0.000505002,
 0.0129931,
 0.0121874,
 0.00901103,
 -0.0121289,
 0.00914158,
 -0.00306876,
 0.0119634,
 0.0155784,
 0.0098459

In [14]:
model.intercept_

[-0.385499]

In [16]:
model.coef_

[0.00927973,
 0.00559738,
 0.0119926,
 0.0011429,
 0.00470193,
 0.0118769,
 -0.01175,
 0.00183741,
 0.0121959,
 0.00152182,
 0.00799312,
 0.0114159,
 0.0122743,
 0.0134896,
 -0.0170848,
 0.00317594,
 0.00237848,
 0.00958704,
 0.00669969,
 0.00914344,
 -0.0150603,
 0.00692296,
 0.00620638,
 0.00996386,
 0.00657892,
 -0.0106818,
 -0.00795323,
 0.00419697,
 0.0119914,
 0.0128579,
 -0.0069758,
 -0.00944511,
 -0.0105303,
 0.00328327,
 -0.0109811,
 0.0122742,
 0.00861314,
 0.00979854,
 -0.00868553,
 0.0138961,
 0.0075939,
 0.0113665,
 -0.00800637,
 0.0130313,
 -0.0109956,
 0.0138211,
 0.00513087,
 0.00665235,
 -0.00884017,
 -0.00917578,
 0.00660856,
 0.0113807,
 0.0111204,
 0.00439412,
 -0.00420437,
 0.00948935,
 0.0032903,
 0.00106553,
 0.0125817,
 0.00612277,
 -0.000264001,
 -0.00283362,
 -0.00484397,
 -0.00444177,
 0.0125458,
 0.0106904,
 0.000132245,
 -0.00804294,
 0.000505002,
 0.0129931,
 0.0121874,
 0.00901103,
 -0.0121289,
 0.00914158,
 -0.00306876,
 0.0119634,
 0.0155784,
 0.0098459

In [17]:
predictions = model.predict(X_test)

In [18]:
mean_squared_error(predictions, y_test)

0.009807875851174081