In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgbm #package needed to implement the xgboost

from sklearn.model_selection import train_test_split #to split the data into training and testing parts
from sklearn.metrics import mean_squared_error #to check the mean squared error of the predictions made
from sklearn.preprocessing import MinMaxScaler #used to normalize the output to (-1, 1) 

In [2]:
stock = pd.read_csv("./data/data_stocks.csv", header = 0)

In [3]:
stock.head()

Unnamed: 0,DATE,SP500,NASDAQ.AAL,NASDAQ.AAPL,NASDAQ.ADBE,NASDAQ.ADI,NASDAQ.ADP,NASDAQ.ADSK,NASDAQ.AKAM,NASDAQ.ALXN,...,NYSE.WYN,NYSE.XEC,NYSE.XEL,NYSE.XL,NYSE.XOM,NYSE.XRX,NYSE.XYL,NYSE.YUM,NYSE.ZBH,NYSE.ZTS
0,1491226200,2363.6101,42.33,143.68,129.63,82.04,102.23,85.22,59.76,121.52,...,84.37,119.035,44.4,39.88,82.03,7.36,50.22,63.86,122.0,53.35
1,1491226260,2364.1001,42.36,143.7,130.32,82.08,102.14,85.65,59.84,121.48,...,84.37,119.035,44.11,39.88,82.03,7.38,50.22,63.74,121.77,53.35
2,1491226320,2362.6799,42.31,143.6901,130.225,82.03,102.2125,85.51,59.795,121.93,...,84.585,119.26,44.09,39.98,82.02,7.36,50.12,63.75,121.7,53.365
3,1491226380,2364.3101,42.37,143.64,130.0729,82.0,102.14,85.4872,59.62,121.44,...,84.46,119.26,44.25,39.99,82.02,7.35,50.16,63.88,121.7,53.38
4,1491226440,2364.8501,42.5378,143.66,129.88,82.035,102.06,85.7001,59.62,121.6,...,84.47,119.61,44.11,39.96,82.03,7.36,50.2,63.91,121.695,53.24


In [4]:
stock = stock.drop(columns = "DATE", axis = 1)

In [5]:
n = stock.shape[0]
p = stock.shape[1]

In [6]:
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(stock)
stock = scaler.transform(stock)

In [7]:
train_start = 0
train_end = int(np.floor(0.8*n))
test_start = train_end + 1
test_end = n
train = stock[np.arange(train_start, train_end), :]
test = stock[np.arange(test_start, test_end), :]

In [8]:
stock.shape

(41266, 501)

In [9]:
X = train[:, 1:]
Y = train[:, 0]
X_test = test[:, 1:]
y_test = test[:, 0] 

In [10]:
X_train, X_validate, y_train, y_validate = train_test_split(X, Y, random_state = 123, test_size = 0.2)

In [11]:
model = lgbm.LGBMRegressor(boosting_type = 'gbdt', objective = 'regression')

In [12]:
model.fit(X_train, y_train, eval_set = [(X_train, y_train),(X_validate, y_validate)], 
        eval_metric = 'l2', verbose = True)

[1]	valid_0's l2: 0.0475417	valid_0's l2: 0.0475417	valid_1's l2: 0.0468582	valid_1's l2: 0.0468582
[2]	valid_0's l2: 0.0385574	valid_0's l2: 0.0385574	valid_1's l2: 0.0380167	valid_1's l2: 0.0380167
[3]	valid_0's l2: 0.0312768	valid_0's l2: 0.0312768	valid_1's l2: 0.0308427	valid_1's l2: 0.0308427
[4]	valid_0's l2: 0.025373	valid_0's l2: 0.025373	valid_1's l2: 0.0250182	valid_1's l2: 0.0250182
[5]	valid_0's l2: 0.0205887	valid_0's l2: 0.0205887	valid_1's l2: 0.0203007	valid_1's l2: 0.0203007
[6]	valid_0's l2: 0.0167093	valid_0's l2: 0.0167093	valid_1's l2: 0.0164744	valid_1's l2: 0.0164744
[7]	valid_0's l2: 0.013564	valid_0's l2: 0.013564	valid_1's l2: 0.0133781	valid_1's l2: 0.0133781
[8]	valid_0's l2: 0.0110128	valid_0's l2: 0.0110128	valid_1's l2: 0.0108598	valid_1's l2: 0.0108598
[9]	valid_0's l2: 0.00894689	valid_0's l2: 0.00894689	valid_1's l2: 0.0088215	valid_1's l2: 0.0088215
[10]	valid_0's l2: 0.0072697	valid_0's l2: 0.0072697	valid_1's l2: 0.00716707	valid_1's l2: 0.00716707

[78]	valid_0's l2: 1.72419e-05	valid_0's l2: 1.72419e-05	valid_1's l2: 2.19524e-05	valid_1's l2: 2.19524e-05
[79]	valid_0's l2: 1.7099e-05	valid_0's l2: 1.7099e-05	valid_1's l2: 2.18155e-05	valid_1's l2: 2.18155e-05
[80]	valid_0's l2: 1.69907e-05	valid_0's l2: 1.69907e-05	valid_1's l2: 2.17204e-05	valid_1's l2: 2.17204e-05
[81]	valid_0's l2: 1.68541e-05	valid_0's l2: 1.68541e-05	valid_1's l2: 2.15881e-05	valid_1's l2: 2.15881e-05
[82]	valid_0's l2: 1.67278e-05	valid_0's l2: 1.67278e-05	valid_1's l2: 2.1492e-05	valid_1's l2: 2.1492e-05
[83]	valid_0's l2: 1.66045e-05	valid_0's l2: 1.66045e-05	valid_1's l2: 2.13963e-05	valid_1's l2: 2.13963e-05
[84]	valid_0's l2: 1.6477e-05	valid_0's l2: 1.6477e-05	valid_1's l2: 2.12874e-05	valid_1's l2: 2.12874e-05
[85]	valid_0's l2: 1.63763e-05	valid_0's l2: 1.63763e-05	valid_1's l2: 2.12128e-05	valid_1's l2: 2.12128e-05
[86]	valid_0's l2: 1.62618e-05	valid_0's l2: 1.62618e-05	valid_1's l2: 2.11407e-05	valid_1's l2: 2.11407e-05
[87]	valid_0's l2: 1.6139

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective='regression',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [13]:
model.n_features_

500

In [14]:
predictions = model.predict(X_test)

In [15]:
mean_squared_error(predictions, y_test)

0.005139620814690275