# XGBoost

Vamos a ver qué tal funciona el XGBoost con el problema de la predicción de acciones en bolsa.

In [None]:
# Downloads the daily-total-female-births CSV; no cell visible in this notebook reads the file.
!wget https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-total-female-births.csv

# 1. Predicción del valor de acciones en bolsa

Nos vamos a conectar a la API de Yahoo Finance (a través de pandas_datareader) para descargarnos los datos de cotización en bolsa de Amazon desde el año 2000 hasta hoy e intentar predecir valores futuros.

In [None]:
import datetime as dt
from pandas_datareader import data

# Download the AMZN price history from the 'yahoo' source, from 2000-01-01 up to now.
# NOTE(review): assumes the 'yahoo' reader of pandas_datareader is still reachable — confirm.
dataset_raw = data.DataReader('AMZN','yahoo', dt.datetime(2000,1,1), dt.datetime.now())
# Preview the first rows of the downloaded table.
dataset_raw.head()

In [None]:
# Keep only the 'Close' column (the double brackets select a list of columns)
dataframe = dataset_raw[['Close']]
# Preview the first rows of the selection.
dataframe.head()

In [None]:
# utilizamos la función create_dataset que NO introduce T 0's iniciales
def create_dataset(dataset, look_back_memory=1):
    """Turn a 2-D series into sliding-window inputs and next-step targets.

    Only column 0 of ``dataset`` is read. Window ``k`` holds the
    ``look_back_memory`` consecutive values starting at row ``k``; its target
    is the value that immediately follows the window.

    Args:
        dataset: 2-D array indexed as ``dataset[row, 0]``.
        look_back_memory: number of past values in each window.

    Returns:
        ``(X, Y)`` as NumPy arrays. ``len(dataset) - look_back_memory - 1``
        windows are produced, so the last possible window is not emitted.
    """
    n_windows = len(dataset) - look_back_memory - 1
    windows = [dataset[start:start + look_back_memory, 0]
               for start in range(n_windows)]
    targets = [dataset[start + look_back_memory, 0]
               for start in range(n_windows)]
    return numpy.array(windows), numpy.array(targets)

In [None]:
# hacemos el import de todo lo que utilizaremos
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [None]:
# fix the NumPy seed so that NumPy-driven randomness is reproducible
numpy.random.seed(42)

# load the closing prices as a float32 array
dataset = dataframe.values
dataset = dataset.astype('float32')

# chronological split point: the first 67% of the samples is the training period
train_size = int(len(dataset) * 0.67)
test_size = len(dataset) - train_size

# scale to [0, 1] using statistics from the training period ONLY: fitting the
# scaler on the whole series would leak the test-period min/max into training.
# Test values may therefore fall outside [0, 1]; that is expected.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset[:train_size])
dataset = scaler.transform(dataset)

# split into train and test (no shuffling: this is a time series)
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]

# turn each split into N samples of T consecutive values (one per time step of
# the window) plus the value that follows them. Here the window spans T=10
# time steps and there is a single variable (the closing price).
look_back_memory = 10
trainX, trainY = create_dataset(train, look_back_memory)
testX, testY = create_dataset(test, look_back_memory)
print(trainX.shape, trainY.shape)
print(testX.shape, testY.shape)

In [None]:
# # Nos aseguramos de que las dimensiones de las entradas son las correctas:
# # (número de ventanas de T elementos, los T elementos de cada ventana, las V variables de cada timestamp)
# variables = 1 # (trainX.shape[1])
# trainX = numpy.reshape(trainX, (trainX.shape[0], look_back_memory, variables))
# testX = numpy.reshape(testX, (testX.shape[0], look_back_memory, variables))
# print(trainX.shape)
# print(testX.shape)


In [None]:
# train the model on the training windows (squared-error objective, 1000 trees)
model = XGBRegressor(objective='reg:squarederror', n_estimators=1000)
model.fit(trainX, trainY)

In [None]:
# predict on the training and test windows
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

# undo the normalisation. Each 1-D vector is passed as a single column,
# (n_samples, 1), which is the layout scikit-learn transformers expect, and is
# flattened back to 1-D afterwards. Passing it as one row of n_samples
# "features" only works by broadcasting against a one-feature scaler.
trainPredict = scaler.inverse_transform(trainPredict.reshape(-1, 1)).ravel()
trainY_orig = scaler.inverse_transform(trainY.reshape(-1, 1)).ravel()
testPredict = scaler.inverse_transform(testPredict.reshape(-1, 1)).ravel()
testY_orig = scaler.inverse_transform(testY.reshape(-1, 1)).ravel()

# root-mean-squared error on train and test, in the original price units
trainScore = math.sqrt(mean_squared_error(trainY_orig, trainPredict))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY_orig, testPredict))
print('Test Score: %.2f RMSE' % (testScore))

In [None]:
# Each prediction refers to the sample that follows its window, so the
# predictions are placed on NaN-filled arrays as long as the full series,
# shifted so that they line up with the x axis of the original data.
trainPredictPlot = numpy.full(dataset.shape[0], numpy.nan)
trainPredictPlot[look_back_memory:look_back_memory + len(trainPredict)] = trainPredict

# same for the test predictions: they start one window after the start of the
# test split and stop one sample before the end of the series
testPredictPlot = numpy.full(dataset.shape[0], numpy.nan)
testPredictPlot[len(trainPredict) + 2 * look_back_memory + 1:len(dataset) - 1] = testPredict

# original series (de-normalised), then train predictions, then test predictions
plt.plot(scaler.inverse_transform(dataset))
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()

### Entrenar usando un Walk-Forward Validation

In [None]:
import datetime as dt
from pandas_datareader import data

# Download the AMZN price history from the 'yahoo' source, from 2000-01-01 up to now.
# NOTE(review): assumes the 'yahoo' reader of pandas_datareader is still reachable — confirm.
dataset_raw = data.DataReader('AMZN','yahoo', dt.datetime(2000,1,1), dt.datetime.now())
# Preview the first rows of the downloaded table.
dataset_raw.head()

In [None]:
# forecast stock closing prices with xgboost (walk-forward validation)
from numpy import asarray
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from matplotlib import pyplot
import numpy
from tqdm import tqdm

def create_dataset(dataset, look_back_memory=1):
    """Build rows of ``look_back_memory`` lagged values plus the next value.

    Only column 0 of ``dataset`` is read. Row ``k`` of the result contains the
    ``look_back_memory + 1`` consecutive values starting at row ``k``, so the
    last element of every row is the value to predict from the preceding ones.

    Args:
        dataset: 2-D array indexed as ``dataset[row, 0]``.
        look_back_memory: number of lagged values used as inputs.

    Returns:
        NumPy array with ``len(dataset) - look_back_memory`` rows.
    """
    row_len = look_back_memory + 1
    n_rows = len(dataset) - look_back_memory
    return numpy.array([dataset[first:first + row_len, 0]
                        for first in range(n_rows)])

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    """Split ``data`` in order: the last ``n_test`` rows form the test set.

    Args:
        data: 2-D array, one sample per row.
        n_test: number of trailing rows to hold out. Values below 0 are
            treated as 0 and values above ``len(data)`` as ``len(data)``.

    Returns:
        ``(train, test)`` row slices of ``data``.
    """
    # Use an explicit split index instead of negative slicing: ``data[:-0]`` is
    # empty and ``data[-0:]`` is everything, which would swap train and test
    # when ``n_test == 0``.
    split = min(max(len(data) - n_test, 0), len(data))
    return data[:split, :], data[split:, :]

# fit an xgboost model and make a one step prediction
def xgboost_forecast(train, testX, n_estimators):
    """Fit a fresh XGBoost regressor on ``train`` and predict one sample.

    Args:
        train: sequence of rows; the last element of each row is the target
            and the preceding elements are the inputs.
        testX: inputs of the single sample to predict.
        n_estimators: number of trees passed to ``XGBRegressor``.

    Returns:
        The prediction for ``testX`` (first element of the ``predict`` output).
    """
    samples = asarray(train)
    features = samples[:, :-1]
    target = samples[:, -1]
    regressor = XGBRegressor(objective='reg:squarederror',
                             n_estimators=n_estimators)
    regressor.fit(features, target)
    # predict expects a 2-D batch, so the sample is wrapped in a list
    return regressor.predict(asarray([testX]))[0]

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, n_estimators=10):
    """Evaluate one-step forecasts, refitting the model before every step.

    The last ``n_test`` rows of ``data`` are held out. For each held-out row a
    model is fitted on all rows seen so far, the row is predicted from its
    inputs, and the row is then added to the history.

    Args:
        data: 2-D array; the last column is the target, the rest are inputs.
        n_test: number of trailing rows used as the test set.
        n_estimators: number of trees for each fitted model.

    Returns:
        ``(mae, expected, predictions)``: the mean absolute error, the true
        targets of the test rows and the list of predictions.
    """
    train, test = train_test_split(data, n_test)
    # the history starts as the training rows and grows by one row per step
    history = list(train)
    predictions = []
    for row in tqdm(test):
        predictions.append(xgboost_forecast(history, row[:-1], n_estimators))
        # the actual observation becomes available for the next fit
        history.append(row)
    expected = test[:, -1]
    return mean_absolute_error(expected, predictions), expected, predictions

In [None]:
# build the windowed dataset from the 'Close' column with a look-back of 10
# NOTE(review): `data` rebinds the name imported from pandas_datareader in an earlier cell
data = create_dataset(dataset_raw[['Close']].values, look_back_memory=10)
print(data.shape)

In [None]:
# evaluate with 10 trees per model, passing n_test=1760
mae, y, yhat = walk_forward_validation(data, n_test=1760, n_estimators=10)
print('MAE: %.3f' % mae)
# plot expected vs predicted
pyplot.plot(y, label='Expected')
pyplot.plot(yhat, label='Predicted')
pyplot.legend()
pyplot.show()

In [None]:
# evaluate again with 100 trees per model (same data and same n_test)
mae, y, yhat = walk_forward_validation(data, n_test=1760, n_estimators=100)
print('MAE: %.3f' % mae)
# plot expected vs predicted
pyplot.plot(y, label='Expected')
pyplot.plot(yhat, label='Predicted')
pyplot.legend()
pyplot.show()

Como podéis observar, el modelo es capaz de predecir bastante mejor los valores de cierre cuando utilizamos más árboles (n_estimators=100 en lugar de 10).

¿Cómo podríamos mejorar estos resultados?

¿Y si utilizamos, además del valor de cierre, el resto de valores?

## Predicción con más de una variable

En esta ocasión utilizaremos todos los datos que nos brinda la API de Yahoo para predecir el precio de cierre de una acción.

In [None]:
import datetime as dt
from pandas_datareader import data

# Download the AMZN price history from the 'yahoo' source, from 2000-01-01 up to now.
# NOTE(review): assumes the 'yahoo' reader of pandas_datareader is still reachable — confirm.
dataset_raw = data.DataReader('AMZN','yahoo', dt.datetime(2000,1,1), dt.datetime.now())
# Preview the first rows of the downloaded table.
dataset_raw.head()

In [None]:
# first row, every column except the last one
dataset_raw.values[0, :-1]

In [None]:
# last column of the second row
dataset_raw.values[1, -1]

In [None]:
# first row without its last column, followed by the second row's last-column value
numpy.append(dataset_raw.values[0, :-1], dataset_raw.values[1, -1])

In [None]:
def create_dataset(dataset):
    """Pair each row's leading columns with the next row's last column.

    Row ``i`` of the result is ``dataset[i, :-1]`` followed by
    ``dataset[i + 1, -1]``, so the last column of the result is the value to
    predict from the other columns of the previous row.

    Args:
        dataset: 2-D array-like, one time step per row.

    Returns:
        NumPy array with ``len(dataset) - 1`` rows and the same number of
        columns as ``dataset``.
    """
    # Work only on the ``dataset`` argument (never on a notebook global) so the
    # function behaves the same for any 2-D array it is given.
    dataset = numpy.asarray(dataset)
    return numpy.column_stack((dataset[:-1, :-1], dataset[1:, -1]))

# build the supervised dataset from all columns and inspect its first rows and shape
# NOTE(review): `data` rebinds the name imported from pandas_datareader in an earlier cell
data = create_dataset(dataset_raw.values)
print(data[:5])
print(data.shape)

In [None]:
# forecast stock prices with xgboost (walk-forward validation, several input variables)
from numpy import asarray
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from matplotlib import pyplot
import numpy
from tqdm import tqdm

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    """Split ``data`` in order: the last ``n_test`` rows form the test set.

    Args:
        data: 2-D array, one sample per row.
        n_test: number of trailing rows to hold out. Values below 0 are
            treated as 0 and values above ``len(data)`` as ``len(data)``.

    Returns:
        ``(train, test)`` row slices of ``data``.
    """
    # Use an explicit split index instead of negative slicing: ``data[:-0]`` is
    # empty and ``data[-0:]`` is everything, which would swap train and test
    # when ``n_test == 0``.
    split = min(max(len(data) - n_test, 0), len(data))
    return data[:split, :], data[split:, :]

# fit an xgboost model and make a one step prediction
def xgboost_forecast(train, testX, n_estimators):
    """Fit a fresh XGBoost regressor on ``train`` and predict one sample.

    Args:
        train: sequence of rows; the last element of each row is the target
            and the preceding elements are the inputs.
        testX: inputs of the single sample to predict.
        n_estimators: number of trees passed to ``XGBRegressor``.

    Returns:
        ``(prediction, model)``: the prediction for ``testX`` and the fitted
        regressor that produced it.
    """
    samples = asarray(train)
    features = samples[:, :-1]
    target = samples[:, -1]
    regressor = XGBRegressor(objective='reg:squarederror',
                             n_estimators=n_estimators)
    regressor.fit(features, target)
    # predict expects a 2-D batch, so the sample is wrapped in a list
    forecast = regressor.predict(asarray([testX]))
    return forecast[0], regressor

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, n_estimators=10):
    """Evaluate one-step forecasts, refitting the model before every step.

    The last ``n_test`` rows of ``data`` are held out. For each held-out row a
    model is fitted on all rows seen so far, the row is predicted from its
    inputs, and the row is then added to the history.

    Args:
        data: 2-D array; the last column is the target, the rest are inputs.
        n_test: number of trailing rows used as the test set.
        n_estimators: number of trees for each fitted model.

    Returns:
        ``(mae, expected, predictions, models)``: the mean absolute error, the
        true targets of the test rows, the list of predictions and the list of
        fitted models (one per test row).
    """
    train, test = train_test_split(data, n_test)
    # the history starts as the training rows and grows by one row per step
    history = list(train)
    predictions, models = [], []
    for row in tqdm(test):
        forecast, fitted = xgboost_forecast(history, row[:-1], n_estimators)
        predictions.append(forecast)
        models.append(fitted)
        # the actual observation becomes available for the next fit
        history.append(row)
    expected = test[:, -1]
    return mean_absolute_error(expected, predictions), expected, predictions, models

In [None]:
# evaluate with 10 trees per model, passing n_test=1760; also keep the fitted models
mae, y, yhat, models = walk_forward_validation(data, n_test=1760, n_estimators=10)
print('MAE: %.3f' % mae)
# plot expected vs predicted
pyplot.plot(y, label='Expected')
pyplot.plot(yhat, label='Predicted')
pyplot.legend()
pyplot.show()

Obtenemos un error muy similar a cuando utilizamos solo el valor de cierre (Close). ¿A qué puede deberse esto?

Fijaos:

In [None]:
import seaborn as sns
# pairwise plots of the columns of dataset_raw, to inspect how they relate to each other
sns.pairplot(dataset_raw)

Como podéis comprobar, las variables están muy correlacionadas entre sí, por lo que es normal que no aporten información al modelo.

Además, según el modelo con el que trabajemos, podría incluso influir negativamente.

Veamos la importancia de las variables obtenida por el XGBoost:

In [None]:
# feature names: every column of dataset_raw except the last one
cols = dataset_raw.columns[:-1]
# stack the feature importances of every model in `models`, one row per model
imps = numpy.stack([m.feature_importances_ for m in models])

In [None]:
# shape of the stacked importances array
imps.shape

In [None]:
# one column per feature, one row per model
feature_importances = DataFrame(imps, columns=cols)
# (disabled) alternative view as a sorted bar chart:
# feature_importances = feature_importances.sort_values(by='imp',ascending=False)
# px.bar(feature_importances,x='col',y='imp')
# summary statistics of each feature's importance across the models
feature_importances.describe()

Las variables High, Low, Close y Open tienen una importancia muy similar. Viendo las correlaciones, esto nos indica que son redundantes. Por otra parte, Volume no aporta nada.

**¿Y si hubiéramos normalizado?**

In [None]:
def create_dataset(dataset):
    """Pair each row's leading columns with the next row's last column.

    Row ``i`` of the result is ``dataset[i, :-1]`` followed by
    ``dataset[i + 1, -1]``, so the last column of the result is the value to
    predict from the other columns of the previous row.

    Args:
        dataset: 2-D array-like, one time step per row.

    Returns:
        NumPy array with ``len(dataset) - 1`` rows and the same number of
        columns as ``dataset``.
    """
    # Work only on the ``dataset`` argument (never on a notebook global) so the
    # function behaves the same for any 2-D array it is given.
    dataset = numpy.asarray(dataset)
    return numpy.column_stack((dataset[:-1, :-1], dataset[1:, -1]))

# rebuild the supervised dataset from all columns and inspect its first rows and shape
# NOTE(review): `data` rebinds the name imported from pandas_datareader in an earlier cell
data = create_dataset(dataset_raw.values)
print(data[:5])
print(data.shape)

In [None]:
# min-max scale every column of `data` (the last column included) using its own min and max
# NOTE(review): min/max are taken over all rows, including those later held out as test
# data, and the last column is scaled too, so errors computed on `data` are in
# normalised units — confirm this is intended.
data = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))
data[:5]

In [None]:
# per-column minimum after scaling
data.min(axis=0)

In [None]:
# per-column maximum after scaling
data.max(axis=0)

In [None]:
# forecast stock prices with xgboost (walk-forward validation, normalised inputs)
from numpy import asarray
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from matplotlib import pyplot
import numpy
from tqdm import tqdm

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    """Split ``data`` in order: the last ``n_test`` rows form the test set.

    Args:
        data: 2-D array, one sample per row.
        n_test: number of trailing rows to hold out. Values below 0 are
            treated as 0 and values above ``len(data)`` as ``len(data)``.

    Returns:
        ``(train, test)`` row slices of ``data``.
    """
    # Use an explicit split index instead of negative slicing: ``data[:-0]`` is
    # empty and ``data[-0:]`` is everything, which would swap train and test
    # when ``n_test == 0``.
    split = min(max(len(data) - n_test, 0), len(data))
    return data[:split, :], data[split:, :]

# fit an xgboost model and make a one step prediction
def xgboost_forecast(train, testX, n_estimators):
    """Fit a fresh XGBoost regressor on ``train`` and predict one sample.

    Args:
        train: sequence of rows; the last element of each row is the target
            and the preceding elements are the inputs.
        testX: inputs of the single sample to predict.
        n_estimators: number of trees passed to ``XGBRegressor``.

    Returns:
        ``(prediction, model)``: the prediction for ``testX`` and the fitted
        regressor that produced it.
    """
    samples = asarray(train)
    features = samples[:, :-1]
    target = samples[:, -1]
    regressor = XGBRegressor(objective='reg:squarederror',
                             n_estimators=n_estimators)
    regressor.fit(features, target)
    # predict expects a 2-D batch, so the sample is wrapped in a list
    forecast = regressor.predict(asarray([testX]))
    return forecast[0], regressor

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, n_estimators=10):
    """Evaluate one-step forecasts, refitting the model before every step.

    The last ``n_test`` rows of ``data`` are held out. For each held-out row a
    model is fitted on all rows seen so far, the row is predicted from its
    inputs, and the row is then added to the history.

    Args:
        data: 2-D array; the last column is the target, the rest are inputs.
        n_test: number of trailing rows used as the test set.
        n_estimators: number of trees for each fitted model.

    Returns:
        ``(mae, expected, predictions, models)``: the mean absolute error, the
        true targets of the test rows, the list of predictions and the list of
        fitted models (one per test row).
    """
    train, test = train_test_split(data, n_test)
    # the history starts as the training rows and grows by one row per step
    history = list(train)
    predictions, models = [], []
    for row in tqdm(test):
        forecast, fitted = xgboost_forecast(history, row[:-1], n_estimators)
        predictions.append(forecast)
        models.append(fitted)
        # the actual observation becomes available for the next fit
        history.append(row)
    expected = test[:, -1]
    return mean_absolute_error(expected, predictions), expected, predictions, models

In [None]:
# evaluate on the current `data` with 10 trees per model, passing n_test=1760
mae, y, yhat, models = walk_forward_validation(data, n_test=1760, n_estimators=10)
print('MAE: %.3f' % mae)

# plot expected vs predicted
pyplot.plot(y, label='Expected')
pyplot.plot(yhat, label='Predicted')
pyplot.legend()
pyplot.show()

In [None]:
# feature names: every column of dataset_raw except the last one
cols = dataset_raw.columns[:-1]
# stack the feature importances of every model in `models`, one row per model
imps = numpy.stack([m.feature_importances_ for m in models])

In [None]:
# one column per feature, one row per model
feature_importances = DataFrame(imps, columns=cols)
# summary statistics of each feature's importance across the models
feature_importances.describe()

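Fijaos que el ajuste parece mejor, pero sigue sin tener en cuenta la variable Volume para nada. Ojo: ahora el MAE está medido sobre datos normalizados al rango [0, 1] (también la variable objetivo), por lo que no es directamente comparable con el MAE anterior, que estaba en las unidades originales del precio.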

**Más ejemplos interesantes de predicción con XGBoost**

- Predicción de compras

https://www.kaggle.com/alessandrosolbiati/using-xgboost-for-time-series-prediction-top-20

- Nacimientos de niñas

https://machinelearningmastery.com/xgboost-for-time-series-forecasting/

- Consumo de energía eléctrica

https://www.kaggle.com/robikscube/tutorial-time-series-forecasting-with-xgboost

https://github.com/Jenniferz28/Time-Series-ARIMA-XGBOOST-RNN

- Predicción de la demanda de bicicletas:

https://towardsdatascience.com/go-highly-accurate-or-go-home-61828afb0b13

**Más datasets** 

https://archive.ics.uci.edu/ml/index.php

