In [1]:
import numpy as np
import pandas as pd

import warnings
# Blanket filter: with no category given, every warning is silenced for the
# rest of the session.
warnings.filterwarnings('ignore')

from statsmodels.tools.sm_exceptions import ValueWarning
# Explicitly ignore statsmodels' ValueWarning as well.
# NOTE(review): the blanket filter above already hides this category, and
# statsmodels is not otherwise used in the cells visible here - confirm whether
# this filter (and the import) is still needed.
warnings.simplefilter('ignore', ValueWarning)

In [2]:
import yfinance as yf

# Fetch daily XOM price history from Yahoo Finance, starting in 2018.
# NOTE(review): no `end` date is passed, so the frame grows on every re-run and
# the metrics recorded further down will not reproduce exactly.
df = yf.download(
    tickers='XOM',
    start="2018-01-01",
)
df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,83.820000,85.199997,83.660004,85.029999,62.638458,11469300
2018-01-03,85.160004,86.970001,84.820000,86.699997,63.868690,13957700
2018-01-04,86.790001,87.220001,86.430000,86.820000,63.957081,10863000
2018-01-05,86.750000,86.879997,85.709999,86.750000,63.905525,11047600
2018-01-08,86.699997,87.150002,86.599998,87.139999,64.192818,10927100
...,...,...,...,...,...,...
2024-04-08,121.080002,121.739998,120.209999,120.550003,120.550003,17513900
2024-04-09,121.000000,121.650002,120.339996,121.180000,121.180000,14135700
2024-04-10,121.220001,122.470001,120.730003,122.199997,122.199997,17409200
2024-04-11,122.690002,122.690002,120.339996,121.790001,121.790001,17553600


In [3]:
# Show the index range, column dtypes and non-null counts of the downloaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1580 entries, 2018-01-02 to 2024-04-12
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       1580 non-null   float64
 1   High       1580 non-null   float64
 2   Low        1580 non-null   float64
 3   Close      1580 non-null   float64
 4   Adj Close  1580 non-null   float64
 5   Volume     1580 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 86.4 KB


In [4]:
from sklearn.model_selection import train_test_split

# Chronological hold-out on the closing price: shuffle is disabled, so the
# final 20% of rows form the test set and the earlier rows form the train set.
train_data, test_data = train_test_split(
    df['Close'],
    shuffle=False,
    test_size=0.2,
)

In [5]:
# Window sizes: number of past observations per sample / number of steps to predict
input_length, output_length = 50, 10

# Prepare data
def prepare_data(data, input_length, output_length):
    """Slice a sequence into overlapping (input window, target window) pairs.

    Window ``i`` pairs ``data[i : i + input_length]`` with the
    ``output_length`` observations that immediately follow it. Consecutive
    windows are shifted by one step.

    Parameters
    ----------
    data : array-like
        Observations ordered along the first axis.
    input_length : int
        Number of observations in each input window.
    output_length : int
        Number of observations in each target window.

    Returns
    -------
    X : np.ndarray
        Input windows, shape ``(n_windows, input_length, ...)``.
    y : np.ndarray
        Target windows, shape ``(n_windows, output_length, ...)``.

    ``n_windows`` is ``len(data) - input_length - output_length + 1``, or 0
    when ``data`` is too short to hold a single pair. In that case the empty
    results still carry the window dimension instead of collapsing to shape
    ``(0,)``.
    """
    data = np.asarray(data)
    n_windows = len(data) - input_length - output_length + 1

    if n_windows <= 0:
        # Not enough observations for even one (input, target) pair: return
        # empty arrays that keep the per-window shape and the data's dtype.
        trailing = data.shape[1:]
        return (
            np.empty((0, input_length) + trailing, dtype=data.dtype),
            np.empty((0, output_length) + trailing, dtype=data.dtype),
        )

    X = np.stack([data[i:i + input_length] for i in range(n_windows)])
    y = np.stack([
        data[i + input_length:i + input_length + output_length]
        for i in range(n_windows)
    ])
    return X, y

# Window each split on its own so that no sample straddles the train/test boundary
X_train, y_train = prepare_data(
    train_data.values,
    input_length=input_length,
    output_length=output_length,
)
X_test, y_test = prepare_data(
    test_data.values,
    input_length=input_length,
    output_length=output_length,
)

In [6]:
# Flatten each sample into a single feature row before handing it to XGBoost
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

In [7]:
from xgboost import XGBRegressor

# Build the gradient-boosted regressor and fit it on the windowed training set;
# the targets are passed as a 2-D array with `output_length` columns.
model = XGBRegressor(
    n_estimators=100,
    max_depth=5,
    objective='reg:squarederror',
)
model.fit(X_train, y_train.reshape(-1, output_length))

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Predict the target window for every test sample
predictions = model.predict(X_test)
predictions = predictions.reshape(-1, output_length)

# Evaluate the model
# The square root of the mean squared error is the RMSE, so store it under a
# name that says so.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
# Legacy alias: this value was previously stored as `mse` even though it is the
# RMSE. Kept so any later cell that still reads `mse` sees the same number.
mse = rmse
print(f'Root Mean Squared Error: {rmse}')

mape = mean_absolute_percentage_error(y_test, predictions)
print(f'Mean Absolute Percentage Error: {mape}')

Root Mean Squared Error: 4.999358985594892
Mean Absolute Percentage Error: 0.03681281364607997
