In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv('/Users/Selma/dev/STAT3007-timeseries_forecasting/data/dataeqnr_max_daily.csv')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Scaled_close
0,2001-06-18 00:00:00-04:00,2.507967,2.514673,2.504614,2.504614,7189500,0.0,0.0,-0.973381
1,2001-06-19 00:00:00-04:00,2.531437,2.591789,2.531437,2.554907,1410700,0.0,0.0,-0.970356
2,2001-06-20 00:00:00-04:00,2.575024,2.578377,2.551554,2.561612,550400,0.0,0.0,-0.969953
3,2001-06-21 00:00:00-04:00,2.538142,2.548201,2.531436,2.538142,643600,0.0,0.0,-0.971364
4,2001-06-22 00:00:00-04:00,2.521378,2.531436,2.507966,2.518025,822600,0.0,0.0,-0.972574


In [3]:
# Converting date column from str to timestamp
# Remove a specific substring from the end
def remove_suffix(s, suffix='00:00:00-04:00'):
    if s.endswith(suffix):
        return s[:-len(suffix)]
    return s

data['Date'] = data['Date'].apply(lambda x: remove_suffix(x, '00:00:00-04:00'))
data['Date'] = data['Date'].apply(lambda x: remove_suffix(x, '00:00:00-05:00'))

In [4]:
# Scale the closeprice to the range [-1, 1] and add it as a new column
scaler = MinMaxScaler(feature_range=(-1, 1))
data['Scaled_tomorrow'] = scaler.fit_transform(data[['Close']])
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Scaled_close,Scaled_tomorrow
0,2001-06-18,2.507967,2.514673,2.504614,2.504614,7189500,0.0,0.0,-0.973381,-0.973381
1,2001-06-19,2.531437,2.591789,2.531437,2.554907,1410700,0.0,0.0,-0.970356,-0.970356
2,2001-06-20,2.575024,2.578377,2.551554,2.561612,550400,0.0,0.0,-0.969953,-0.969953
3,2001-06-21,2.538142,2.548201,2.531436,2.538142,643600,0.0,0.0,-0.971364,-0.971364
4,2001-06-22,2.521378,2.531436,2.507966,2.518025,822600,0.0,0.0,-0.972574,-0.972574


In [10]:
import ta
from sklearn.preprocessing import StandardScaler

# Adding technical indicators
data['SMA_10'] = ta.trend.sma_indicator(data['Scaled_close'], window=10)
data['SMA_30'] = ta.trend.sma_indicator(data['Scaled_close'], window=30)
data['RSI'] = ta.momentum.rsi(data['Scaled_close'], window=14)
data['MACD'] = ta.trend.macd(data['Scaled_close'])

# Ensure no NaN values
data.dropna(inplace=True)

# Create a rolling window of past 30 days including the new features
features = ['Scaled_close', 'SMA_10', 'SMA_30', 'RSI', 'MACD']
window_size = 30

# Creating the dataset
X = []
y_1d = []
y_3d = []
y_5d = []

for i in range(len(data) - window_size - 5):
    # Extracting the features for the current window
    X.append(data[features].iloc[i:i+window_size].values.flatten())

    # Extracting the target values for 1, 3, and 5 days ahead
    y_1d.append(data['Scaled_close'].iloc[i+window_size+1])
    y_3d.append(data['Scaled_close'].iloc[i+window_size+3])
    y_5d.append(data['Scaled_close'].iloc[i+window_size+5])

X = np.array(X)
y_1d = np.array(y_1d)
y_3d = np.array(y_3d)
y_5d = np.array(y_5d)

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train_1d, y_test_1d = train_test_split(X_scaled, y_1d, test_size=0.2, shuffle=False)
_, _, y_train_3d, y_test_3d = train_test_split(X_scaled, y_3d, test_size=0.2, shuffle=False)
_, _, y_train_5d, y_test_5d = train_test_split(X_scaled, y_5d, test_size=0.2, shuffle=False)

# Initialize and train the Random Forest models
model_1d = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_3d = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_5d = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

model_1d.fit(X_train, y_train_1d)
model_3d.fit(X_train, y_train_3d)
model_5d.fit(X_train, y_train_5d)

# Make predictions
y_pred_1d = model_1d.predict(X_test)
y_pred_3d = model_3d.predict(X_test)
y_pred_5d = model_5d.predict(X_test)


In [14]:
import plotly.graph_objs as go

# Plot 1-day ahead actual vs. predicted prices
fig_1d = go.Figure()
fig_1d.add_trace(go.Scatter(x=data.index[window_size+len(X_train):], y=y_test_1d,
                            mode='lines', name='Actual Prices 1-day ahead'))
fig_1d.add_trace(go.Scatter(x=data.index[window_size+len(X_train):], y=y_pred_1d,
                            mode='lines', name='Predicted Prices 1-day ahead'))
fig_1d.update_layout(title='Actual vs. Predicted Stock Prices (1-day ahead)',
                     xaxis_title='Date',
                     yaxis_title='Price',
                     legend_title='Legend')
fig_1d.show()

# Plot 3-day ahead actual vs. predicted prices
fig_3d = go.Figure()
fig_3d.add_trace(go.Scatter(x=data.index[window_size+len(X_train):], y=y_test_3d,
                            mode='lines', name='Actual Prices 3-day ahead'))
fig_3d.add_trace(go.Scatter(x=data.index[window_size+len(X_train):], y=y_pred_3d,
                            mode='lines', name='Predicted Prices 3-day ahead'))
fig_3d.update_layout(title='Actual vs. Predicted Stock Prices (3-day ahead)',
                     xaxis_title='Date',
                     yaxis_title='Price',
                     legend_title='Legend')
fig_3d.show()

# Plot 5-day ahead actual vs. predicted prices
fig_5d = go.Figure()
fig_5d.add_trace(go.Scatter(x=data.index[window_size+len(X_train):], y=y_test_5d,
                            mode='lines', name='Actual Prices 5-day ahead'))
fig_5d.add_trace(go.Scatter(x=data.index[window_size+len(X_train):], y=y_pred_5d,
                            mode='lines', name='Predicted Prices 5-day ahead'))
fig_5d.update_layout(title='Actual vs. Predicted Stock Prices (5-day ahead)',
                     xaxis_title='Date',
                     yaxis_title='Price',
                     legend_title='Legend')
fig_5d.show()


In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def calculate_rmse(y_true, y_pred):
    """Calculate Root Mean Squared Error (RMSE)."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

def calculate_mae(y_true, y_pred):
    """Calculate Mean Absolute Error (MAE)."""
    return mean_absolute_error(y_true, y_pred)

def calculate_r2(y_true, y_pred):
    """Calculate R-squared (R2) score."""
    return r2_score(y_true, y_pred)

def calculate_mape(y_true, y_pred):
    """Calculate Mean Absolute Percentage Error (MAPE)."""
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# 1-day ahead error metrics
rmse_1d = calculate_rmse(y_test_1d, y_pred_1d)
mae_1d = calculate_mae(y_test_1d, y_pred_1d)
r2_1d = calculate_r2(y_test_1d, y_pred_1d)
mape_1d = calculate_mape(y_test_1d, y_pred_1d)
print("")
print(f'1-day ahead RMSE: {rmse_1d}')
print(f'1-day ahead MAE: {mae_1d}')
print(f'1-day ahead R2: {r2_1d}')
print(f'1-day ahead MAPE: {mape_1d}%')

# 3-day ahead error metrics
rmse_3d = calculate_rmse(y_test_3d, y_pred_3d)
mae_3d = calculate_mae(y_test_3d, y_pred_3d)
r2_3d = calculate_r2(y_test_3d, y_pred_3d)
mape_3d = calculate_mape(y_test_3d, y_pred_3d)
print("")
print(f'3-day ahead RMSE: {rmse_3d}')
print(f'3-day ahead MAE: {mae_3d}')
print(f'3-day ahead R2: {r2_3d}')
print(f'3-day ahead MAPE: {mape_3d}%')

# 5-day ahead error metrics
rmse_5d = calculate_rmse(y_test_5d, y_pred_5d)
mae_5d = calculate_mae(y_test_5d, y_pred_5d)
r2_5d = calculate_r2(y_test_5d, y_pred_5d)
mape_5d = calculate_mape(y_test_5d, y_pred_5d)
print("")
print(f'5-day ahead RMSE: {rmse_5d}')
print(f'5-day ahead MAE: {mae_5d}')
print(f'5-day ahead R2: {r2_5d}')
print(f'5-day ahead MAPE: {mape_5d}%')



1-day ahead RMSE: 0.39883419888515453
1-day ahead MAE: 0.2885225959453053
1-day ahead R2: 0.19973301752100048
1-day ahead MAPE: 74.96531123306313%

3-day ahead RMSE: 0.417724644046949
3-day ahead MAE: 0.30751260137755965
3-day ahead R2: 0.12129196559852573
3-day ahead MAPE: 111.52103670490425%

5-day ahead RMSE: 0.43563060194253284
5-day ahead MAE: 0.32521647622079886
5-day ahead R2: 0.04335907458762478
5-day ahead MAPE: 117.13323440048751%
