In [35]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 

import plotly.express as px
import math
from itertools import cycle

In [36]:
# Load the dataset
data = pd.read_csv('/Users/Selma/dev/STAT3007-timeseries_forecasting/data/dataeqnr_max_daily.csv')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Scaled_close
0,2001-06-18 00:00:00-04:00,2.507967,2.514673,2.504614,2.504614,7189500,0.0,0.0,-0.973381
1,2001-06-19 00:00:00-04:00,2.531437,2.591789,2.531437,2.554907,1410700,0.0,0.0,-0.970356
2,2001-06-20 00:00:00-04:00,2.575024,2.578377,2.551554,2.561612,550400,0.0,0.0,-0.969953
3,2001-06-21 00:00:00-04:00,2.538142,2.548201,2.531436,2.538142,643600,0.0,0.0,-0.971364
4,2001-06-22 00:00:00-04:00,2.521378,2.531436,2.507966,2.518025,822600,0.0,0.0,-0.972574


In [37]:
# Scale the closeprice to the range [-1, 1] and add it as a new column
scaler = MinMaxScaler(feature_range=(-1, 1))
data['Scaled_close'] = scaler.fit_transform(data[['Close']])
data.head()


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Scaled_close
0,2001-06-18 00:00:00-04:00,2.507967,2.514673,2.504614,2.504614,7189500,0.0,0.0,-0.973381
1,2001-06-19 00:00:00-04:00,2.531437,2.591789,2.531437,2.554907,1410700,0.0,0.0,-0.970356
2,2001-06-20 00:00:00-04:00,2.575024,2.578377,2.551554,2.561612,550400,0.0,0.0,-0.969953
3,2001-06-21 00:00:00-04:00,2.538142,2.548201,2.531436,2.538142,643600,0.0,0.0,-0.971364
4,2001-06-22 00:00:00-04:00,2.521378,2.531436,2.507966,2.518025,822600,0.0,0.0,-0.972574


In [38]:
# Converting date column from str to timestamp
# Remove a specific substring from the end
def remove_suffix(s, suffix='00:00:00-04:00'):
    if s.endswith(suffix):
        return s[:-len(suffix)]
    return s

data['Date'] = data['Date'].apply(lambda x: remove_suffix(x, '00:00:00-04:00'))
data['Date'] = data['Date'].apply(lambda x: remove_suffix(x, '00:00:00-05:00'))

In [39]:
data['Date'] = pd.to_datetime(data['Date'])
data = data.drop(columns=["Close"])
data.head()

Unnamed: 0,Date,Open,High,Low,Volume,Dividends,Stock Splits,Scaled_close
0,2001-06-18,2.507967,2.514673,2.504614,7189500,0.0,0.0,-0.973381
1,2001-06-19,2.531437,2.591789,2.531437,1410700,0.0,0.0,-0.970356
2,2001-06-20,2.575024,2.578377,2.551554,550400,0.0,0.0,-0.969953
3,2001-06-21,2.538142,2.548201,2.531436,643600,0.0,0.0,-0.971364
4,2001-06-22,2.521378,2.531436,2.507966,822600,0.0,0.0,-0.972574


In [40]:
scaled_data = data[['Date', 'Scaled_close']]
print("Shape of close dataframe:", scaled_data.shape)
scaled_data.head()

Shape of close dataframe: (5746, 2)


Unnamed: 0,Date,Scaled_close
0,2001-06-18,-0.973381
1,2001-06-19,-0.970356
2,2001-06-20,-0.969953
3,2001-06-21,-0.971364
4,2001-06-22,-0.972574


In [51]:
def create_lag_features(data, number_of_lags):
    for lag in range(1, number_of_lags + 1):
        data[f'lag_{lag}'] = data['Scaled_close'].shift(lag)
    return data

# Example usage:
lag_data = create_lag_features(data, 5)
lag_data.dropna(inplace=True)  # Remove rows with NaN values that result from lagging
lag_data.head()

Unnamed: 0,Date,Open,High,Low,Volume,Dividends,Stock Splits,Scaled_close,lag_1,lag_2,lag_3,lag_4,lag_5
8,2001-06-28,2.427497,2.427497,2.38391,259000,0.0,0.0,-0.980439,-0.973986,-0.972776,-0.972776,-0.972574,-0.971364
9,2001-06-29,2.434203,2.504614,2.434203,100600,0.0,0.0,-0.975398,-0.980439,-0.973986,-0.972776,-0.972776,-0.972574
10,2001-07-02,2.397321,2.43085,2.393969,114300,0.0,0.0,-0.978826,-0.975398,-0.980439,-0.973986,-0.972776,-0.972776
11,2001-07-03,2.383909,2.410732,2.363791,128000,0.0,0.0,-0.979028,-0.978826,-0.975398,-0.980439,-0.973986,-0.972776
12,2001-07-05,2.410732,2.410732,2.360439,144300,0.0,0.0,-0.981851,-0.979028,-0.978826,-0.975398,-0.980439,-0.973986


In [52]:
# Assuming you have already created your feature set and target variable
X = lag_data.drop(['Scaled_close', 'Date'], axis=1)
y = lag_data['Scaled_close']
feature_list = list(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)


In [53]:
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test", y_test.shape)

X_train:  (4016, 11)
y_train:  (4016,)
X_test:  (1722, 11)
y_test (1722,)


In [54]:
# Code Snippet: Random Forest Model Creation and Training
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [55]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Low                  Importance: 0.51
Variable: High                 Importance: 0.47
Variable: Open                 Importance: 0.02
Variable: Volume               Importance: 0.0
Variable: Dividends            Importance: 0.0
Variable: Stock Splits         Importance: 0.0
Variable: lag_1                Importance: 0.0
Variable: lag_2                Importance: 0.0
Variable: lag_3                Importance: 0.0
Variable: lag_4                Importance: 0.0
Variable: lag_5                Importance: 0.0


In [56]:
train_predict=rf.predict(X_train)
test_predict=rf.predict(X_test)

train_predict = train_predict.reshape(-1,1)
test_predict = test_predict.reshape(-1,1)

print("Train data prediction:", train_predict.shape)
print("Test data prediction:", test_predict.shape)

Train data prediction: (4016, 1)
Test data prediction: (1722, 1)


In [57]:
# Transform back to original form
train_predict = scaler.inverse_transform(train_predict)
test_predict = scaler.inverse_transform(test_predict)
original_ytrain = scaler.inverse_transform(y_train.values.reshape(-1,1))
original_ytest = scaler.inverse_transform(y_test.values.reshape(-1,1))

Evaluation metrices RMSE, MSE and MAE

Root Mean Square Error (RMSE), Mean Square Error (MSE) and Mean absolute Error (MAE) are a standard way to measure the error of a model in predicting quantitative data.

In [58]:
# Evaluation metrices RMSE and MAE
print("Train data RMSE: ", math.sqrt(mean_squared_error(original_ytrain, train_predict)))
print("Train data MSE: ", mean_squared_error(original_ytrain, train_predict))
print("Train data MAE: ", mean_absolute_error(original_ytrain, train_predict))
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", math.sqrt(mean_squared_error(original_ytest, test_predict)))
print("Test data MSE: ", mean_squared_error(original_ytest, test_predict))
print("Test data MAE: ", mean_absolute_error(original_ytest, test_predict))

Train data RMSE:  0.02747518301357622
Train data MSE:  0.0007548856816295073
Train data MAE:  0.01879266848780261
-------------------------------------------------------------------------------------
Test data RMSE:  5.475742514175033
Test data MSE:  29.98375608154391
Test data MAE:  3.1808614179398553


R2 score for regression

R-squared (R2) is a statistical measure that represents the proportion of the variance for a dependent variable that's explained by an independent variable or variables in a regression model.

1 = Best

0 or < 0 = worse

In [59]:
print("Train data R2 score:", r2_score(original_ytrain, train_predict))
print("Test data R2 score:", r2_score(original_ytest, test_predict))

Train data R2 score: 0.9999563108566122
Test data R2 score: 0.34277312176331787


In [60]:
look_back = 30

# Ensure 'train_predict' and 'test_predict' can be accessed as numpy arrays
# Flatten arrays if they are not already flat
if isinstance(train_predict, np.ndarray):
    train_predict = train_predict.flatten()
if isinstance(test_predict, np.ndarray):
    test_predict = test_predict.flatten()

# Prepare empty arrays for plotting
trainPredictPlot = np.full_like(lag_data['Scaled_close'], np.nan, dtype=np.float64)
testPredictPlot = np.full_like(lag_data['Scaled_close'], np.nan, dtype=np.float64)

# Safely assign predictions to plotting arrays
train_len = min(len(trainPredictPlot) - look_back, len(train_predict))
test_len = min(len(testPredictPlot) - len(train_predict) - (look_back * 2) - 1, len(test_predict))

trainPredictPlot[look_back:look_back + train_len] = train_predict[:train_len]
testPredictPlot[len(train_predict) + (look_back * 2) + 1:len(train_predict) + (look_back * 2) + 1 + test_len] = test_predict[:test_len]

# Create DataFrame for plotting
plotdf = pd.DataFrame({
    'Date': lag_data['Date'],
    'original_close': lag_data['Scaled_close'],
    'train_predicted_close': trainPredictPlot,
    'test_predicted_close': testPredictPlot
})

# Configure plot with Plotly
names = cycle(['Original close price', 'Train predicted close price', 'Test predicted close price'])
fig = px.line(plotdf, x='Date', y=['original_close', 'train_predicted_close', 'test_predicted_close'],
              labels={'value': 'Stock price', 'Date': 'Date'})
fig.update_layout(title_text='Comparison between Original and Predicted Scaled Close Price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Scaled Close Price')
fig.for_each_trace(lambda t: t.update(name=next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()