In [501]:
# Import necessary libraries
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from finta import TA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from keras.regularizers import l2
from sklearn.model_selection import KFold
from tensorflow.keras.models import clone_model


In [502]:
# Load data
AAPL_path = Path("../files/AAPL.csv")
VIX_path = Path("../files/^VIX.csv")
FEDFUNDS_path = Path("../files/FEDFUNDS (1).csv")
SPY_index = Path("../files/SPY.csv")

In [503]:
apple_df = pd.read_csv(AAPL_path, index_col="Date")
fear_index_df = pd.read_csv(VIX_path, index_col="Date").rename(columns={"Close": "Fear_index"})
spy_index_df = pd.read_csv(SPY_index, index_col="Date").rename(columns={"Close": "SPY_index"})
fedfunds_df = pd.read_csv(FEDFUNDS_path, index_col="DATE")

In [504]:
# Convert index to datetime index
fedfunds_df.index = pd.to_datetime(fedfunds_df.index)

# Resample the fed_funds_df to have daily frequency and forward fill the values
fedfunds_df_monthly = fedfunds_df.resample('D').ffill()


In [505]:
# Concatenate dataframes
concatenated_df = pd.concat([apple_df, fear_index_df['Fear_index'], spy_index_df['SPY_index']], axis=1)
concatenated_df.index = pd.to_datetime(concatenated_df.index)


In [506]:
# Merge with fedfunds_df
concatenated_df = pd.merge(concatenated_df, fedfunds_df_monthly, left_index=True, right_index=True)


In [507]:
# Drop rows with NaN values
concatenated_df = concatenated_df.dropna()


In [508]:
# Shift target variable
concatenated_df['Target'] = concatenated_df['Close'].shift(-5)
concatenated_df = concatenated_df.dropna()

In [509]:
concatenated_df.head(10)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS,Target
2014-04-30,21.165714,21.408215,21.064285,21.074642,18.562872,456640800.0,13.41,188.309998,0.09,21.154642
2014-05-01,21.142857,21.242857,20.941429,21.124287,18.60659,244048000.0,13.25,188.330002,0.09,20.999643
2014-05-02,21.155001,21.22143,21.061071,21.16357,18.641197,191514400.0,12.91,188.059998,0.09,20.912144
2014-05-05,21.076429,21.464287,21.071428,21.462856,18.90481,287067200.0,13.29,188.419998,0.09,21.172501
2014-05-06,21.492857,21.586071,21.22893,21.22893,18.698767,374564400.0,13.8,186.779999,0.09,21.205713
2014-05-07,21.258928,21.331785,20.990356,21.154642,18.633331,282864400.0,13.4,187.880005,0.09,21.209642
2014-05-08,21.008928,21.22893,20.942858,20.999643,18.600111,230297200.0,13.43,187.679993,0.09,21.029285
2014-05-09,20.876429,20.9375,20.72607,20.912144,18.522621,291597600.0,12.92,187.960007,0.09,21.339643
2014-05-12,20.981787,21.202143,20.978571,21.172501,18.753223,213208800.0,12.23,189.789993,0.09,21.592501
2014-05-13,21.142857,21.23357,21.09643,21.205713,18.78264,159737200.0,12.13,189.960007,0.09,21.596786


In [510]:
# Calculate technical indicators using finta
data = concatenated_df.copy()  # Use the existing DataFrame concatenated_df
data['MA'] = TA.SMA(data, 20)  # 20-period Simple Moving Average
data['RSI'] = TA.RSI(data, 14)  # 14-period RSI

# Calculate Bollinger Bands correctly
bb_bands = TA.BBANDS(data, 20, 2)

# Assign Bollinger Bands values to DataFrame columns
data['BB_UPPER'] = bb_bands['BB_UPPER']
data['BB_MIDDLE'] = bb_bands['BB_MIDDLE']
data['BB_LOWER'] = bb_bands['BB_LOWER']

# Convert index to datetime
data.index = pd.to_datetime(data.index)

# Display the calculated technical indicators
data.tail()


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS,Target,MA,RSI,BB_UPPER,BB_MIDDLE,BB_LOWER
2024-03-18,175.570007,177.710007,173.520004,173.720001,173.720001,75604200.0,14.33,512.859985,5.33,170.850006,176.3455,40.189942,187.047394,176.3455,165.643607
2024-03-19,174.339996,176.610001,173.029999,176.080002,176.080002,55215200.0,13.82,515.710022,5.33,169.710007,176.0715,46.52582,186.488066,176.0715,165.654935
2024-03-20,175.720001,178.669998,175.089996,178.669998,178.669998,53423100.0,13.04,520.47998,5.33,173.309998,175.889,52.475843,185.967014,175.889,165.810986
2024-03-21,177.050003,177.490005,170.839996,171.369995,171.369995,106181300.0,12.92,522.200012,5.33,171.479996,175.239,39.227222,184.670016,175.239,165.807983
2024-03-22,171.759995,173.050003,170.059998,172.279999,172.279999,71106600.0,13.06,521.210022,5.33,170.029999,174.727,41.219498,183.588319,174.727,165.865681


In [511]:
data = data.dropna()
data.head(10)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS,Target,MA,RSI,BB_UPPER,BB_MIDDLE,BB_LOWER
2014-05-28,22.357857,22.493929,22.277857,22.286072,19.73955,315481600.0,11.68,191.380005,0.09,23.029285,21.408607,77.987931,22.226638,21.408607,20.590576
2014-05-29,22.423214,22.745358,22.420357,22.692142,20.099224,376474000.0,11.57,192.369995,0.09,23.119642,21.489482,82.673633,22.47182,21.489482,20.507144
2014-05-30,22.785,23.006071,22.460714,22.607143,20.023937,564020800.0,11.4,192.679993,0.09,23.05607,21.563625,78.888135,22.648405,21.563625,20.478845
2014-06-02,22.64143,22.672501,22.232143,22.451786,19.886335,369350800.0,11.58,192.899994,0.1,23.424999,21.628036,72.365965,22.764546,21.628036,20.491526
2014-06-03,22.445,22.812143,22.4375,22.769285,20.167553,292709200.0,11.87,192.800003,0.1,23.5625,21.693357,76.620166,22.935189,21.693357,20.451526
2014-06-04,22.765715,23.138929,22.718214,23.029285,20.39784,335482000.0,12.08,193.190002,0.1,23.465,21.783375,79.414911,23.139231,21.783375,20.427519
2014-06-05,23.078571,23.191786,22.950357,23.119642,20.477877,303805600.0,11.68,194.449997,0.1,23.0725,21.881625,80.296405,23.327445,21.881625,20.435805
2014-06-06,23.210714,23.259287,23.016787,23.05607,20.42157,349938400.0,10.73,195.380005,0.1,22.82,21.984446,77.773016,23.458386,21.984446,20.510507
2014-06-09,23.174999,23.469999,22.9375,23.424999,20.748339,301660000.0,11.15,195.580002,0.1,23.049999,22.110089,81.421837,23.626942,22.110089,20.593236
2014-06-10,23.682501,23.762501,23.3925,23.5625,20.870127,251108000.0,10.99,195.600006,0.1,23.02,22.229589,82.570282,23.810649,22.229589,20.648528


In [512]:
# Define features and target
X = concatenated_df.drop("Close", axis=1)
y = concatenated_df["Close"]

In [513]:
data.drop(columns=['Open', 'High', 'Low', 'Close', 'Adj Close'], inplace=True)


In [514]:
# Display the modified DataFrame
data.head()

Unnamed: 0,Volume,Fear_index,SPY_index,FEDFUNDS,Target,MA,RSI,BB_UPPER,BB_MIDDLE,BB_LOWER
2014-05-28,315481600.0,11.68,191.380005,0.09,23.029285,21.408607,77.987931,22.226638,21.408607,20.590576
2014-05-29,376474000.0,11.57,192.369995,0.09,23.119642,21.489482,82.673633,22.47182,21.489482,20.507144
2014-05-30,564020800.0,11.4,192.679993,0.09,23.05607,21.563625,78.888135,22.648405,21.563625,20.478845
2014-06-02,369350800.0,11.58,192.899994,0.1,23.424999,21.628036,72.365965,22.764546,21.628036,20.491526
2014-06-03,292709200.0,11.87,192.800003,0.1,23.5625,21.693357,76.620166,22.935189,21.693357,20.451526


In [515]:
data_clean = data.dropna()
data_clean.to_csv('../clean_data/AAPL_prepared_data.csv', index=True)

In [516]:
data_clean

Unnamed: 0,Volume,Fear_index,SPY_index,FEDFUNDS,Target,MA,RSI,BB_UPPER,BB_MIDDLE,BB_LOWER
2014-05-28,315481600.0,11.68,191.380005,0.09,23.029285,21.408607,77.987931,22.226638,21.408607,20.590576
2014-05-29,376474000.0,11.57,192.369995,0.09,23.119642,21.489482,82.673633,22.471820,21.489482,20.507144
2014-05-30,564020800.0,11.40,192.679993,0.09,23.056070,21.563625,78.888135,22.648405,21.563625,20.478845
2014-06-02,369350800.0,11.58,192.899994,0.10,23.424999,21.628036,72.365965,22.764546,21.628036,20.491526
2014-06-03,292709200.0,11.87,192.800003,0.10,23.562500,21.693357,76.620166,22.935189,21.693357,20.451526
...,...,...,...,...,...,...,...,...,...,...
2024-03-18,75604200.0,14.33,512.859985,5.33,170.850006,176.345500,40.189942,187.047394,176.345500,165.643607
2024-03-19,55215200.0,13.82,515.710022,5.33,169.710007,176.071500,46.525820,186.488066,176.071500,165.654935
2024-03-20,53423100.0,13.04,520.479980,5.33,173.309998,175.889000,52.475843,185.967014,175.889000,165.810986
2024-03-21,106181300.0,12.92,522.200012,5.33,171.479996,175.239000,39.227222,184.670016,175.239000,165.807983


In [517]:
# Define date cutoff for data split
date_cutoff = "2022-04-30"

# Split data
X_train = X[X.index <= date_cutoff]
X_test = X[X.index > date_cutoff]
y_train = y[y.index <= date_cutoff]
y_test = y[y.index > date_cutoff]

In [518]:
# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [519]:
# Initialize and train RandomForestRegressor model
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# # Make predictions on the training set
# y_train_pred = rf_model.predict(X_train)
# # Make predictions on the testing set
# y_test_pred = rf_model.predict(X_test)

# # Evaluate the model
# mse_train = mean_squared_error(y_train, y_train_pred)
# mse_test = mean_squared_error(y_test, y_test_pred)
# r2_train = r2_score(y_train, y_train_pred)
# r2_test = r2_score(y_test, y_test_pred)

# print("Mean Squared Error (Train):", mse_train)
# print("Mean Squared Error (Test):", mse_test)
# print("R-squared (Train):", r2_train)
# print("R-squared (Test):", r2_test)

In [520]:
# from sklearn.model_selection import cross_val_score

# # Perform cross-validation
# cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='r2')

# print("Cross-Validation R-squared scores:", cv_scores)
# print("Mean R-squared:", cv_scores.mean())


In [521]:
# # Define input layer
# inputs = Input(shape=(X_train_scaled.shape[1],))

# # Define hidden layers with L2 regularization
# hidden1 = Dense(units=16, activation='relu', kernel_regularizer=l2(0.0005))(inputs)  # Decreased regularization strength
# dropout1 = Dropout(0.6)(hidden1)  # Increased dropout rate

# hidden2 = Dense(units=8, activation='relu', kernel_regularizer=l2(0.0005))(dropout1)  # Decreased regularization strength
# dropout2 = Dropout(0.6)(hidden2)  # Increased dropout rate

# # Define output layer
# outputs = Dense(units=1)(dropout2)

# # Define the model
# nn = Model(inputs=inputs, outputs=outputs)

# # Compile the model
# nn.compile(optimizer='adam', loss='mean_squared_error')

# # Train the model
# history = nn.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

# # Evaluate the model on training data
# train_predictions = nn.predict(X_train_scaled)
# train_r2 = r2_score(y_train, train_predictions)

# # Evaluate the model on test data
# test_predictions = nn.predict(X_test_scaled)
# test_r2 = r2_score(y_test, test_predictions)

# print("R-squared (Train):", train_r2)
# print("R-squared (Test):", test_r2)

In [522]:
# Define the number of folds
k = 10

# Initialize lists to store R-squared scores
train_r2_scores = []
test_r2_scores = []

# Initialize KFold
kf = KFold(n_splits=k, shuffle=True)

# Define the model architecture
model = Sequential([
    Dense(units=16, activation='relu', kernel_regularizer=l2(0.0005), input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.6),
    Dense(units=8, activation='relu', kernel_regularizer=l2(0.0005)),
    Dropout(0.6),
    Dense(units=1)
])

# Example: Train with a smaller learning rate
from keras.optimizers import Adam
adam = Adam(learning_rate=0.0001)  # Adjust learning rate as needed
model.compile(optimizer=adam, loss='mean_squared_error')
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# Compile the model
model.compile(optimizer=adam, loss='mean_squared_error')

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X_train_scaled):
    X_train_cv, X_test_cv = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
    
    
    # Evaluate the model on training data
    train_predictions = model.predict(X_train_cv)
    train_r2 = r2_score(y_train_cv, train_predictions)
    train_r2_scores.append(train_r2)
    
    # Evaluate the model on test data
    test_predictions = model.predict(X_test_cv)
    test_r2 = r2_score(y_test_cv, test_predictions)
    test_r2_scores.append(test_r2)

# Calculate average R-squared scores
avg_train_r2 = np.mean(train_r2_scores)
avg_test_r2 = np.mean(test_r2_scores)

print("Average R-squared (Train):", avg_train_r2)
print("Average R-squared (Test):", avg_test_r2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 581us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 487us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 319us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 385us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 545us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 296us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 481us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 294us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 457us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 418us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━

In [523]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize lists to store metrics for selected models
selected_train_r2 = []
selected_test_r2 = []
selected_train_mae = []
selected_train_mse = []
selected_test_mae = []
selected_test_mse = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X_train_scaled):
    X_train_cv, X_test_cv = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
    
    # Define a new model for each fold
    model_fold = Sequential([
        Dense(32, activation='relu', input_shape=(X_train_cv.shape[1],)),
        Dropout(0.5),  # Dropout layer with a dropout rate of 0.5
        Dense(16, activation='relu'),
        Dropout(0.5),  # Dropout layer with a dropout rate of 0.5
        Dense(1)  # Output layer
    ])
    
    # Compile the model with Adam optimizer and mean squared error loss
    model_fold.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    
    # Train the model
    model_fold.fit(X_train_cv, y_train_cv, epochs=50, batch_size=32, verbose=0)
    
    # Evaluate the model on training data
    train_predictions = model_fold.predict(X_train_cv)
    train_r2 = r2_score(y_train_cv, train_predictions)
    train_mae = mean_absolute_error(y_train_cv, train_predictions)
    train_mse = mean_squared_error(y_train_cv, train_predictions)
    
    # Evaluate the model on test data
    test_predictions = model_fold.predict(X_test_cv)
    test_r2 = r2_score(y_test_cv, test_predictions)
    test_mae = mean_absolute_error(y_test_cv, test_predictions)
    test_mse = mean_squared_error(y_test_cv, test_predictions)
    
    # Append metrics to the lists
    selected_train_r2.append(train_r2)
    selected_test_r2.append(test_r2)
    selected_train_mae.append(train_mae)
    selected_train_mse.append(train_mse)
    selected_test_mae.append(test_mae)
    selected_test_mse.append(test_mse)

# Print metrics for selected models
for idx, (train_r2, test_r2, train_mae, train_mse, test_mae, test_mse) in enumerate(zip(selected_train_r2, selected_test_r2, selected_train_mae, selected_train_mse, selected_test_mae, selected_test_mse), start=1):
    print(f"Model {idx} - Train R-squared: {train_r2}, Test R-squared: {test_r2}, Train MAE: {train_mae}, Train MSE: {train_mse}, Test MAE: {test_mae}, Test MSE: {test_mse}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 553us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 446us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 558us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 550us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 423us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 567us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 560us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 468us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 552us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 487us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 557us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 439us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 463us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 567us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 458us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 564us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 539us/step
Model 1 - Train R-squared: 0.9435398065370532, Test R-squared: 0.9433548572982372, Train MAE: 8.407401798172485, Train MSE: 112.5471669604941, Test MAE: 8.306337159873433, Test MSE: 114.60441102025348
Model 2 - Train R-squared: 0.908121140796527, Test R-squared: 0.9126188116508898, Train MAE: 11.092757101716197, Train MSE: 183.92008444554398, Test MAE: 10.567485334571762, Test MSE: 170.1891621638483
Model 3 - Train R-squared: 0.9358534286728415, Test R-squared: 0.9427257923587764, Train MAE: 9.309639957230573, Train MSE: 126.62040172511185, Test MAE: 9.14317804946265, Test MSE: 125.79339913295453
Model 4 - Train R-squared: 0.9466797917048149, Test R-squared: 0.9407046705779781, Train MAE: 8.15947448896347, Train MSE: 106.54555058399025, Test MAE: 8.733980184102576, Test MSE: 117.15594250719721
Model 5 - Train R-squared: 0.9513384206890326, 

In [524]:
# Filter models where both Train R-squared and Test R-squared are less than 0.96
filtered_indices = [i for i, (train_r2, test_r2) in enumerate(zip(selected_train_r2, selected_test_r2)) if train_r2 < 0.96 and test_r2 < 0.96]

# Calculate the absolute difference between train R-squared and test R-squared values for filtered models
abs_diff_r2_filtered = np.abs(np.array(selected_train_r2)[filtered_indices] - np.array(selected_test_r2)[filtered_indices])

# Find the index of the model with the smallest absolute difference among filtered models
best_model_index = filtered_indices[np.argmin(abs_diff_r2_filtered)]

# Retrieve the metrics for the best model
best_train_r2 = selected_train_r2[best_model_index]
best_test_r2 = selected_test_r2[best_model_index]
best_train_mae = selected_train_mae[best_model_index]
best_train_mse = selected_train_mse[best_model_index]
best_test_mae = selected_test_mae[best_model_index]
best_test_mse = selected_test_mse[best_model_index]

# Print metrics for the best model
print(f"Best Model - Train R-squared: {best_train_r2}, Test R-squared: {best_test_r2}, Train MAE: {best_train_mae}, Train MSE: {best_train_mse}, Test MAE: {best_test_mae}, Test MSE: {best_test_mse}")


Best Model - Train R-squared: 0.9435398065370532, Test R-squared: 0.9433548572982372, Train MAE: 8.407401798172485, Train MSE: 112.5471669604941, Test MAE: 8.306337159873433, Test MSE: 114.60441102025348


In [525]:
# Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the model architecture
model = Sequential([
    Dense(units=16, activation='relu', kernel_regularizer=l2(0.0005), input_shape=(X_scaled.shape[1],)),
    Dropout(0.6),
    Dense(units=8, activation='relu', kernel_regularizer=l2(0.0005)),
    Dropout(0.6),
    Dense(units=1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model on the entire dataset
model.fit(X_scaled, y, epochs=50, batch_size=32, verbose=0)

# Predict sequentially on each data point
all_predictions = model.predict(X_scaled)

# Ensure the number of predictions matches the original dataset
assert len(all_predictions) == len(X_scaled)

# Create a DataFrame to store the actual and predicted values
predictions_df = pd.DataFrame({'Actual': y, 'Predicted': all_predictions.flatten()}, index=X.index)

# Ensure index uniqueness in both the original dataset and predictions DataFrame
data_clean_unique_index = data_clean.index.drop_duplicates()
predictions_df = predictions_df.loc[data_clean_unique_index]

# Display the DataFrame
predictions_df


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 543us/step


Unnamed: 0,Actual,Predicted
2014-05-28,22.286072,21.790688
2014-05-29,22.692142,22.539791
2014-05-30,22.607143,25.859711
2014-06-02,22.451786,22.487484
2014-06-03,22.769285,21.647545
...,...,...
2024-03-18,173.720001,149.186066
2024-03-19,176.080002,149.816360
2024-03-20,178.669998,151.729401
2024-03-21,171.369995,148.198685


In [526]:
# Check for NaN values in the predictions DataFrame
nan_values = predictions_df.isnull().sum().sum()

if nan_values == 0:
    print("No NaN values found in the predictions DataFrame.")
    print(predictions_df)
else:
    print(f"Found {nan_values} NaN values in the predictions DataFrame. Please check your data or model.")


No NaN values found in the predictions DataFrame.
                Actual   Predicted
2014-05-28   22.286072   21.790688
2014-05-29   22.692142   22.539791
2014-05-30   22.607143   25.859711
2014-06-02   22.451786   22.487484
2014-06-03   22.769285   21.647545
...                ...         ...
2024-03-18  173.720001  149.186066
2024-03-19  176.080002  149.816360
2024-03-20  178.669998  151.729401
2024-03-21  171.369995  148.198685
2024-03-22  172.279999  148.028397

[2473 rows x 2 columns]


In [527]:
# Calculate the percentage difference between actual and predicted values
predictions_df['Percentage Difference (%)'] = ((predictions_df['Predicted'] - predictions_df['Actual']) / predictions_df['Actual']) * 100

# Display the DataFrame with percentage difference
predictions_df


Unnamed: 0,Actual,Predicted,Percentage Difference (%)
2014-05-28,22.286072,21.790688,-2.222843
2014-05-29,22.692142,22.539791,-0.671382
2014-05-30,22.607143,25.859711,14.387345
2014-06-02,22.451786,22.487484,0.158998
2014-06-03,22.769285,21.647545,-4.926550
...,...,...,...
2024-03-18,173.720001,149.186066,-14.122689
2024-03-19,176.080002,149.816360,-14.915744
2024-03-20,178.669998,151.729401,-15.078411
2024-03-21,171.369995,148.198685,-13.521218


In [528]:
# Calculate the absolute percentage difference for each data point
predictions_df['Abs_Percentage_Diff'] = abs((predictions_df['Actual'] - predictions_df['Predicted']) / predictions_df['Actual']) * 100

# Calculate the average percentage difference
avg_percentage_diff = predictions_df['Abs_Percentage_Diff'].mean()

print("Average Percentage Difference (%):", avg_percentage_diff)


Average Percentage Difference (%): 20.067400308351925
