In [68]:
# Import necessary libraries
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from finta import TA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from keras.regularizers import l2
from sklearn.model_selection import KFold

In [69]:
# Load data
TM_path = Path("../files/TM.csv")
VIX_path = Path("../files/^VIX.csv")
FEDFUNDS_path = Path("../files/FEDFUNDS (1).csv")
SPY_index = Path("../files/SPY.csv")

In [70]:
tm_df = pd.read_csv(TM_path, index_col="Date")
fear_index_df = pd.read_csv(VIX_path, index_col="Date").rename(columns={"Close": "Fear_index"})
spy_index_df = pd.read_csv(SPY_index, index_col="Date").rename(columns={"Close": "SPY_index"})
fedfunds_df = pd.read_csv(FEDFUNDS_path, index_col="DATE")

In [71]:
# Convert index to datetime index
fedfunds_df.index = pd.to_datetime(fedfunds_df.index)

# Resample the fed_funds_df to have daily frequency and forward fill the values
fedfunds_df_monthly = fedfunds_df.resample('D').ffill()


In [72]:
# Concatenate dataframes
concatenated_df = pd.concat([tm_df, fear_index_df['Fear_index'], spy_index_df['SPY_index']], axis=1)
concatenated_df.index = pd.to_datetime(concatenated_df.index)


In [73]:
# Merge with fedfunds_df
concatenated_df = pd.merge(concatenated_df, fedfunds_df_monthly, left_index=True, right_index=True)


In [74]:
# Drop rows with NaN values
concatenated_df = concatenated_df.dropna()


In [75]:
# Shift target variable
concatenated_df['Target'] = concatenated_df['Close'].shift(-5)
concatenated_df = concatenated_df.dropna()

In [76]:
concatenated_df.head(10)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS,Target
2014-04-30,108.0,108.5,107.669998,108.419998,100.839981,467700.0,13.41,188.309998,0.09,109.029999
2014-05-01,110.019997,110.150002,109.370003,109.730003,102.058388,340900.0,13.25,188.330002,0.09,108.150002
2014-05-02,109.989998,110.18,109.669998,109.93,102.244415,420500.0,12.91,188.059998,0.09,108.870003
2014-05-05,109.779999,109.93,109.410004,109.440002,101.788673,274800.0,13.29,188.419998,0.09,109.209999
2014-05-06,109.5,109.57,108.720001,109.0,101.379425,429400.0,13.8,186.779999,0.09,111.379997
2014-05-07,109.0,109.139999,108.580002,109.029999,101.407333,319000.0,13.4,187.880005,0.09,110.43
2014-05-08,108.5,108.610001,107.529999,108.150002,100.58886,366500.0,13.43,187.679993,0.09,109.5
2014-05-09,108.959999,109.379997,108.529999,108.870003,101.258522,371700.0,12.92,187.960007,0.09,109.610001
2014-05-12,108.870003,109.32,108.599998,109.209999,101.574753,346200.0,12.23,189.789993,0.09,108.610001
2014-05-13,110.209999,111.540001,110.089996,111.379997,103.59304,564200.0,12.13,189.960007,0.09,107.669998


In [77]:
# Calculate technical indicators using finta
data = concatenated_df.copy()  # Use the existing DataFrame concatenated_df
data['MA'] = TA.SMA(data, 20)  # 20-period Simple Moving Average
data['RSI'] = TA.RSI(data, 14)  # 14-period RSI

# Calculate Bollinger Bands correctly
bb_bands = TA.BBANDS(data, 20, 2)

# Assign Bollinger Bands values to DataFrame columns
data['BB_UPPER'] = bb_bands['BB_UPPER']
data['BB_MIDDLE'] = bb_bands['BB_MIDDLE']
data['BB_LOWER'] = bb_bands['BB_LOWER']

# Convert index to datetime
data.index = pd.to_datetime(data.index)

# Display the calculated technical indicators
data.tail()


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Fear_index,SPY_index,FEDFUNDS,Target,MA,RSI,BB_UPPER,BB_MIDDLE,BB_LOWER
2024-03-18,237.580002,238.830002,237.279999,238.660004,238.660004,194100.0,14.33,512.859985,5.33,252.410004,238.452,57.155161,252.187283,238.452,224.716716
2024-03-19,242.679993,245.089996,242.410004,244.320007,244.320007,347400.0,13.82,515.710022,5.33,253.720001,239.3295,62.009328,252.133199,239.3295,226.5258
2024-03-20,244.679993,246.800003,244.309998,246.720001,246.720001,214900.0,13.04,520.47998,5.33,252.149994,240.311,63.878136,252.136371,240.311,228.485629
2024-03-21,251.449997,251.789993,250.259995,251.479996,251.479996,245500.0,12.92,522.200012,5.33,251.679993,241.1625,67.312529,253.645172,241.1625,228.679828
2024-03-22,255.0,255.229996,253.589996,254.770004,254.770004,285400.0,13.06,521.210022,5.33,242.149994,242.151,69.472956,255.667277,242.151,228.634723


In [78]:
# Define features and target
X = concatenated_df.drop("Close", axis=1)
y = concatenated_df["Close"]

In [79]:
data.drop(columns=['Open', 'High', 'Low', 'Close', 'Adj Close'], inplace=True)


In [80]:
# Display the modified DataFrame
data.head()

Unnamed: 0,Volume,Fear_index,SPY_index,FEDFUNDS,Target,MA,RSI,BB_UPPER,BB_MIDDLE,BB_LOWER
2014-04-30,467700.0,13.41,188.309998,0.09,109.029999,,,,,
2014-05-01,340900.0,13.25,188.330002,0.09,108.150002,,100.0,,,
2014-05-02,420500.0,12.91,188.059998,0.09,108.870003,,100.0,,,
2014-05-05,274800.0,13.29,188.419998,0.09,109.209999,,72.857125,,,
2014-05-06,429400.0,13.8,186.779999,0.09,111.379997,,57.7094,,,


In [81]:
data_clean = data.dropna()
data_clean.to_csv('../clean_data/GM_prepared_data.csv', index=True)

In [82]:
data_clean.head()

Unnamed: 0,Volume,Fear_index,SPY_index,FEDFUNDS,Target,MA,RSI,BB_UPPER,BB_MIDDLE,BB_LOWER
2014-05-28,259100.0,11.68,191.380005,0.09,114.480003,109.3095,55.911969,111.065231,109.3095,107.55377
2014-05-29,348500.0,11.57,192.369995,0.09,114.589996,109.463,64.453519,111.416912,109.463,107.509089
2014-05-30,449700.0,11.4,192.679993,0.09,115.150002,109.632,70.717723,112.178102,109.632,107.085898
2014-06-02,660400.0,11.58,192.899994,0.1,115.07,109.833,73.341083,113.029735,109.833,106.636265
2014-06-03,593400.0,11.87,192.800003,0.1,113.870003,110.107,76.013474,114.020872,110.107,106.193128


In [83]:
# Define date cutoff for data split
date_cutoff = "2022-04-30"

# Split data
X_train = X[X.index <= date_cutoff]
X_test = X[X.index > date_cutoff]
y_train = y[y.index <= date_cutoff]
y_test = y[y.index > date_cutoff]

In [84]:
# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [85]:
# Define the number of folds
k = 10

# Initialize lists to store R-squared scores
train_r2_scores = []
test_r2_scores = []

# Initialize KFold
kf = KFold(n_splits=k, shuffle=True)

# Define the model architecture
model = Sequential([
    Dense(units=16, activation='relu', kernel_regularizer=l2(0.0005), input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.6),
    Dense(units=8, activation='relu', kernel_regularizer=l2(0.0005)),
    Dropout(0.6),
    Dense(units=1)
])

# Example: Train with a smaller learning rate
from keras.optimizers import Adam
adam = Adam(learning_rate=0.0001)  # Adjust learning rate as needed
model.compile(optimizer=adam, loss='mean_squared_error')
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# Compile the model
model.compile(optimizer=adam, loss='mean_squared_error')

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X_train_scaled):
    X_train_cv, X_test_cv = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
    
    
    # Evaluate the model on training data
    train_predictions = model.predict(X_train_cv)
    train_r2 = r2_score(y_train_cv, train_predictions)
    train_r2_scores.append(train_r2)
    
    # Evaluate the model on test data
    test_predictions = model.predict(X_test_cv)
    test_r2 = r2_score(y_test_cv, test_predictions)
    test_r2_scores.append(test_r2)

# Calculate average R-squared scores
avg_train_r2 = np.mean(train_r2_scores)
avg_test_r2 = np.mean(test_r2_scores)

print("Average R-squared (Train):", avg_train_r2)
print("Average R-squared (Test):", avg_test_r2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 601us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 537us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 352us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 523us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 346us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 618us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 585us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 344us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 612us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 355us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 540us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 359us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━

In [86]:
from keras.models import clone_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize lists to store metrics for selected models
selected_train_r2 = []
selected_test_r2 = []
selected_train_mae = []
selected_train_mse = []
selected_test_mae = []
selected_test_mse = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X_train_scaled):
    X_train_cv, X_test_cv = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
    
    # Define a new model for each fold
    model_fold = clone_model(model)
    
    # Compile the model
    model_fold.compile(optimizer='adam', loss='mean_squared_error')
    
    # Set weights from the original model
    model_fold.set_weights(model.get_weights())
    
    # Train the model
    model_fold.fit(X_train_cv, y_train_cv, epochs=50, batch_size=32, verbose=0)
    
    # Evaluate the model on training data
    train_predictions = model_fold.predict(X_train_cv)
    train_r2 = r2_score(y_train_cv, train_predictions)
    train_mae = mean_absolute_error(y_train_cv, train_predictions)
    train_mse = mean_squared_error(y_train_cv, train_predictions)
    
    # Evaluate the model on test data
    test_predictions = model_fold.predict(X_test_cv)
    test_r2 = r2_score(y_test_cv, test_predictions)
    test_mae = mean_absolute_error(y_test_cv, test_predictions)
    test_mse = mean_squared_error(y_test_cv, test_predictions)
    
    # Append metrics to the lists
    selected_train_r2.append(train_r2)
    selected_test_r2.append(test_r2)
    selected_train_mae.append(train_mae)
    selected_train_mse.append(train_mse)
    selected_test_mae.append(test_mae)
    selected_test_mse.append(test_mse)

# Print metrics for selected models
for idx, (train_r2, test_r2, train_mae, train_mse, test_mae, test_mse) in enumerate(zip(selected_train_r2, selected_test_r2, selected_train_mae, selected_train_mse, selected_test_mae, selected_test_mse), start=1):
    print(f"Model {idx} - Train R-squared: {train_r2}, Test R-squared: {test_r2}, Train MAE: {train_mae}, Train MSE: {train_mse}, Test MAE: {test_mae}, Test MSE: {test_mse}")


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 605us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 494us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 600us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 483us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 614us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 536us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 578us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 488us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 625us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 559us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 747us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 491us/step
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 604us/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━

In [87]:
# Initialize variables to store the best model's index and its performance metric
best_model_index = None
best_test_r_squared = float('-inf')

# Iterate over each model's metrics to find the best model
for idx, (train_r_squared, test_r_squared, train_mae, train_mse, test_mae, test_mse) in enumerate(zip(selected_train_r2, selected_test_r2, selected_train_mae, selected_train_mse, selected_test_mae, selected_test_mse), start=1):
    if test_r_squared > best_test_r_squared:
        best_test_r_squared = test_r_squared
        best_model_index = idx

# Check if a best model is found
if best_model_index is not None:
    # Print the details of the best model
    print(f"Best Model - Train R-squared: {selected_train_r2[best_model_index - 1]}, Test R-squared: {selected_test_r2[best_model_index - 1]}")
    print(f"Train MAE: {selected_train_mae[best_model_index - 1]}, Train MSE: {selected_train_mse[best_model_index - 1]}")
    print(f"Test MAE: {selected_test_mae[best_model_index - 1]}, Test MSE: {selected_test_mse[best_model_index - 1]}")
else:
    print("No best model found.")


Best Model - Train R-squared: -2.009494147865413, Test R-squared: -1.5877935966630647
Train MAE: 35.716130479314536, Train MSE: 1360.1046531137035
Test MAE: 37.08590554470674, Test MSE: 1473.6965248309148


In [88]:
# Predict on the entire dataset using the best model
all_predictions = best_model.predict(np.concatenate((X_train_scaled, X_test_scaled), axis=0))

# Create a DataFrame to store the actual and predicted values
predictions_df = pd.DataFrame(index=concatenated_df.index, columns=['Actual', 'Predicted'])

# Fill in the actual values
predictions_df['Actual'] = concatenated_df['Target']

# Fill in the predicted values
predictions_df['Predicted'] = all_predictions

# Display the DataFrame
predictions_df


NameError: name 'best_model' is not defined

In [None]:
# Check for NaN values in the predicted values
any_nan_predicted = predictions_df['Predicted'].isna().any()

# Print the result
if any_nan_predicted:
    print("There are NaN values in the predicted values.")
else:
    print("There are no NaN values in the predicted values.")
