In [7]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.base import BaseEstimator, RegressorMixin

In [8]:
# Load SPY.csv (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv('SPY.csv')
# Print the column names of the loaded frame.
print(df.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Day', 'Weekday',
       'Week', 'Month', 'Year'],
      dtype='object')


In [9]:
# Columns used as model inputs; the remaining columns of df are not fed to the model.
feature_columns = ["Open", "High", "Low", "Close", "Volume"]
# Date column kept as a separate Series; it is not part of feature_columns.
dates = df["Date"]

# Fail early, with a clear message, if the CSV lacks any expected feature column.
# (A guard written as `"Open" and "High" and ... and "Volume" in df.columns`
# would only test "Volume", because non-empty string literals are truthy.)
missing_columns = [col for col in feature_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"SPY.csv is missing expected columns: {missing_columns}")


In [10]:
X = df

# Standardize each feature column (RobustScaler is imported as a possible alternative).
# NOTE(review): the scaler is fitted on every row of X, including rows that the
# later train/validation/test split assigns to validation and test.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[feature_columns])
X_scaled = pd.DataFrame(scaled_values, index=X.index, columns=feature_columns)

In [11]:
# Split into train / validation / test: 20% test, then 25% of the remainder for validation.
# NOTE(review): train_test_split shuffles rows by default, so the row order of
# X_scaled is not preserved across the splits — confirm this is intended.
X_train_val, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)
X_train, X_val = train_test_split(X_train_val, test_size=0.25, random_state=42)

# Replace each DataFrame split with its underlying NumPy array.
X_train, X_val, X_test = (split.values for split in (X_train, X_val, X_test))

# float32 tensors, one per split.
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(array, dtype=torch.float32) for array in (X_train, X_val, X_test)
)

# Autoencoder datasets: the input tensor doubles as the target.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(tensor, tensor)
    for tensor in (X_train_tensor, X_val_tensor, X_test_tensor)
)

# Only the training loader shuffles its batches.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [23]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
# The models and tensors created further down in this cell are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class Encoder(nn.Module):
    """Feed-forward encoder mapping ``input_dim`` -> 128 -> 64 -> ``encoding_dim``.

    Each of the two hidden layers is followed by ReLU and Dropout(0.2); the
    final projection to ``encoding_dim`` has no activation.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, encoding_dim),
        ]
        # Attribute name and layer order fix the state-dict keys
        # (encoder.0.*, encoder.3.*, encoder.6.*) used by saved checkpoints.
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        """Return the latent representation of ``x``."""
        latent = self.encoder(x)
        return latent

# Decoder: mirrors the encoder's layer widths in reverse order
class Decoder(nn.Module):
    """Feed-forward decoder mapping ``encoding_dim`` -> 64 -> 128 -> ``input_dim``.

    Each of the two hidden layers is followed by ReLU and Dropout(0.2); the
    final projection back to ``input_dim`` has no activation.
    """

    def __init__(self, encoding_dim, input_dim):
        super().__init__()
        layers = [
            nn.Linear(encoding_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, input_dim),
        ]
        # Attribute name and layer order fix the state-dict keys
        # (decoder.0.*, decoder.3.*, decoder.6.*) used by saved checkpoints.
        self.decoder = nn.Sequential(*layers)

    def forward(self, x):
        """Return the reconstruction produced from the latent code ``x``."""
        reconstruction = self.decoder(x)
        return reconstruction

    
# Define input dimensions (number of columns in the training array)
input_dim = X_train.shape[1]

# Model file paths
encoder_model_file = './models/encoder.pth'
decoder_model_file = './models/decoder.pth'

# Hyperparameters
# NOTE(review): encoding_dim (8) appears to be larger than the number of feature
# columns (5), which would make the autoencoder overcomplete — confirm intended.
encoding_dim = 8
learning_rate = 0.001
epochs = 50
# Batch size for the loaders built in this block; the train/val/test loaders
# created earlier in the notebook use 16 and are not used here.
batch_size = 8
patience = 5  # epochs without validation improvement before stopping

# Both branches start from freshly constructed modules on the target device.
encoder = Encoder(input_dim, encoding_dim).to(device)
decoder = Decoder(encoding_dim, input_dim).to(device)

# Reuse saved weights when both checkpoints exist; otherwise train and save.
if os.path.exists(encoder_model_file) and os.path.exists(decoder_model_file):
    print("Model files exist. Loading the models...")
    # map_location makes a checkpoint saved on one device (e.g. CUDA) loadable
    # on whatever device is selected for this run (e.g. CPU-only machine).
    encoder.load_state_dict(torch.load(encoder_model_file, map_location=device))
    decoder.load_state_dict(torch.load(decoder_model_file, map_location=device))
else:
    print("Model files do not exist. Training new models...")

    optimizer_encoder = optim.Adam(encoder.parameters(), lr=learning_rate)
    optimizer_decoder = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # Build the device tensors once instead of once per epoch.
    train_inputs = torch.tensor(X_train, dtype=torch.float32).to(device)
    val_inputs = torch.tensor(X_val, dtype=torch.float32).to(device)

    # Track training loss
    encoder_losses = []
    decoder_losses = []

    # ---- Phase 1: train the encoder ------------------------------------
    # Only optimizer_encoder steps here, so the decoder keeps its initial
    # weights and merely provides the reconstruction used by the loss.
    dataloader = DataLoader(
        TensorDataset(train_inputs, train_inputs),
        batch_size=batch_size, shuffle=True, num_workers=0,
    )
    best_val_loss = float('inf')
    best_encoder_state = None
    patience_counter = 0
    for epoch in range(epochs):
        # The validation step below switches both modules to eval(); switch
        # back so dropout stays active in every training epoch, not just the first.
        encoder.train()
        decoder.train()
        epoch_loss = 0
        for batch_X, _ in dataloader:
            optimizer_encoder.zero_grad()
            decoded = decoder(encoder(batch_X))
            # Reconstruction loss against the original input
            loss = criterion(decoded, batch_X)
            loss.backward()
            optimizer_encoder.step()
            epoch_loss += loss.item()
        avg_loss = epoch_loss / len(dataloader)
        encoder_losses.append(avg_loss)
        print(f"Epoch [{epoch+1}/{epochs}], Encoder Loss: {avg_loss:.4f}")

        # Validation loss on the held-out validation split
        encoder.eval()
        decoder.eval()
        with torch.no_grad():
            val_decoded = decoder(encoder(val_inputs))
            val_loss = criterion(val_decoded, val_inputs).item()
        print(f"Validation Loss: {val_loss:.4f}")

        # Early stopping, remembering the best encoder weights seen so far
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_encoder_state = {
                name: tensor.detach().clone()
                for name, tensor in encoder.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            print("Early stopping counter:", patience_counter)
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    # Continue from the best validation checkpoint rather than the last epoch.
    if best_encoder_state is not None:
        encoder.load_state_dict(best_encoder_state)

    # ---- Phase 2: train the decoder on fixed encoder outputs -----------
    # Encode both splits once with dropout disabled and no autograd graph.
    encoder.eval()
    with torch.no_grad():
        encoded_train = encoder(train_inputs)
        encoded_val = encoder(val_inputs)

    dataloader = DataLoader(
        TensorDataset(encoded_train, train_inputs),
        batch_size=batch_size, shuffle=True, num_workers=0,
    )
    best_val_loss = float('inf')
    best_decoder_state = None
    patience_counter = 0
    for epoch in range(epochs):
        # Re-enable dropout after the previous epoch's validation pass.
        decoder.train()
        epoch_loss = 0
        for batch_X, batch_y in dataloader:
            optimizer_decoder.zero_grad()
            outputs = decoder(batch_X)
            # Reconstruction loss against the original input
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer_decoder.step()
            epoch_loss += loss.item()
        avg_loss = epoch_loss / len(dataloader)
        decoder_losses.append(avg_loss)
        print(f"Epoch [{epoch+1}/{epochs}], Decoder Loss: {avg_loss:.4f}")

        # Validation loss on the held-out validation split (not the training
        # data), so early stopping reacts to generalization, not memorization.
        decoder.eval()
        with torch.no_grad():
            val_outputs = decoder(encoded_val)
            val_loss = criterion(val_outputs, val_inputs).item()
        print(f"Validation Loss: {val_loss:.4f}")

        # Early stopping, remembering the best decoder weights seen so far
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_decoder_state = {
                name: tensor.detach().clone()
                for name, tensor in decoder.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            print("Early stopping counter:", patience_counter)
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    if best_decoder_state is not None:
        decoder.load_state_dict(best_decoder_state)

    # Save the trained models, creating the target directories if needed
    for model_file in (encoder_model_file, decoder_model_file):
        os.makedirs(os.path.dirname(model_file) or '.', exist_ok=True)
    torch.save(encoder.state_dict(), encoder_model_file)
    torch.save(decoder.state_dict(), decoder_model_file)
    print("Models trained and saved.")

# Autoencoder = encoder followed by decoder
class Autoencoder(nn.Module):
    """Wrap an encoder and a decoder module into a single reconstruction model."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x):
        """Encode ``x`` and return the decoder's reconstruction of it."""
        return self.decoder(self.encoder(x))

autoencoder = Autoencoder(encoder, decoder).to(device)
autoencoder.eval()

# Reconstruct the test split without tracking gradients
with torch.no_grad():
    test_inputs = torch.tensor(X_test, dtype=torch.float32).to(device)
    reconstructed_test = autoencoder(test_inputs).cpu().numpy()

# Element-wise reconstruction residuals, shared by the metrics below
residuals = X_test - reconstructed_test

# Mean Squared Error (MSE) over all test elements
mse = np.mean(residuals ** 2)
print(f'Mean Squared Error (MSE): {mse}')

# Mean Absolute Error (MAE) over all test elements
mae = np.mean(np.abs(residuals))
print(f'Mean Absolute Error (MAE): {mae}')

# Per-row mean absolute percentage error (the 1e-8 guards against division by zero).
# NOTE(review): X_test holds standardized values, so |X_test| can be close to
# zero and those entries can dominate this percentage — consider computing it
# in original units (e.g. via scaler.inverse_transform) if that is not intended.
reconstruction_error_percentage = (
    np.mean(np.abs(residuals) / (np.abs(X_test) + 1e-8), axis=1) * 100
)
reconstruction_error_percentage_mean = np.mean(reconstruction_error_percentage)
print(f'Reconstruction Error Percentage: {reconstruction_error_percentage_mean}')

def calculate_index_error_percentage(index, X, reconstructed_X):
    """Print the mean absolute percentage reconstruction error of entry ``index``.

    ``X[index]`` is compared element-wise with ``reconstructed_X[index]``; a
    small constant (1e-8) in the denominator guards against division by zero.
    Nothing is returned.
    """
    actual = X[index]
    relative_error = np.abs(actual - reconstructed_X[index]) / (np.abs(actual) + 1e-8)
    print(np.mean(relative_error) * 100)


Using device: cuda
Model files exist. Loading the models...
Mean Squared Error (MSE): 0.0004041298542706522
Mean Absolute Error (MAE): 0.013752187466917169
Reconstruction Error Percentage: 8.598396953424091


In [30]:
# Count the test rows whose percentage reconstruction error is above the test-set mean
np.sum(reconstruction_error_percentage > reconstruction_error_percentage_mean)

143