In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

# class TransformerAutoencoder(nn.Module):
#     def __init__(self, time_steps, d_model=128, nhead=4, num_layers=3, latent_dim=64, dim_feedforward=256, dropout=0.1):
#         super().__init__()
#         self.input_proj = nn.Linear(1, d_model)  # Project feature dim (1) to d_model
#         self.pos_encoder = PositionalEncoding(d_model, max_len=time_steps)

#         # Transformer Encoder
#         encoder_layer = nn.TransformerEncoderLayer(
#             d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout,
#             activation="gelu", batch_first=True, norm_first=True
#         )
#         self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

#         # Latent Space Projection
#         self.latent_proj = nn.Linear(d_model, latent_dim)

#         # Transformer Decoder
#         decoder_layer = nn.TransformerDecoderLayer(
#             d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout,
#             activation="gelu", batch_first=True, norm_first=True
#         )
#         self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

#         # Output Projection
#         self.output_proj = nn.Linear(d_model, 1)

#         # Residual Layer Norm
#         self.layer_norm = nn.LayerNorm(d_model)

#     def forward(self, x):
#         # x: [batch_size, num_features=1, time_steps]
#         x = x.permute(0, 2, 1)  # [batch_size, time_steps, num_features]
#         x = self.input_proj(x)
#         x = self.pos_encoder(x)

#         # Encoder Forward Pass
#         enc_output = self.encoder(x)
#         enc_output = self.layer_norm(enc_output)

#         # Latent Space Representation
#         latent = self.latent_proj(enc_output)

#         # Decoder Forward Pass (Reconstruction)
#         dec_output = self.decoder(enc_output, enc_output)  # Using encoded input as target for reconstruction
#         dec_output = self.layer_norm(dec_output)

#         # Output Projection
#         out = self.output_proj(dec_output)  # Shape: [batch_size, time_steps, 1]
#         out = out.permute(0, 2, 1)  # Back to [batch_size, 1, time_steps]

#         return out, latent  # Returning both the reconstruction and latent space

In [84]:
class TransformerAutoencoder(nn.Module):
    def __init__(self, time_steps, d_model=128, nhead=4, num_layers=3, latent_dim=64, dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.input_proj = nn.Linear(1, d_model)  # Project feature dim (1) to d_model
        self.pos_encoder = PositionalEncoding(d_model, max_len=time_steps)

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout,
            activation="gelu", batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Latent Space Projection (Global Pooling to get a single vector)
        self.latent_proj = nn.Linear(d_model, latent_dim)  # Reduce d_model → latent_dim
        self.reverse_proj = nn.Linear(latent_dim, d_model)
        self.latent_norm = nn.LayerNorm(latent_dim)  # Normalize latent space

        # Transformer Decoder
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout,
            activation="gelu", batch_first=True, norm_first=True
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        # Output Projection
        self.output_proj = nn.Linear(d_model, 1)

        # Ensure normalization for correct dimensions
        self.encoder_norm = nn.LayerNorm(d_model)
        self.decoder_norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """
        x: [batch_size, num_features=1, time_steps]
        """
        x = x.permute(0, 2, 1)  # Convert to [batch_size, time_steps, num_features]
        x = self.input_proj(x)  # Project input to d_model
        print(x.shape)
        x = self.pos_encoder(x)
        print(x.shape)
        # Encoder Forward Pass
        enc_output = self.encoder(x)  # Shape: [batch_size, time_steps, d_model]
        print(enc_output.shape)
        enc_output = self.encoder_norm(enc_output)  # Apply LayerNorm
        print(enc_output.shape)
        # Global Mean Pooling to get a single latent vector
        latent = enc_output.mean(dim=1, keepdim=True)  # Shape: [batch_size, 1, d_model]
        print(latent.shape)
        latent = self.latent_proj(latent)  # Shape: [batch_size, 1, latent_dim]
        print(latent.shape)
        latent = self.latent_norm(latent)  # Normalize latent space
        print(latent.shape)
        latent = self.reverse_proj(latent)
        print(latent.shape)
        # Expand latent representation back to sequence length
        repeated_latent = latent.repeat(1, x.shape[1], 1)  # Shape: [batch_size, time_steps, latent_dim]
        print(repeated_latent.shape)
        # Decoder Forward Pass
        dec_output = self.decoder(repeated_latent, enc_output)  # Shape: [batch_size, time_steps, d_model]
        print(dec_output.shape)
        dec_output = self.decoder_norm(dec_output)  # Apply LayerNorm
        print(dec_output.shape)
        # Output Projection
        out = self.output_proj(dec_output)  # Shape: [batch_size, time_steps, 1]
        out = out.permute(0, 2, 1)  # Back to [batch_size, 1, time_steps]

        return out, latent.squeeze(1)  # Returning [batch, 1, seq_len] & [batch, latent_dim]


In [64]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [65]:
rus1000 = pd.read_csv("/content/RUSSELL_1000_5y.csv")
spy500 = pd.read_csv("/content/SPY_500_5y.csv")

  rus1000 = pd.read_csv("/content/RUSSELL_1000_5y.csv")


In [66]:
X = spy500.iloc[2:,1:].dropna(axis=1).T.astype(float)
X = X / X.iloc[:,0].values.reshape(488, 1)
X_t, X_v = train_test_split(X.values, test_size=0.3, random_state=0)
X_t = X_t.reshape(X_t.shape[0], 1, X_t.shape[-1])
X_v = X_v.reshape(X_v.shape[0], 1, X_v.shape[-1])

In [87]:
model = TransformerAutoencoder(X_t.shape[1]).to(device)

In [88]:
model(torch.zeros(32,1,1000).to(device))

torch.Size([32, 1000, 128])
torch.Size([32, 1000, 128])
torch.Size([32, 1000, 128])
torch.Size([32, 1000, 128])
torch.Size([32, 1, 128])
torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
FACK
FUCK
torch.Size([32, 1, 128])
torch.Size([32, 1000, 128])
torch.Size([32, 1000, 128])
torch.Size([32, 1000, 128])


(tensor([[[0.7368, 0.6719, 0.6960,  ..., 0.8009, 0.7045, 0.7418]],
 
         [[0.7056, 0.6783, 0.7427,  ..., 0.7349, 0.7295, 0.6364]],
 
         [[0.5686, 0.7441, 0.8402,  ..., 0.7857, 0.7086, 0.6548]],
 
         ...,
 
         [[0.4908, 0.8691, 0.9567,  ..., 0.6676, 0.8680, 0.8598]],
 
         [[0.8694, 0.7959, 0.6559,  ..., 0.8610, 0.6839, 0.5189]],
 
         [[0.6788, 0.5521, 0.7811,  ..., 0.5856, 0.5474, 0.8048]]],
        device='cuda:0', grad_fn=<PermuteBackward0>),
 tensor([[-0.5396, -0.4193, -0.6710,  ..., -0.1164, -0.5929,  0.0811],
         [-0.5441, -0.4238, -0.6725,  ..., -0.1140, -0.5914,  0.0843],
         [-0.5365, -0.4217, -0.6709,  ..., -0.1152, -0.5937,  0.0880],
         ...,
         [-0.5409, -0.4249, -0.6760,  ..., -0.1258, -0.5924,  0.0881],
         [-0.5269, -0.4281, -0.6729,  ..., -0.1094, -0.5896,  0.0950],
         [-0.5364, -0.4177, -0.6704,  ..., -0.1149, -0.5875,  0.0920]],
        device='cuda:0', grad_fn=<SqueezeBackward1>))

In [19]:
train_dataset = TensorDataset(torch.tensor(X_t, dtype=torch.float32))
val_dataset = TensorDataset(torch.tensor(X_v, dtype=torch.float32))
all_dataset = TensorDataset(torch.tensor(X.values.reshape(X.shape[0], 1, X.shape[-1]), dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
all_loader = DataLoader(all_dataset, batch_size=32, shuffle=False)

In [7]:
def train_autoencoder(model, train_loader, val_loader, num_epochs=50, lr=1e-4, device="cuda"):
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    criterion = nn.MSELoss()

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for (batch, ) in train_loader:
            x = batch.to(device)  # Move batch to GPU/CPU
            optimizer.zero_grad()
            outputs, _ = model(x)  # Forward pass
            loss = criterion(outputs, x)  # Compare to original input
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()
            train_loss += loss.item()

        # Compute average training loss
        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for (batch, ) in val_loader:
                x = batch.to(device)
                outputs, _ = model(x)
                loss = criterion(outputs, x)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Print epoch losses
        print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    return model

In [8]:
# model = train_autoencoder(model, train_loader, val_loader, num_epochs=50, lr=1e-4, device=device)

In [9]:
# X_temp = next(iter(val_loader))[0].to(device)
# outputs, latent = model(X_temp)

In [10]:
# torch.nn.CosineSimilarity(dim=2)(X_temp, outputs)

In [11]:
# torch.linalg.norm(X_temp), torch.linalg.norm(outputs)

In [12]:
# torch.save(model.state_dict(), "model.pth")
model.load_state_dict(torch.load("model.pth"))

  model.load_state_dict(torch.load("model.pth"))


<All keys matched successfully>

In [13]:
# X_t.shape

In [26]:
model.eval()
latents = []
with torch.no_grad():
    for (batch, ) in all_loader:
        x = batch.to(device)
        output, latent = model(x)
        print(latent.shape)
        latents.append(latent.cpu().detach().numpy())
latents = np.concatenate(latents)

torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([32, 1257, 64])
torch.Size([8, 1257, 64])


KeyboardInterrupt: 

In [21]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [22]:
latents.shape

(488, 1257, 64)

In [23]:
pca = PCA(n_components=2)
latents_transformed = pca.fit_transform(latents)

ValueError: Found array with dim 3. PCA expected <= 2.

In [25]:
latents.shape

(488, 1257, 64)