# Retrieval Augmented Prediction Model

This Model, specifically created to make Stock Predictions for upcoming Businesses, means this model predicts the market startup of any new business idea.


In [84]:
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from RetrievalSystem.RetrievalSystem import RetrievalSystem
from PredictionModel.AttentionModel.AttentionModel import AttentionModel
import pandas as pd
from PredictionModel.Layers.Layers import SimilarityLayer, IdeaLayer, StaticFeatureLayer, HistoricalFeatureLayer, FirstFusionLayer, SecondFusionLayer, OutputLayer

INPUT_PATH = "../RetrievalSystem/Embeddings/embeddings.csv"

BERT_DIM = 768

class RetrievalAugmentedPredictionModel(nn.Module):
    def __init__(self, hidden_dim: int = 128, ret_sys: RetrievalSystem = None, static_dim = 34, historical_dim = 72, forecast_steps: int = 6, retrieval_number: int = 16):
        super(RetrievalAugmentedPredictionModel, self).__init__()

        self.forecast_steps = forecast_steps
        self.static_feature_dim = static_dim
        self.historical_feature_dim = historical_dim
        self.historical_idea_dim = historical_dim - forecast_steps
        self.retrieval_number = retrieval_number

        if ret_sys:
            self.retrieval_system = ret_sys
        else:
            self.retrieval_system = RetrievalSystem(INPUT_PATH, retrieval_number)

        # 16 * 768 -> 768 + 16
        self.attention_model = AttentionModel(input_dim=BERT_DIM, hidden_dim=hidden_dim)
        # 16 -> 32
        self.similarity_fc = SimilarityLayer(retrieval_number=retrieval_number)
        # Use Same bert model as for original embeddings
        # 768 -> 4 * 128 -> 128
        self.idea_fc = IdeaLayer(bert_dim=BERT_DIM, hidden_dim=hidden_dim)

        # Static feature layers (deep)
        # 34 * 16 -> 34 * 8 -> 256
        self.static_fc = StaticFeatureLayer(retrieval_number=retrieval_number,hidden_dim=hidden_dim, static_feature_dim=self.static_feature_dim)

        # Historical stock data layers (deep)
        # 72 * 16-> 72 * 8 -> 72 * 8 -> 512
        self.historical_fc = HistoricalFeatureLayer(retrieval_number=retrieval_number, hidden_dim=hidden_dim, historical_feature_dim=self.historical_feature_dim)

        # 34 -> 32
        self.idea_static_fc = nn.Linear(self.static_feature_dim, 32)
        # 72 -> 64
        self.idea_historical_fc = nn.Linear(self.historical_idea_dim, hidden_dim//2)

        # First Fustion Layer, combines:
        # 1. AttentionModel Output -> 768
        # 1.a Attention Scores -> retrievel_numbre (16)
        # 2. Combined Static Layer Output -> 256
        # 2. Combined Static Layer Output -> 512
        # 4. Cosine Simularity Layer -> 32
        # combined = 1184 -> 1024 -> 512
        self.first_fusion_fc = FirstFusionLayer(bert_dim=BERT_DIM, hidden_dim=hidden_dim, retrieval_number=retrieval_number)

        # Attention layer after first fusion
        self.fusion_attention = nn.MultiheadAttention(embed_dim=4 * hidden_dim, num_heads=4, batch_first=True)

        # Second Fusion Layer, combines:
        # 1. Previous Fusion Layer Output: 512
        # 2. Idea Embedding: 256 (ouput of idea layer)
        # 3. Idea Static: 32
        # 4. Idea Historical: 64
        # combined = 992 -> 1024
        self.second_fusion_fc = SecondFusionLayer(hidden_dim=hidden_dim)

        # Second fusion
        self.second_fusion_attention = nn.MultiheadAttention(embed_dim=4 * hidden_dim, num_heads=4, batch_first=True)

        # Multi-layer LSTM with residual connection
        self.lstm = nn.LSTM(4 * hidden_dim, 2 * hidden_dim, num_layers=10, batch_first=True, dropout=0.2)

        # Attention mechanism
        self.attention = nn.MultiheadAttention(embed_dim=2 * hidden_dim, num_heads=4, batch_first=True)

        # Output layer for forecasting
        self.output_fc = OutputLayer(hidden_dim=hidden_dim)


    def forward(self, ideas: list, dataset: pd.DataFrame = None, static_features=None, historical_data=None, use_auxiliary_inputs=True, excluded_tickers=None):
        # Ensure device compatibility
        if excluded_tickers is None:
            excluded_tickers = []
        if dataset is None:
            print("We need a dataset for retrieval")
            return None

        device = next(self.parameters()).device

        # --- Retrieval Model ---
        idea_embeddings = []
        retrieved_embeddings = []
        retrieved_similarities = []
        retrieved_tickers = []
        for text in ideas:  # Iterate over the batch
            embedding, documents = self.retrieval_system.find_similar_entries(
                text=text, top_n=self.retrieval_number, excluded_tickers=excluded_tickers
            )
            idea_embeddings.append(embedding)
            retrieved_embeddings.append(torch.tensor(documents['embedding'].tolist(), dtype=torch.float32))
            retrieved_similarities.append(torch.tensor(documents['similarity'].tolist(), dtype=torch.float32))
            retrieved_tickers.append(documents['tickers'].values)

        # Convert lists to tensors
        idea_embeddings = torch.tensor(idea_embeddings, dtype=torch.float32).to(device)  # [batch_size, embedding_dim]
        idea_embeddings = idea_embeddings.squeeze(1)

        retrieved_idea_embeddings = torch.stack(retrieved_embeddings).to(device)  # [batch_size, num_retrieved, embedding_dim]
        retrieved_similarities = torch.stack(retrieved_similarities).to(device)  # [batch_size, num_retrieved]

        # --- Preparing Inputs for Layer ---

        print("Retrieved tickers: ", retrieved_tickers)
        dataset = dataset.set_index("tickers")

        # Filter rows from the dataset and extract numeric data
        filtered_data = []
        for i in range(len(retrieved_tickers)):
            tickers = retrieved_tickers[i]
            filtered = dataset[dataset.index.isin(tickers)]

            # Select numeric columns only
            numeric_data = filtered.select_dtypes(include=['number']).apply(pd.to_numeric, errors='coerce').fillna(0).values
            filtered_data.append(numeric_data)

        # Convert filtered data into a batch tensor with padding
        filtered_data = [torch.tensor(row, dtype=torch.float32) for row in filtered_data]
        filtered_data = pad_sequence(filtered_data, batch_first=True).to(device)  # [batch_size, padded_length, numeric_dim]
        print("We have these retrieved documents: ", filtered_data.shape)

        # Define static and month columns
        static_columns = [
            col for col in dataset.columns
            if col not in ["tickers", "business_description"] and not col.startswith("month")
        ]
        month_columns = [col for col in dataset.columns if col.startswith("month")]

        # Extract static and month vectors for each batch
        static_vectors = []
        month_vectors = []

        for i in range(len(retrieved_tickers)):
            tickers = retrieved_tickers[i]
            filtered = dataset[dataset.index.isin(tickers)]

            # Extract static data
            static_data = filtered[static_columns].apply(pd.to_numeric, errors='coerce').fillna(0).values
            static_vectors.append(static_data.flatten())  # Flatten to handle batch processing

            # Extract month data
            month_data = filtered[month_columns].apply(pd.to_numeric, errors='coerce').fillna(0).values
            month_vectors.append(month_data.flatten())  # Flatten to handle batch processing

        # Convert to tensors
        combined_static_tensor = torch.tensor(static_vectors, dtype=torch.float32).to(device)  # [batch_size, static_dim]
        combined_historical_tensor = torch.tensor(month_vectors, dtype=torch.float32).to(device)  # [batch_size, historical_dim]

        print(f"Static Tensor Shape: {combined_static_tensor.shape}, Historical Tensor Shape: {combined_historical_tensor.shape}")

        # --- AttentionModel, IdeaInput, 1.FusionLayer ---
        # Put retrieved documents into appropriate input layers
        weighted_sum, attention_weights = self.attention_model(retrieved_idea_embeddings)
        attention_weights = attention_weights.view(attention_weights.size(0), -1, 1)  # Retain batch size
        print(f"Shape of weighted_sum: {weighted_sum.shape}, attention_weights: {attention_weights.shape}")

        similarity_output = self.similarity_fc(retrieved_similarities)  # [batch_size, feature_dim]
        combined_static_output = self.static_fc(combined_static_tensor)  # [batch_size, feature_dim]
        combined_historical_output = self.historical_fc(combined_historical_tensor)  # [batch_size, feature_dim]

        print(f"Shape of static_output: {combined_static_output.shape}, similarity: {similarity_output.shape}, historical: {combined_historical_output.shape}")

        # Ensure attention_weights matches the batch size
        attention_weights = attention_weights.squeeze(-1)  # Remove the last dimension if not needed
        print(f"Shapes: weighted_sum: {weighted_sum.shape}, attention_weights: {attention_weights.shape}, combined_static_output: {combined_static_output.shape}, combined_historical: {combined_historical_output.shape}, similarity: {similarity_output.shape}")

        # Concatenate along the last dimension
        combined_retrieval_input = torch.cat((
            weighted_sum, attention_weights, combined_static_output, combined_historical_output, similarity_output
        ), dim=-1)  # Concatenation along the last dimension
        print(f"Shape of combined_retrieval_input: {combined_retrieval_input.shape}")

        first_fusion_output = self.first_fusion_fc(combined_retrieval_input)

        # Attention layer
        first_fusion_attention_output, _ = self.fusion_attention(first_fusion_output, first_fusion_output, first_fusion_output)

        # Put new ideas data into input layers
        idea_output = self.idea_fc(idea_embeddings)

        batch_size = idea_embeddings.size(0)
        if use_auxiliary_inputs:
            print("Using auxiliary inputs")
            static_tensor = static_features.clone().to(device)
            historical_tensor = historical_data.clone().to(device)
        else:
            print("Not using auxiliary inputs")
            static_tensor = torch.zeros((batch_size, self.static_feature_dim), dtype=torch.float32).to(device)
            historical_tensor = torch.zeros((batch_size, self.historical_idea_dim), dtype=torch.float32).to(device)

        static_output = self.idea_static_fc(static_tensor) # This wont change within the autoregressiv prediction

        # --- Autoregressive prediction ---
        predictions = []
        for step in range(self.forecast_steps):
            historical_output = self.idea_historical_fc(historical_tensor)

            # 2. FUSION LAYER - Fuse combined retrieval documents and new idea together
            print(f"Shapes of static_output: {static_output.shape}, historical_output: {historical_output.shape}, idea: {idea_output.shape}, attention_output: {first_fusion_attention_output.shape}")

            combined_idea_input = torch.cat((first_fusion_attention_output, idea_output, static_output, historical_output), dim=1)
            second_fusion_output = self.second_fusion_fc(combined_idea_input)

            # Attention layer
            second_fusion_attention_output, _ = self.second_fusion_attention(second_fusion_output, second_fusion_output, second_fusion_output)

            # LSTM
            lstm_output, _ = self.lstm(second_fusion_attention_output.unsqueeze(1))  # Add sequence dimension

            # Attention
            lstm_attention_output, _ = self.attention(lstm_output, lstm_output, lstm_output)

            # OUTPUT
            final_prediction = self.output_fc(lstm_attention_output.squeeze(1))  # Remove sequence dimension
            print(f"Final prediction: {final_prediction} and shape: {final_prediction.shape}")

            # Append to predictions
            predictions.append(final_prediction)

            # Update historical tensor for next step
            print(f"Final prediction: {final_prediction.shape}, historical tensor: {historical_tensor.shape}")
            historical_tensor = torch.cat((historical_tensor[:, 1:], final_prediction), dim=1)
            print(f"Resulting historical tensor shape: {historical_tensor.shape}")

        # Stack predictions into a single tensor
        print(f"Prediction before doing operations: {predictions}")
        predictions = torch.cat(predictions, dim=1)  # Shape: [1, forecast_steps, 1]
        predictions = predictions.squeeze().detach().cpu().numpy()  # Remove the last dimension, Shape: [1, forecast_steps]
        return predictions



### Example usage
Here is an example of how to use our newly created model:

In [85]:
import torch
# Initialize the model - HAVE TO BE ADAPTED TO DATASET (Values are likely correct)
static_feature_dim_num = 4    # Number of static features
historical_dim_num = 12       # Number of historical stock performance points
hidden_dim_num = 128          # Hidden layer size
forecast_steps_num = 12       # Predict next 12 months

batch_size = 9

DATASET_PATH = "../Dataset/Data/normalized_real_company_stock_dataset_large.csv"
dataset = pd.read_csv(DATASET_PATH)

retrieval_system = RetrievalSystem(INPUT_PATH, retrieval_number=16)

model = RetrievalAugmentedPredictionModel(
    forecast_steps=forecast_steps_num,
    ret_sys = retrieval_system,
    retrieval_number=16
)

current_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Random entry
idea_entries = dataset.iloc[4000:4000 + batch_size, :]  # Get a batch of rows

# Extract business descriptions (ideas)
ideas = idea_entries["business_description"].tolist()

static_columns = [
    col for col in dataset.columns
    if col not in ["tickers", "business_description"] and not col.startswith("month")
]
month_columns = [col for col in dataset.columns if col.startswith("month")]

# Prepare static and historical data for the batch
static_data = idea_entries[static_columns]
historical_data = idea_entries[month_columns]

# Ensure numeric data and handle missing values
static_data = static_data.apply(pd.to_numeric, errors='coerce').fillna(0).values.astype(float)
historical_data = historical_data.apply(pd.to_numeric, errors='coerce').fillna(0).values.astype(float)

# Convert to tensors with batch dimension
static_data = torch.tensor(static_data, dtype=torch.float32).to(current_device)  # [batch_size, static_feature_dim_num]
historical_data = torch.tensor(historical_data[:, -len(month_columns):-forecast_steps_num], dtype=torch.float32).to(current_device)  # [batch_size, historical_dim_num]

# Make a prediction
prediction = model(
    ideas=ideas,
    dataset=dataset,
    static_features=static_data,
    historical_data=historical_data,
    use_auxiliary_inputs=True
)
print(prediction)  # Co
print(prediction.shape)


# Make a prediction
prediction = model(
    ideas=ideas,
    dataset=dataset,
    use_auxiliary_inputs=False
)
print(prediction)  # Co
print(prediction.shape)



Retrieved tickers:  [array(['OTTR', 'XEL', 'LNT', 'PCG', 'RRX', 'AGX', 'CNP', 'VMI', 'DTE',
       'AEE', 'POWL', 'SO', 'CETY', 'FELE', 'PKX', 'NRG'], dtype=object), array(['OUST', 'ROK', 'INVZ', 'LTRX', 'AEVA', 'STM', 'INDI', 'ADI', 'AVY',
       'BSY', 'AMBA', 'APTV', 'ALGM', 'VC', 'KARO', 'MVIS'], dtype=object), array(['OUT', 'CCO', 'KD', 'APPS', 'EXTR', 'EH', 'ILLR', 'BNZI', 'ICLK',
       'APP', 'SPHR', 'IPG', 'OB', 'DAVA', 'ITI', 'TTGT'], dtype=object), array(['OVBC', 'CARE', 'PFS', 'FULT', 'FULTP', 'UBSI', 'RRBI', 'FBMS',
       'SHBI', 'TOWN', 'BOTJ', 'VABK', 'PNC', 'BRBS', 'BPRN', 'FBNC'],
      dtype=object), array(['OVID', 'PRAX', 'PTCT', 'APLT', 'MRNS', 'BMRN', 'ANNX', 'XFOR',
       'NBIX', 'CMRX', 'TGTX', 'APLS', 'MIRM', 'SEEL', 'PHAR', 'IONS'],
      dtype=object), array(['OVLY', 'TCBX', 'SSBI', 'UNTY', 'STEL', 'PCB', 'HTLF', 'HTLFP',
       'FBNC', 'FRST', 'VBFC', 'PNBK', 'EWBC', 'HTBK', 'WTBA', 'VBTX'],
      dtype=object), array(['OVV', 'EPSN', 'VRN', 'BTE', 'COP', 'M

### Simple Training Loop

In [86]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Load and preprocess the dataset
DATASET_PATH = "../Dataset/Data/normalized_real_company_stock_dataset_large.csv"
dataset = pd.read_csv(DATASET_PATH)

# Initialize the retrieval system and model
retrieval_system = RetrievalSystem(INPUT_PATH, retrieval_number=16)
model = RetrievalAugmentedPredictionModel(
    forecast_steps=forecast_steps_num,
    ret_sys=retrieval_system,
    static_dim=34,
    historical_dim=72, # THIS IS IMPORTANT
    retrieval_number=16
)
model.to(current_device)

# Loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop
epochs = 1
for epoch in range(epochs):
    total_loss = 0.0

    for i in range(len(dataset)):

        if i > 10:
            break
        # Get the current entry
        idea_entry = dataset.iloc[i, :]
        idea = idea_entry["business_description"]

        ticker = idea_entry["tickers"]
        print(f"Currently testing with ticker {ticker}")

        # Prepare static and historical data
        static_columns = [
            col for col in dataset.columns
            if col not in ["tickers", "business_description"] and not col.startswith("month")
        ]
        month_columns = [col for col in dataset.columns if col.startswith("month")]

        static_data = idea_entry[static_columns]
        historical_data = idea_entry[month_columns]

        # Ensure numeric data and handle missing values
        static_data = static_data.apply(pd.to_numeric, errors='coerce').fillna(0).values.astype(float)
        historical_data = historical_data.apply(pd.to_numeric, errors='coerce').fillna(0).values.astype(float)

        # Separate the target and input historical data
        target = torch.tensor(historical_data[-forecast_steps_num:], dtype=torch.float32).unsqueeze(0).to(current_device)  # Target: last forecast_steps_num
        historical_data = historical_data[:-forecast_steps_num]  # Input: all but last forecast_steps_num

        # Convert to tensors
        static_data = torch.tensor(static_data, dtype=torch.float32).unsqueeze(0).to(current_device)
        historical_data = torch.tensor(historical_data, dtype=torch.float32).unsqueeze(0).to(current_device)
        print(f"Shapes in training loop: static: {static_data.shape}, historical data shape: {historical_data.shape}")

        # Remove the current entry from the dataset for retrieval
        filtered_dataset = dataset.drop(index=i)

        # Forward pass
        prediction = model(
            idea=idea,
            dataset=filtered_dataset,
            static_features=static_data,
            historical_data=historical_data,
            use_auxiliary_inputs=True,
            excluded_tickers=[ticker]
        )

        # Compute loss
        loss = loss_fn(prediction, target)
        total_loss += loss.item()

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print progress
        if i % 100 == 0:
            print(f"Epoch [{epoch + 1}/{epochs}], Step [{i}/{len(dataset)}], Loss: {loss.item():.4f}")

    # Print epoch summary
    print(f"Epoch [{epoch + 1}/{epochs}] completed. Total Loss: {total_loss:.4f}")


prediction = model(
    idea="I want to create a coffeshop that uses digital cups that analyze exactly whats in your coffe and how it is going to impact you.",
    dataset=dataset,
    use_auxiliary_inputs=False
)

print("Prediction after Training: ", prediction)


Currently testing with ticker A
Shapes in training loop: static: torch.Size([1, 34]), historical data shape: torch.Size([1, 60])


TypeError: RetrievalAugmentedPredictionModel.forward() got an unexpected keyword argument 'idea'