# Hybrid Stock Prediction Model Training
In the "HybridStockPredictionModel" notebook we created a model that can be used to make efficient stock predictions for new business ideas.

First, we create the Dataset class that will be used to train the model:


# Load Libraries and Set Up Dependencies

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load and Preprocess the Dataset

In [2]:
# Load the dataset
df = pd.read_csv("../Dataset/Data/normalized_real_company_stock_dataset.csv")

# Define columns
idea_column = "business_description"
static_feature_columns = ["market_size", "investment", "team_strength"]
historical_columns = [col for col in df.columns if col.startswith("month_")]

# Define the target as stock performance for the next 6 months.
# The slice is derived from forecast_steps so the two values cannot drift apart.
forecast_steps = 6
target_columns = historical_columns[-forecast_steps:]  # Last 6 months of performance
# NOTE(review): target_columns is a subset of historical_columns, so the target values are
# also present in historical_data that is passed to the model — confirm this overlap is intended.

# Prepare your features and target (static_features stays unscaled here; only the
# per-split arrays below are standardised)
ideas = df[idea_column].values
static_features = df[static_feature_columns].values
historical_data = df[historical_columns].values
targets = df[target_columns].values

# Train-test split.  Done BEFORE scaling so that the scaler statistics are computed
# from training rows only; the split itself depends only on random_state, not on values.
ideas_train, ideas_val, static_train, static_val, hist_train, hist_val, y_train, y_val = train_test_split(
    ideas, static_features, historical_data, targets, test_size=0.2, random_state=42
)

# Scale static features: fit on the training split, then apply the same transform to validation
scaler_static = StandardScaler()
static_train = scaler_static.fit_transform(static_train)
static_val = scaler_static.transform(static_val)


# Embed the business descriptions with a pre-trained SentenceTransformer.
# SentenceTransformer is already imported in the imports cell, so it is not imported again here.
# (Original note: this model maps each idea to a vector of length 384.)
text_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Text -> numpy embeddings, training split first, then validation
ideas_train_embeddings, ideas_val_embeddings = (
    text_encoder.encode(texts, convert_to_numpy=True) for texts in (ideas_train, ideas_val)
)

# numpy -> float32 torch tensors for the embeddings
ideas_train_tensor, ideas_val_tensor = (
    torch.tensor(embedded, dtype=torch.float32)
    for embedded in (ideas_train_embeddings, ideas_val_embeddings)
)

# Same conversion for the static features and the targets
static_train_tensor, static_val_tensor, y_train_tensor, y_val_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (static_train, static_val, y_train, y_val)
)
# NOTE(review): none of the *_tensor variables above are referenced again in the visible
# part of this notebook (the Dataset is built from the raw arrays) — confirm they are needed.

# Define the PyTorch Dataset

In [3]:
class StockDataset(Dataset):
    """Dataset pairing each business idea with its numeric features and targets.

    Args:
        ideas: Indexable, sized sequence of idea texts; stored as-is (not converted to a tensor).
        static_features: Array-like, converted to a float32 tensor.
        historical_data: Array-like, converted to a float32 tensor.
        targets: Array-like, converted to a float32 tensor.

    Raises:
        ValueError: If the four inputs do not all contain the same number of samples.
    """

    def __init__(self, ideas, static_features, historical_data, targets):
        self.ideas = ideas
        self.static_features = torch.tensor(static_features, dtype=torch.float32)
        self.historical_data = torch.tensor(historical_data, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)

        # __len__ is driven by targets alone, so a length mismatch would otherwise only show
        # up as an IndexError in the middle of an epoch (or as silently ignored samples).
        sizes = {
            "ideas": len(self.ideas),
            "static_features": len(self.static_features),
            "historical_data": len(self.historical_data),
            "targets": len(self.targets),
        }
        if len(set(sizes.values())) != 1:
            raise ValueError(f"All inputs must have the same number of samples, got {sizes}")

    def __len__(self):
        """Number of samples (taken from the targets tensor)."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(idea, static_features, historical_data, target)`` for sample ``idx``."""
        return self.ideas[idx], self.static_features[idx], self.historical_data[idx], self.targets[idx]


# Wrap each split in a StockDataset; arguments follow the constructor order
# (ideas, static_features, historical_data, targets).
train_dataset = StockDataset(ideas_train, static_train, hist_train, y_train)
val_dataset = StockDataset(ideas_val, static_val, hist_val, y_val)


### Training the model
Here we import the model and set it up for training

In [13]:
import torch.optim as optim
import torch.nn as nn
import Model.HybridStockPredictionModel
from Model.HybridStockPredictionModel import StockPerformancePredictionModel

# Model initialization: input widths are read off the prepared 2-D arrays
static_feature_dim = static_features.shape[-1]  # one entry per static feature column
historical_dim = historical_data.shape[-1]  # one entry per "month_*" column
hidden_dim = 128  # Example hidden size

# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=10, step_size=5, gamma=0.1):
    """Train ``model`` for ``epochs`` epochs and validate after every epoch.

    Args:
        model: Module called with ``idea``, ``static_features``, ``historical_data``,
            ``use_auxiliary_inputs`` and ``predict_autoregressively`` keyword arguments.
        train_loader: Iterable of ``(ideas, static_features, historical_data, targets)`` batches.
        val_loader: Validation batches, forwarded to ``evaluate``.
        criterion: Loss callable taking ``(predictions, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the numeric tensors are moved to (``ideas`` are passed through unchanged).
        epochs: Number of passes over ``train_loader``.
        step_size: ``StepLR`` period, in epochs.
        gamma: ``StepLR`` multiplicative decay factor.

    Raises:
        ValueError: If ``train_loader`` contains no batches.
        AssertionError: If a batch's static or historical features contain NaN/Inf.
    """
    if len(train_loader) == 0:
        # Without this the average-loss division below would fail only after a wasted epoch.
        raise ValueError("train_loader contains no batches")

    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    for epoch in range(epochs):
        model.train()  # Set the model to training mode (evaluate() switches it to eval mode)
        train_loss = 0.0

        # Training phase
        for ideas, static_features, historical_data, targets in train_loader:
            # Check for invalid values in data
            assert not torch.isnan(static_features).any(), "Static features contain NaN"
            assert not torch.isinf(static_features).any(), "Static features contain Inf"
            assert not torch.isnan(historical_data).any(), "Historical data contains NaN"
            assert not torch.isinf(historical_data).any(), "Historical data contains Inf"

            # Move to device
            static_features, historical_data, targets = (
                static_features.to(device),
                historical_data.to(device),
                targets.to(device),
            )

            # Forward pass
            predictions = model(
                idea=ideas,
                static_features=static_features,
                historical_data=historical_data,
                use_auxiliary_inputs=True,
                predict_autoregressively=False
            )

            # Compute loss
            loss = criterion(predictions.squeeze(), targets)
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()

        # Step the scheduler after each epoch
        scheduler.step()

        # Evaluate the model after training for the epoch.
        # evaluate() returns (avg_val_loss, mse, mae, r2); formatting that tuple with ":.4f"
        # raises TypeError, so only the leading loss value is kept for the summary line.
        val_metrics = evaluate(model, val_loader, device, criterion)
        val_loss = val_metrics[0] if isinstance(val_metrics, (tuple, list)) else val_metrics

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}")



#### We can also create a custom loss function and an optimizer

You can also create custom loss functions:

In [5]:
def custom_loss(predictions, targets, model, lambda_reg=0.01):
    """Mean-squared error plus an L2 penalty over every parameter of ``model``.

    Args:
        predictions: Predicted values.
        targets: Ground-truth values, compared against ``predictions`` with ``F.mse_loss``.
        model: Module whose ``parameters()`` are all included in the penalty.
        lambda_reg: Weight applied to the summed squared parameters.

    Returns:
        ``mse + lambda_reg * sum(param ** 2)`` as computed below.
    """
    # Squared L2 norm, accumulated parameter tensor by parameter tensor (starts from 0)
    squared_weights = sum(torch.sum(weight ** 2) for weight in model.parameters())

    # Data-fit term
    data_term = F.mse_loss(predictions, targets)

    return data_term + lambda_reg * squared_weights


### Evaluation
After training we are going to evaluate our model:

In [6]:
def evaluate(model, val_loader, device, criterion):
    """Run ``model`` over ``val_loader`` without gradients and report regression metrics.

    Args:
        model: Module called as ``model(ideas, static_features=..., historical_data=...)``.
        val_loader: Iterable of ``(ideas, static_features, historical_data, targets)`` batches.
        device: Device the numeric tensors are moved to (``ideas`` are passed through unchanged).
        criterion: Loss callable taking ``(predictions, targets)``.

    Returns:
        Tuple ``(avg_val_loss, mse, mae, r2)``; the loss is averaged over batches and the
        other three are computed by scikit-learn over all collected predictions.

    Raises:
        ValueError: If ``val_loader`` yields no batches.

    Side effects:
        Leaves ``model`` in eval mode and prints the four metrics.
    """
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():  # No need to track gradients during evaluation
        for ideas_batch, static_batch, historical_batch, target_batch in val_loader:
            # Move data to device
            static_batch = static_batch.to(device)
            historical_batch = historical_batch.to(device)
            target_batch = target_batch.to(device)

            # Get predictions
            # NOTE(review): train_model passes use_auxiliary_inputs=True and
            # predict_autoregressively=False; this call relies on the model's defaults —
            # confirm the difference is intended.
            predictions = model(ideas_batch, static_features=static_batch, historical_data=historical_batch)

            # The training loop squeezes predictions before the loss.  Here, extra singleton
            # dimensions are removed by reshaping to the target's shape, which (unlike
            # squeeze) keeps the batch dimension even for a batch of one sample.
            if predictions.shape != target_batch.shape and predictions.numel() == target_batch.numel():
                predictions = predictions.reshape(target_batch.shape)

            # Compute loss
            loss = criterion(predictions, target_batch)
            val_loss += loss.item()

            # Collect all predictions and targets for evaluation
            all_predictions.append(predictions.cpu().numpy())
            all_targets.append(target_batch.cpu().numpy())

    if not all_predictions:
        # Previously this surfaced as a ZeroDivisionError in the average below.
        raise ValueError("val_loader yielded no batches; cannot compute validation metrics")

    # Compute average loss
    avg_val_loss = val_loss / len(val_loader)

    # Flatten the lists to make evaluation easier
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_targets = np.concatenate(all_targets, axis=0)

    # Calculate evaluation metrics
    mse = mean_squared_error(all_targets, all_predictions)
    mae = mean_absolute_error(all_targets, all_predictions)
    r2 = r2_score(all_targets, all_predictions)

    print(f'Validation Loss: {avg_val_loss:.4f}')
    print(f'MSE: {mse:.4f}')
    print(f'MAE: {mae:.4f}')
    print(f'R²: {r2:.4f}')

    return avg_val_loss, mse, mae, r2


# Integration with Training Loop

In [14]:
# Run Training and Evaluation
# Seed PyTorch's global RNG (same value as the split's random_state) so that weight
# initialisation and DataLoader shuffling are repeatable from run to run.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Static: {static_feature_dim}")
print(f"Historical: {historical_dim}")
model = StockPerformancePredictionModel(static_feature_dim, historical_dim, hidden_dim, forecast_steps).to(device)

# Define batch size
batch_size = 32

# Create DataLoader for training and validation (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=10)


Static: 3
Historical: 24


KeyboardInterrupt: 