# Hybrid Stock Prediction Model Training

In the "HybridStockPredictionModel" notebook we created a model that can be used to make efficient stock predictions for new business ideas.



First, we create the Dataset class that will be used to train the model:





# Load Libraries and Set Up Dependencies

In [56]:
import torch

import torch.nn as nn

import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sentence_transformers import SentenceTransformer

import torch.nn.functional as F

from torch.utils.data import DataLoader

import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from tqdm import tqdm
import os

# Best-effort attempt to keep tqdm progress bars out of the notebook output.
# NOTE(review): the recorded output of the prediction cell still shows a
# "Batches: 0%|...|" bar, so these switches apparently do not silence every
# bar — confirm which of them (if any) take effect.
tqdm._instances.clear()  # Clear any existing progress bars
tqdm.pandas(disable=True)  # Passes disable=True to tqdm's pandas integration
# Sets an attribute named `disable` on the tqdm class itself.
# NOTE(review): presumably meant as a global off-switch — verify that tqdm reads it.
tqdm.disable = True

# (Optional) Disable tqdm via environment variable.
# This assignment runs after tqdm has already been imported.
# NOTE(review): confirm that this variable name is honoured by tqdm or a downstream library.
os.environ["DISABLE_TQDM"] = "1"

import sys
sys.path.append('/kaggle/input/stockpredictionmodel/pytorch/default/1')
from HybridStockPredictionModel import StockPerformancePredictionModel

import matplotlib.pyplot as plt


In [59]:
import joblib

# Load a previously trained model and run a single text-only prediction.
hidden_dim = 128  # Example hidden size

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load scaler files that come from a trusted source.
historical_scaler = joblib.load("/kaggle/input/scaler/historical_scaler.pkl")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_save_path = "/kaggle/input/trainedmodel1/best_model.pth"

# Positional arguments follow the order used in the training cell:
# (static_feature_dim, historical_dim, hidden_dim, forecast_steps).
model = StockPerformancePredictionModel(3, 24, hidden_dim, 24)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(model_save_path, map_location=device, weights_only=True))
model.to(device)
# Inference must run in eval mode so that train-only layers (e.g. dropout)
# do not perturb the prediction.
model.eval()

idea_text = "AbCellera Biologics Inc. builds an engine for antibody drug discovery and development. Its engine discovers antibodies from natural immune responses, which are pre-enriched for antibodies. The company's preclinical products are ABCL635 for metabolic and endocrine conditions; and ABCL575 for atopic dermatitis. It has a research collaboration and license agreement with Eli Lilly and Company; a research collaboration with Confo Therapeutics for the discovery of therapeutic antibody candidates targeting two undisclosed GPCR targets; and strategic collaboration with Biogen Inc. to discover therapeutic antibodies for neurological conditions, as well as collaboration with Viking Global Investors and ArrowMark Partners. The company was incorporated in 2012 and is headquartered in Vancouver, Canada."

# No gradients are needed for a prediction; skipping the autograd graph saves memory.
with torch.no_grad():
    predict = model(
        idea=[idea_text],
        use_auxiliary_inputs=False
    )

print(predict)

# Move to host memory before handing the values to the scikit-learn scaler.
predictions_np = predict.cpu().numpy()

# Map the normalized outputs back to the original value scale.
prediction_historical = historical_scaler.inverse_transform(predictions_np)

print(prediction_historical)




https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  n_nonblank = len("".join(repr_.split()))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[0.0559, 0.0691, 0.0856, 0.1086, 0.1388, 0.1745, 0.2124, 0.2493, 0.2823,
         0.3093, 0.3294, 0.3426, 0.3496, 0.3514, 0.3495, 0.3450, 0.3390, 0.3325,
         0.3261, 0.3204, 0.3156, 0.3118, 0.3091, 0.3073]], device='cuda:0',
       grad_fn=<SqueezeBackward1>)
[[36448.363  52886.816  44280.86   31755.986  24987.928  11462.613
   2264.2227  2916.9507  1496.0712  1391.8411  1155.2336  1204.2379
   1296.8995  1266.2158  1212.2828  1251.4788  1170.2217  1255.5984
   1291.9784  1190.294   1233.674   1313.3424  1445.3121  1601.451 ]]


# Load and Preprocess the Dataset

In [None]:
# Load the dataset
df = pd.read_csv("/kaggle/input/csv-dataset/normalized_real_company_stock_dataset.csv")

# Define columns
idea_column = "business_description"
static_feature_columns = ["market_size", "investment", "team_strength"]
historical_columns = [col for col in df.columns if col.startswith("month_")]

# The targets are the complete monthly series: every "month_*" column, i.e. the
# same columns that are also passed to the model as historical_data.
target_columns = historical_columns
forecast_steps = 24  # NOTE(review): presumably equals len(target_columns) — confirm

# Prepare features and targets
ideas = df[idea_column].values
static_features = df[static_feature_columns].values
historical_data = df[historical_columns].values
targets = df[target_columns].values

# Train-validation split (performed on the raw static features so that the
# scaler below never sees validation rows while fitting)
(
    ideas_train, ideas_val,
    static_train_raw, static_val_raw,
    hist_train, hist_val,
    y_train, y_val,
) = train_test_split(
    ideas, static_features, historical_data, targets, test_size=0.2, random_state=42
)

# Scale static features: fit on the training rows only, then apply the same
# transform everywhere. Fitting on the full dataset would leak validation
# statistics into training.
scaler_static = StandardScaler()
static_train = scaler_static.fit_transform(static_train_raw)
static_val = scaler_static.transform(static_val_raw)
# Keep `static_features` as the scaled full matrix, as before.
static_features = scaler_static.transform(static_features)

# Convert the ideas to embeddings using SentenceTransformer
from sentence_transformers import SentenceTransformer

# Load the pre-trained text encoder
text_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Convert the text ideas into embeddings (numerical vectors); progress bars are
# switched off explicitly here, in line with the notebook's tqdm settings.
ideas_train_embeddings = text_encoder.encode(ideas_train, convert_to_numpy=True, show_progress_bar=False)
ideas_val_embeddings = text_encoder.encode(ideas_val, convert_to_numpy=True, show_progress_bar=False)

# Now convert these embeddings into torch tensors
ideas_train_tensor = torch.tensor(ideas_train_embeddings, dtype=torch.float32)
ideas_val_tensor = torch.tensor(ideas_val_embeddings, dtype=torch.float32)

# Convert static features and target variables into torch tensors
static_train_tensor = torch.tensor(static_train, dtype=torch.float32)
static_val_tensor = torch.tensor(static_val, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

# Define the PyTorch Dataset

In [None]:
class StockDataset(Dataset):
    """Indexable dataset of (idea, static features, historical data, target) samples.

    The ideas are stored exactly as given; the three numeric inputs are copied
    into float32 tensors when the dataset is constructed.
    """

    @staticmethod
    def _float32(values):
        """Copy `values` into a new float32 tensor."""
        return torch.tensor(values, dtype=torch.float32)

    def __init__(self, ideas, static_features, historical_data, targets):
        self.ideas = ideas
        self.static_features = self._float32(static_features)
        self.historical_data = self._float32(historical_data)
        self.targets = self._float32(targets)

    def __len__(self):
        # The number of samples is defined by the target tensor.
        return len(self.targets)

    def __getitem__(self, idx):
        # Same ordering as the constructor arguments: idea first, then the
        # three tensors sliced at the same index.
        numeric_parts = (self.static_features, self.historical_data, self.targets)
        return (self.ideas[idx], *(part[idx] for part in numeric_parts))





# Wrap the training and validation splits in StockDataset objects. The ideas
# are handed over as raw values from the split, not as embeddings.
train_dataset, val_dataset = (
    StockDataset(
        ideas=split_ideas,
        static_features=split_static,
        historical_data=split_hist,
        targets=split_targets,
    )
    for split_ideas, split_static, split_hist, split_targets in (
        (ideas_train, static_train, hist_train, y_train),
        (ideas_val, static_val, hist_val, y_val),
    )
)


### Training the model

Here we import the model and set it up for training

In [None]:
import torch.optim as optim

import torch.nn as nn

import sys
sys.path.append('/kaggle/input/stockpredictionmodel/pytorch/default/1')
from HybridStockPredictionModel import StockPerformancePredictionModel

from torch.optim.lr_scheduler import ReduceLROnPlateau


# Model dimensions: both input widths are the column counts of the prepared
# feature matrices; the hidden width is a free choice.
static_feature_dim, historical_dim = static_features.shape[1], historical_data.shape[1]
hidden_dim = 128  # Example hidden size



# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=100,
                checkpoint_path="models/best_model.pth"):
    """Train `model`, validate after every epoch and checkpoint the best weights.

    Each epoch runs one pass over `train_loader` with gradient-norm clipping,
    then calls `evaluate` on `val_loader`. The validation loss drives a
    ReduceLROnPlateau scheduler and decides whether the current weights are
    written to `checkpoint_path`. Training always runs for all `epochs`; there
    is no early stopping.

    Args:
        model: Network called as ``model(idea=..., static_features=...,
            historical_data=..., use_auxiliary_inputs=True,
            predict_autoregressively=False)``.
        train_loader: Iterable with a length, yielding
            ``(ideas, static_features, historical_data, targets)`` batches.
        val_loader: Validation batches, passed unchanged to `evaluate`.
        criterion: Callable ``criterion(predictions, targets)`` returning a scalar loss.
        optimizer: Optimizer that updates the parameters of `model`.
        device: Device the numeric batch tensors are moved to.
        epochs: Number of passes over `train_loader`.
        checkpoint_path: File that receives the state_dict with the lowest
            validation loss seen so far.

    Raises:
        AssertionError: If a batch of static or historical inputs contains NaN or Inf.
    """
    import os  # imported once per call instead of once per epoch

    # The deprecated `verbose` flag is not passed; learning-rate changes are
    # reported explicitly after each scheduler step instead.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
    best_val_loss = float('inf')

    checkpoint_dir = os.path.dirname(checkpoint_path)
    num_train_batches = len(train_loader)

    for epoch in range(epochs):
        model.train()  # Set the model to training mode

        train_loss = 0.0

        # Training phase
        for ideas, static_features, historical_data, targets in train_loader:
            # Check for invalid values in data
            assert not torch.isnan(static_features).any(), "Static features contain NaN"
            assert not torch.isinf(static_features).any(), "Static features contain Inf"
            assert not torch.isnan(historical_data).any(), "Historical data contains NaN"
            assert not torch.isinf(historical_data).any(), "Historical data contains Inf"

            # Move the numeric tensors to the device; `ideas` is passed to the model as-is.
            static_features = static_features.to(device)
            historical_data = historical_data.to(device)
            targets = targets.to(device)

            # Forward pass
            predictions = model(
                idea=ideas,
                static_features=static_features,
                historical_data=historical_data,
                use_auxiliary_inputs=True,
                predict_autoregressively=False
            )

            loss = criterion(predictions, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()

            # Track the training loss
            train_loss += loss.item()

        # Evaluate the model after training for the epoch
        val_loss, _, _, _ = evaluate(model, val_loader, device, criterion)

        # Adjust learning rate based on validation loss and report any change.
        previous_lrs = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(val_loss)
        for group_index, (old_lr, group) in enumerate(zip(previous_lrs, optimizer.param_groups)):
            if group['lr'] != old_lr:
                print(f"Epoch {epoch + 1}: learning rate of group {group_index} "
                      f"changed from {old_lr} to {group['lr']}")

        # Checkpoint the best model so far (training continues regardless).
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(model.state_dict(), checkpoint_path)

        # An empty loader has no meaningful average; report NaN instead of dividing by zero.
        mean_train_loss = train_loss / num_train_batches if num_train_batches else float('nan')
        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {mean_train_loss:.4f}, Val Loss: {val_loss}")







#### We can also create a custom loss function and an optimizer

You can also create custom loss functions:

In [None]:
import torch
import torch.nn as nn

class TemporalLoss(nn.Module):
    """MSE loss plus a penalty on step-to-step changes of the predicted sequence.

    loss = MSE(predictions, targets)
           + lambda_smooth * mean((predictions[:, t+1] - predictions[:, t]) ** 2)

    Args:
        lambda_smooth: Weight of the smoothness penalty.
    """

    def __init__(self, lambda_smooth=0.1):
        super().__init__()
        self.lambda_smooth = lambda_smooth
        self.mse_loss = nn.MSELoss()

    def forward(self, predictions, targets):
        """Return the combined loss as a scalar tensor.

        2-D inputs of shape (batch_size, sequence_length) are expanded to
        (batch_size, sequence_length, 1); dimension 1 is treated as the time axis.
        """
        # Reshape predictions if needed
        if predictions.dim() == 2:
            predictions = predictions.unsqueeze(-1)
        if targets.dim() == 2:
            targets = targets.unsqueeze(-1)  # Ensure targets match dimensions

        # Base MSE loss
        base_loss = self.mse_loss(predictions, targets)

        # With fewer than two time steps there are no consecutive differences.
        # The mean of an empty tensor is NaN, which would poison the whole
        # loss, so the smoothness term is skipped in that case.
        if predictions.size(1) < 2:
            return base_loss

        # Temporal smoothness loss: mean squared difference of neighbouring steps
        temporal_diff = predictions[:, 1:, :] - predictions[:, :-1, :]
        smoothness_loss = torch.mean(temporal_diff ** 2)

        return base_loss + self.lambda_smooth * smoothness_loss


### Evaluation

After training we are going to evaluate our model:

In [None]:
import joblib
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def _report_batch_diagnostics(pred_norm, target_norm, pred_denorm, target_denorm):
    """Print error and correlation figures for one batch in both value scales."""
    # Batch-level figures
    print(f"Mean Absolute Error (MAE) for batch (normalized): {np.mean(np.abs(pred_norm - target_norm))}")
    print(f"Denormalized Loss: {np.mean((pred_denorm - target_denorm) ** 2)}")
    print(f"Mean Absolute Error (MAE) for batch (denormalized): {np.mean(np.abs(pred_denorm - target_denorm))}")

    # Per-sample Pearson correlation on the normalized series
    for sample_idx in range(pred_norm.shape[0]):
        correlation = np.corrcoef(pred_norm[sample_idx].flatten(), target_norm[sample_idx].flatten())[0, 1]
        print(f"Correlation for sample {sample_idx} (normalized): {correlation}")

    # Per-sample correlation and errors on the denormalized series
    for sample_idx in range(pred_denorm.shape[0]):
        pred_series = pred_denorm[sample_idx]
        target_series = target_denorm[sample_idx]

        correlation = np.corrcoef(pred_series.flatten(), target_series.flatten())[0, 1]
        print(f"Correlation for sample {sample_idx} (denormalized): {correlation}")
        print(f"Denormalized Loss for sample {sample_idx}: {np.mean((pred_series - target_series) ** 2)}")
        print(f"Mean Absolute Error (MAE) for sample {sample_idx} (denormalized): "
              f"{np.mean(np.abs(pred_series - target_series))}")


def evaluate(model, val_loader, device, criterion, verbose=True):
    """Evaluate `model` on `val_loader` and report metrics in the original value scale.

    The model is called with the idea text only (``use_auxiliary_inputs=False``);
    the static and historical inputs of each batch are not used.
    # NOTE(review): training passes use_auxiliary_inputs=True — confirm that
    # validating on text-only predictions is intended.

    Args:
        model: Network called as ``model(idea=..., use_auxiliary_inputs=False)``.
        val_loader: Iterable yielding ``(ideas, static, historical, targets)`` batches.
        device: Device the target tensors are moved to before computing the loss.
        criterion: Callable ``criterion(predictions, targets)`` returning a scalar loss.
        verbose: When True (default), print per-batch and per-sample diagnostics.
            The summary metrics are always printed.

    Returns:
        Tuple ``(val_loss, mse_historical, mae_historical, r2_historical)``:
        the mean criterion value per batch, followed by MSE, MAE and R² computed
        on the inverse-transformed predictions and targets.

    Raises:
        ValueError: If `val_loader` yields no batches.
    """
    model.eval()  # Set the model to evaluation mode

    # Scaler used to map normalized values back to the original scale.
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load scaler files from a trusted source.
    historical_scaler = joblib.load("/kaggle/input/scaler/historical_scaler.pkl")

    val_loss = 0.0
    all_predictions = []
    all_targets = []
    predictions = None

    with torch.no_grad():  # No need to track gradients during evaluation
        for ideas_batch, static_batch, historical_batch, target_batch in val_loader:
            target_batch = target_batch.to(device)

            # Get predictions
            predictions = model(idea=ideas_batch, use_auxiliary_inputs=False)

            val_loss += criterion(predictions, target_batch).item()

            # Convert once and reuse for both aggregation and diagnostics
            predictions_np = predictions.cpu().numpy()
            target_batch_np = target_batch.cpu().numpy()
            all_predictions.append(predictions_np)
            all_targets.append(target_batch_np)

            if verbose:
                _report_batch_diagnostics(
                    predictions_np,
                    target_batch_np,
                    historical_scaler.inverse_transform(predictions_np),
                    historical_scaler.inverse_transform(target_batch_np),
                )

    if not all_predictions:
        raise ValueError("val_loader yielded no batches; nothing to evaluate")

    # Average validation loss over the batches that were processed
    val_loss /= len(all_predictions)
    print(f"Validation Loss: {val_loss}")

    # Stack the batches so the metrics cover the whole validation set
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_targets = np.concatenate(all_targets, axis=0)

    # Denormalize predictions and targets
    predictions_historical = historical_scaler.inverse_transform(all_predictions)
    targets_historical = historical_scaler.inverse_transform(all_targets)

    # scikit-learn metrics take (y_true, y_pred) in that order
    mse_historical = mean_squared_error(targets_historical, predictions_historical)
    mae_historical = mean_absolute_error(targets_historical, predictions_historical)
    r2_historical = r2_score(targets_historical, predictions_historical)

    # Print summary metrics
    print(f"MSE for historical features: {mse_historical}")
    print(f"MAE for historical features: {mae_historical}")
    print(f"R² for historical features: {r2_historical}")
    print("___ This is the Prediction ___")
    # Only the predictions of the last validation batch are shown here.
    print(f"{predictions}")

    return val_loss, mse_historical, mae_historical, r2_historical


# Integration with Training Loop

In [None]:
# Run Training and Evaluation
# Location of a previously trained checkpoint (not loaded for a fresh run).
model_save_path = "/kaggle/input/trainedmodel1/best_model.pth"

# Seed the random number generators so that weight initialization and batch
# shuffling are repeatable between runs (same value as the train/val split).
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = StockPerformancePredictionModel(static_feature_dim, historical_dim, hidden_dim, forecast_steps).to(device)
# To continue from the saved checkpoint instead of training from scratch:
# model.load_state_dict(torch.load(model_save_path, map_location=device, weights_only=True))

# Define batch size
batch_size = 32

# Create DataLoader for training and validation; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Loss and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
temporal_loss = TemporalLoss(lambda_smooth=0.1)

import warnings
warnings.filterwarnings("ignore", message="Loading widget...")


train_model(model, train_loader, val_loader, temporal_loss, optimizer, device, epochs=50)
