# Hybrid Stock Prediction Model Training

In the "HybridStockPredictionModel" notebook we created a model that can be used to make efficient stock predictions for new business ideas. This notebook trains and evaluates that model.



First, we load the required libraries; then we create the Dataset class that will be used to train the model:





# Load Libraries and Set Up Dependencies

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from tqdm import tqdm
import os
tqdm._instances.clear()  # Clear any existing progress bars
tqdm.pandas(disable=True)  # Disable tqdm globally
tqdm.disable = True
os.environ["DISABLE_TQDM"] = "1"
import sys

# For Kaggle:
# sys.path.append('/kaggle/input/stockpredictionmodel/pytorch/default/7')
# For IDE:
sys.path.append('/home/kai/Documents/AIR-Project/Model')
from HybridStockPredictionModel import StockPerformancePredictionModel

from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# import joblib

# hidden_dim = 128  # Example hidden size
# historical_scaler = joblib.load("/kaggle/input/scaler/historical_scaler.pkl")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_save_path = "/kaggle/input/trainedmodel2/best_model.pth"
# model = StockPerformancePredictionModel(3, 12, hidden_dim, 12)

# # Load the model on the device where you want to use it
# model.load_state_dict(torch.load(model_save_path, map_location=device))
# model.to(device)

# model.eval()

# with torch.no_grad():
#     predictions = model(
#         idea=["Agilent Technologies, Inc. provides application focused solutions to the life sciences, diagnostics, and applied chemical markets worldwide. The company operates in three segments: Life Sciences and Applied Markets, Diagnostics and Genomics, and Agilent CrossLab. The Life Sciences and Applied Markets segment offers liquid chromatography systems and components; liquid chromatography mass spectrometry systems; gas chromatography systems and components; gas chromatography mass spectrometry systems; inductively coupled plasma mass spectrometry instruments; atomic absorption instruments; microwave plasma-atomic emission spectrometry instruments; inductively coupled plasma optical emission spectrometry instruments; raman spectroscopy; cell analysis plate based assays; flow cytometer; real-time cell analyzer; cell imaging systems; microplate reader; laboratory software; information management and analytics; laboratory automation and robotic systems; dissolution testing; and vacuum pumps, and measurement technologies. The Diagnostics and Genomics segment focuses on genomics, nucleic acid contract manufacturing and research and development, pathology, companion diagnostics, reagent partnership, and biomolecular analysis businesses. The Agilent CrossLab segment provides GC and LC columns, sample preparation products, custom chemistries, and laboratory instrument supplies; and offers services portfolio, including repairs, parts, maintenance, installations, training, compliance support, software as a service, asset management, and consulting services. The company markets its products through direct sales, distributors, resellers, manufacturer's representatives, and electronic commerce. Agilent Technologies, Inc. was incorporated in 1999 and is headquartered in Santa Clara, California."],
#         use_auxiliary_inputs=False
#     )

# print(predictions)
# nan_tensor = torch.full((1, 12), float('nan'), device=predictions.device) # Expand zeros_tensor to 2D (1, 12)
# right = [0.0002258687415962,0.0002258687415962,0.0002258687415962,0.0002258687415962,0.0002258687415962,0.0002258687415962,0.0002258687415962,0.0002258687415962,0.0002258687415962,0.0002258687415962,0.0002258687415962,0.0002258687415962]
# right_tensor = torch.tensor(right, device=predictions.device)
# right_tensor = right_tensor.unsqueeze(0)

# # Now concatenate along dim=1
# stock_market_prediction = torch.cat((right_tensor, predictions), dim=1)

# # Ensure the tensor is detached from the computation graph before converting to numpy
# predictions_np = stock_market_prediction.detach().cpu().numpy()
# prediction_historical = historical_scaler.inverse_transform(predictions_np)
# print(prediction_historical[0])

# # Assuming target_series and pred_series are created here...
# target_series = pd.Series(prediction_historical[0])

# sns.set_style("darkgrid")
# fig = plt.figure(figsize=(10, 6))  # Add figure size for clarity

# plt.subplot(1, 2, 1)
# ax = sns.lineplot(x=target_series.index, y=target_series.values, label="Data", color='royalblue')
# ax.set_title('Stock price', size=14, fontweight='bold')
# ax.set_xlabel("Days", size=14)
# ax.set_ylabel("Cost (USD)", size=14)

# plt.show()  # Ensure the plot appears




# Define the PyTorch Dataset

In [None]:
class StockDataset(Dataset):
    """Map-style dataset pairing each idea text with its numeric inputs and target.

    The idea texts are stored exactly as passed in; the three numeric inputs
    are copied into ``float32`` tensors once, at construction time.

    Args:
        ideas: Indexable collection of idea texts (kept without conversion).
        static_features: Array-like of static features, one entry per sample.
        historical_data: Array-like of historical values, one entry per sample.
        targets: Array-like of target values; its length defines ``len(dataset)``.
    """

    @staticmethod
    def _as_float32(values):
        """Copy ``values`` into a new ``float32`` tensor."""
        return torch.tensor(values, dtype=torch.float32)

    def __init__(self, ideas, static_features, historical_data, targets):
        self.ideas = ideas
        self.static_features = self._as_float32(static_features)
        self.historical_data = self._as_float32(historical_data)
        self.targets = self._as_float32(targets)

    def __len__(self):
        # The targets tensor is the reference for the number of samples.
        return len(self.targets)

    def __getitem__(self, idx):
        # Sample layout: (idea text, static features, historical data, target).
        numeric_fields = (self.static_features, self.historical_data, self.targets)
        return (self.ideas[idx], *(field[idx] for field in numeric_fields))

# Load and Preprocess the Dataset

In [None]:
# Load the dataset
# NOTE(review): `plt` is used below, but no active `import matplotlib.pyplot as plt`
# is visible in this file - confirm it is provided by the notebook session.
df = pd.read_csv("/kaggle/input/realstockdataset/real_company_stock_dataset.csv")

# Define columns
idea_column = "business_description"
static_feature_columns = ["market_size", "investment", "team_strength"]
historical_columns = [col for col in df.columns if col.startswith("month_")]

# Rows before this position are skipped. Every slice below uses the same
# offset so that ideas, static features and price windows stay row-aligned.
first_row = 327

# Prepare the text and static inputs
ideas = df[idea_column].iloc[first_row:].reset_index(drop=True).values
static_features = df[static_feature_columns].iloc[first_row:, :].reset_index(drop=True).values

# Work with historical columns as DataFrame first
historical_df = df[historical_columns]

# Same rows, split by column: the first 12 "month_" columns become the
# historical model input, the following 12 become the prediction targets.
train_data = historical_df.iloc[first_row:, :12].reset_index(drop=True).values
test_data = historical_df.iloc[first_row:, 12:24].reset_index(drop=True).values

# Initialize new scalers
static_scaler = StandardScaler()  # or MinMaxScaler()
historical_scaler = MinMaxScaler(feature_range=(0, 1))

# NOTE(review): the scalers are fitted on all selected rows *before* the
# train/test split below, so held-out rows contribute to the scaling
# statistics - confirm this is acceptable for the reported metrics.
train_data = historical_scaler.fit_transform(train_data)  # fitted on the 12 input columns
test_data = historical_scaler.transform(test_data)  # target columns reuse the input-column scaling, position by position
static_features = static_scaler.fit_transform(static_features)  # static feature columns

# Convert NaN to zero (in scaled space)
train_data[np.isnan(train_data)] = 0
test_data[np.isnan(test_data)] = 0
static_features[np.isnan(static_features)] = 0

# Train-test split (by row); hist_* is the input window, target_* the window to predict
ideas_train, ideas_test, static_train, static_test, hist_train, hist_test, target_train, target_test = train_test_split(
    ideas, static_features, train_data, test_data, test_size=0.2, random_state=42
)


# Denormalize both windows for plotting. Despite the names, no model output is
# involved here: `predictions_historical` holds the target window and
# `targets_historical` holds the input window.
predictions_historical = historical_scaler.inverse_transform(test_data)
targets_historical = historical_scaler.inverse_transform(train_data)

plt.figure(figsize=(10, 6))

# One line per company; only the first line is labelled so the legend shows a
# single entry instead of being empty (or repeating once per company).
for i, series in enumerate(targets_historical):
    plt.plot(series, linestyle='--', marker='x', color='red',
             label='Months 1-12' if i == 0 else '_nolegend_')

# Add labels, title, and legend
plt.xlabel('Time Steps')
plt.ylabel('Value')
plt.title('First 12 Months Training-Data')
plt.legend()
plt.grid(True)

# Show the plot
plt.show()

plt.figure(figsize=(10, 6))
for i, series in enumerate(predictions_historical):
    plt.plot(series, linestyle='-', marker='o', color='blue',
             label='Months 13-24' if i == 0 else '_nolegend_')

# Add labels, title, and legend
plt.xlabel('Time Steps')
plt.ylabel('Value')
plt.title('12 to 24 Months Testing-Data')
plt.legend()
plt.grid(True)

# Show the plot
plt.show()


# Combine all inputs for the training dataset.
# Each row of historical data matches the static features and idea text at the
# same position, because all arrays were sliced with the same row offset.
train_dataset = StockDataset(
    ideas=ideas_train,
    static_features=static_train,
    historical_data=hist_train,
    targets=target_train,
)

test_dataset = StockDataset(
    ideas=ideas_test,                             # Idea texts
    static_features=static_test,                  # Static features
    historical_data=hist_test,                    # Historical data
    targets=target_test,                          # Targets
)

print("Dataset updated successfully!")

### Training the model

Here we define the training loop that will be used to train the imported model.

In [None]:
def _plot_epoch_predictions(predictions, targets):
    """Overlay each predicted series (blue) on its target series (red) and show the figure."""
    plt.figure(figsize=(10, 6))

    for row, (predicted, actual) in enumerate(zip(predictions, targets)):
        # Only the first pair carries a label, so the legend has one entry per
        # colour instead of being empty or repeating once per sample.
        is_first = row == 0
        plt.plot(predicted, linestyle='-', marker='o', color='blue',
                 label='Predictions' if is_first else '_nolegend_')
        plt.plot(actual, linestyle='--', marker='x', color='red',
                 label='Targets' if is_first else '_nolegend_')

    plt.xlabel('Time Steps')
    plt.ylabel('Value')
    plt.title('Predictions vs Targets')
    plt.legend()
    plt.grid(True)
    plt.show()


def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=100):
    """Train ``model`` for ``epochs`` epochs, validating and plotting after each one.

    Per epoch: run one optimisation pass over ``train_loader`` (gradients are
    clipped to a max norm of 5.0), plot the de-normalised training predictions
    against their targets, call ``evaluate`` on ``val_loader``, step the
    ``ReduceLROnPlateau`` scheduler on the validation loss, and log both losses
    to TensorBoard under ``logs``. After the last epoch the loss curves are
    plotted.

    Relies on the module-level names ``historical_scaler``, ``evaluate``,
    ``plt`` and ``sns`` being defined when it is called.

    Args:
        model: Model called as ``model(idea=..., static_features=...,
            historical_data=..., use_auxiliary_inputs=True,
            predict_autoregressively=False)``.
        train_loader: Iterable of ``(ideas, static_features, historical_data,
            targets)`` batches used for optimisation.
        val_loader: Loader passed unchanged to ``evaluate``.
        criterion: Callable ``criterion(predictions, targets)`` returning the loss.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the numeric batch tensors are moved to.
        epochs: Number of epochs to run.

    Returns:
        Tuple ``(train_losses, val_losses)`` with one mean loss per epoch each.
    """
    # NOTE(review): recent PyTorch releases deprecate the `verbose` argument -
    # confirm against the installed version.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)
    writer = SummaryWriter(log_dir="logs")  # TensorBoard writer
    train_loss_statistics = []
    test_loss_statistics = []

    try:
        for epoch in range(epochs):
            model.train()
            train_loss = 0.0

            all_predictions = []
            all_targets = []

            for ideas, static_batch, historical_batch, targets in train_loader:
                # The idea texts stay as delivered by the loader; only tensors move.
                static_batch = static_batch.to(device)
                historical_batch = historical_batch.to(device)
                targets = targets.to(device)

                optimizer.zero_grad()

                predictions = model(
                    idea=ideas,
                    static_features=static_batch,
                    historical_data=historical_batch,
                    use_auxiliary_inputs=True,
                    predict_autoregressively=False,
                )

                loss = criterion(predictions, targets)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)  # More lenient gradient clipping
                optimizer.step()
                train_loss += loss.item()

                all_predictions.append(predictions.detach().cpu().numpy())
                all_targets.append(targets.detach().cpu().numpy())

            # Stack the per-batch arrays and undo the scaling for plotting.
            predictions_historical = historical_scaler.inverse_transform(
                np.concatenate(all_predictions, axis=0)
            )
            targets_historical = historical_scaler.inverse_transform(
                np.concatenate(all_targets, axis=0)
            )
            _plot_epoch_predictions(predictions_historical, targets_historical)

            # Validation phase
            val_loss, _, _, _ = evaluate(model, val_loader, device, criterion)
            scheduler.step(val_loss)

            # Logging
            train_loss /= len(train_loader)
            print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}")
            train_loss_statistics.append(train_loss)
            test_loss_statistics.append(val_loss)
            writer.add_scalars('Loss', {'Train': train_loss, 'Validation': val_loss}, epoch)
    finally:
        # Close the TensorBoard writer even when an epoch raises.
        writer.close()

    sns.set_style("darkgrid")
    plt.figure(figsize=(16, 6))  # Set figure size here
    # NOTE(review): only the right-hand slot of a 1x2 grid is used, so the left
    # half of the figure stays blank - confirm whether that is intended.
    ax = plt.subplot(1, 2, 2)

    # x-axis: 1-based epoch numbers (kept separate from the `epochs` parameter)
    epoch_numbers = range(1, len(train_loss_statistics) + 1)

    # Plot training loss
    sns.lineplot(x=epoch_numbers, y=train_loss_statistics, label="Train Loss", color='royalblue', ax=ax)

    # Plot testing loss
    sns.lineplot(x=epoch_numbers, y=test_loss_statistics, label="Test Loss", color='tomato', ax=ax)

    # Customize labels and title
    ax.set_xlabel("Epoch", size=14)
    ax.set_ylabel("Loss", size=14)
    ax.set_title("Training Loss / Testing Loss", size=14, fontweight='bold')

    plt.tight_layout()  # Adjust layout to prevent overlap
    plt.show()

    return train_loss_statistics, test_loss_statistics


#### We can also create a custom loss function

Besides the built-in losses, you can define a custom loss function:

In [None]:
class TemporalLoss(nn.Module):
    """Weighted sum of four terms computed on a batch of predicted sequences.

    Terms (weights in parentheses):
        * MSE between predictions and targets (``mse_weight``).
        * Negative standard deviation over all predicted values, which rewards
          spread-out predictions (``diversity_weight``).
        * Mean squared *relative* change between consecutive time steps
          (``lambda_smooth``).
        * Sum of the magnitudes of all negative predictions (``lambda_penalty``).
          Note this term is a sum, not a mean, so it grows with batch size.

    Args:
        mse_weight: Weight of the MSE term.
        diversity_weight: Weight of the negative-standard-deviation term.
        lambda_smooth: Weight of the temporal smoothness term.
        lambda_penalty: Weight of the negative-prediction penalty.
    """

    def __init__(self, mse_weight=0.6, diversity_weight=0.1, lambda_smooth=0.2, lambda_penalty=0.1):
        super().__init__()
        self.mse_weight = mse_weight
        self.diversity_weight = diversity_weight
        self.lambda_smooth = lambda_smooth
        self.lambda_penalty = lambda_penalty
        self.mse_loss = nn.MSELoss()

    def forward(self, predictions, targets):
        """Return the combined scalar loss.

        Args:
            predictions: 2-D tensor (a trailing axis of size 1 is appended) or
                3-D tensor; axis 1 is treated as the time axis.
            targets: Tensor matching ``predictions``; 2-D input gets the same
                trailing axis appended.

        Returns:
            Scalar tensor with the weighted sum of the four terms.
        """
        # Bring 2-D inputs to 3-D so the time-axis slicing below is uniform.
        if predictions.dim() == 2:
            predictions = predictions.unsqueeze(-1)
        if targets.dim() == 2:
            targets = targets.unsqueeze(-1)

        # Base MSE loss
        mse = self.mse_loss(predictions, targets)

        # Diversity encouragement. The standard deviation of a single value is
        # NaN, which would poison the whole loss, so that case contributes 0.
        if predictions.numel() > 1:
            diversity_penalty = -torch.std(predictions)
        else:
            diversity_penalty = predictions.new_zeros(())

        # Temporal smoothness: step-to-step change relative to the previous
        # value (epsilon avoids division by zero).
        previous_step = predictions[:, :-1, :]
        temporal_diff = (predictions[:, 1:, :] - previous_step) / (torch.abs(previous_step) + 1e-6)
        if temporal_diff.numel() > 0:
            smoothness_loss = torch.mean(temporal_diff ** 2)
        else:
            # A single time step has no consecutive pairs; the mean of an
            # empty tensor would be NaN.
            smoothness_loss = predictions.new_zeros(())

        # Negative prediction penalty: only values below zero contribute.
        negative_penalty = torch.sum(torch.clamp(-predictions, min=0))

        # Combine all loss components
        combined_loss = (
            self.mse_weight * mse +
            self.diversity_weight * diversity_penalty +
            self.lambda_smooth * smoothness_loss +
            self.lambda_penalty * negative_penalty
        )

        return combined_loss



### Evaluation

After training we are going to evaluate our model:

In [None]:
import joblib
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns

def evaluate(model, val_loader, device, criterion):
    """Evaluate ``model`` on ``val_loader`` using the idea text as its only input.

    The model is called with ``use_auxiliary_inputs=False``, so the static and
    historical tensors delivered by the loader are not fed to it. Predictions
    and targets are de-normalised with the module-level ``historical_scaler``,
    plotted against each other, and scored.

    Relies on the module-level names ``historical_scaler`` and ``plt``.
    NOTE(review): no active ``import matplotlib.pyplot as plt`` is visible in
    this file - confirm it is provided by the notebook session.

    Args:
        model: Model called as ``model(idea=..., use_auxiliary_inputs=False)``.
        val_loader: Iterable of ``(ideas, static_features, historical_data,
            targets)`` batches.
        device: Device the target tensors are moved to before computing the loss.
        criterion: Callable ``criterion(predictions, targets)`` returning the loss.

    Returns:
        Tuple ``(val_loss, mse, mae, r2)``: the mean per-batch loss in scaled
        space, followed by MSE, MAE and R² computed on de-normalised values.
    """
    model.eval()  # Set the model to evaluation mode

    val_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():  # No need to track gradients during evaluation
        # The static/historical batches are not passed to the model here, so
        # they are not moved to the device either.
        for ideas_batch, _static_batch, _historical_batch, target_batch in val_loader:
            target_batch = target_batch.to(device)

            # Get predictions from the idea text alone
            predictions = model(idea=ideas_batch, use_auxiliary_inputs=False)

            loss = criterion(predictions, target_batch)
            val_loss += loss.item()

            all_predictions.append(predictions.cpu().numpy())
            all_targets.append(target_batch.cpu().numpy())

    # Average validation loss over batches
    val_loss /= len(val_loader)
    print(f"Validation Loss: {val_loss}")

    # Stack the per-batch arrays to make evaluation easier
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_targets = np.concatenate(all_targets, axis=0)

    # Undo the scaling so plots and metrics are in the original units
    predictions_historical = historical_scaler.inverse_transform(all_predictions)
    targets_historical = historical_scaler.inverse_transform(all_targets)

    plt.figure(figsize=(10, 6))

    for row, (predicted, actual) in enumerate(zip(predictions_historical, targets_historical)):
        # Only the first pair carries a label, so the legend has one entry per
        # colour instead of being empty or repeating once per sample.
        is_first = row == 0
        plt.plot(predicted, linestyle='-', marker='o', color='blue',
                 label='Predictions' if is_first else '_nolegend_')
        plt.plot(actual, linestyle='--', marker='x', color='red',
                 label='Targets' if is_first else '_nolegend_')

    # Add labels, title, and legend
    plt.xlabel('Time Steps')
    plt.ylabel('Value')
    plt.title('Predictions vs Targets')
    plt.legend()
    plt.grid(True)

    # Show the plot
    plt.show()

    # Metrics on de-normalised values; all three use (y_true, y_pred) order
    mse_historical = mean_squared_error(targets_historical, predictions_historical)
    mae_historical = mean_absolute_error(targets_historical, predictions_historical)
    r2_historical = r2_score(targets_historical, predictions_historical)

    print(f"MSE for historical features: {mse_historical}")
    print(f"MAE for historical features: {mae_historical}")
    print(f"R² for historical features: {r2_historical}")

    return val_loss, mse_historical, mae_historical, r2_historical


# Integration with Training Loop

In [None]:
# Run Training and Evaluation
# Model initialization
static_feature_dim = static_features.shape[1]
historical_dim = 12
hidden_dim = 128
forecast_steps = 12
# Location of previously trained weights; only used by the commented-out
# reload alternative below.
model_save_path = "/kaggle/input/trainedmodel1/best_model.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = xm.xla_device()

model = StockPerformancePredictionModel(static_feature_dim, historical_dim, hidden_dim, forecast_steps).to(device)
# Alternative: continue from previously trained weights
# model = StockPerformancePredictionModel(static_feature_dim, historical_dim, hidden_dim, forecast_steps)
# model.load_state_dict(torch.load(model_save_path, weights_only=True))
# model.to(device)

# Create DataLoaders for training and validation
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Loss and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
temporal_loss = TemporalLoss(lambda_smooth=0.1)

# import warnings
# warnings.filterwarnings("ignore", message="Loading widget...")

print("Start Training")
train_model(model, train_loader, test_loader, temporal_loss, optimizer, device, epochs=50)

# Persist the weights as they are after the final epoch. No per-epoch
# checkpointing happens here, so despite the file name this is the last
# state, not necessarily the one with the lowest validation loss.
output_dir = "models"
output_path = os.path.join(output_dir, "best_model.pth")
os.makedirs(output_dir, exist_ok=True)
torch.save(model.state_dict(), output_path)

# Report the path that was actually written.
print(f"Saved Model in {output_path}")