In [None]:
# import os
# os.chdir('../')
# print(os.getcwd())

In [None]:
import torch
import pytorch_lightning as pl
from torch import nn
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import os
import random
import optuna
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import torch.nn.functional as F


# Output directory, relative to the current working directory. The code below
# places model checkpoints, TensorBoard logs and the Optuna SQLite file under it.
OUTPUT_PATH = "data/fine_tuned_gaap_classifier"  # Directory for saving outputs

# Create the directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_PATH, exist_ok=True)
# SQLite database file passed to Optuna as the study storage.
OPTUNA_DB_PATH = os.path.join(OUTPUT_PATH, "optuna_study.db")

# === Seeder for reproducibility ===
def seed_everything(seed: int):
    """
    Seed every random number generator this script relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator
    and PyTorch (plus all CUDA devices when CUDA is available), then hands the
    same value to ``pl.seed_everything`` with ``workers=True``.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Collect the seeding callables first so they can be applied in one pass.
    seeders = [random.seed, np.random.seed, torch.manual_seed]
    if torch.cuda.is_available():
        seeders.append(torch.cuda.manual_seed_all)

    for apply_seed in seeders:
        apply_seed(seed)

    # Lightning is seeded last, exactly as before.
    pl.seed_everything(seed, workers=True)


# === Setup for BGE model ===
# Loads the tokenizer and encoder for the base embedding model.
# NOTE(review): the active training code visible in this cell works on
# precomputed embeddings and does not reference `tokenizer` or `encoder` again;
# confirm whether this (large) model load is still needed here.
MODEL_NAME = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
encoder = AutoModel.from_pretrained(MODEL_NAME)

# Put the encoder in evaluation mode (disables train-time behaviour such as
# dropout). Note that eval() does not by itself turn off gradient tracking;
# wrap calls in torch.no_grad() when gradients are not wanted.
encoder.eval()

# === Check if MPS is available (for Apple Silicon users) ===
# Only MPS and CPU are considered; a CUDA device is not selected by this line.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Move the model to the selected device
encoder = encoder.to(device)


# === Dataset Class for Data Loading (Using Precomputed Embeddings) ===
class TextDataset(Dataset):
    """
    Dataset of paired, precomputed embeddings read from a JSONL file.

    Each record must provide a ``variation_embedding`` and a
    ``description_embedding`` field. Both columns are materialised once as
    float32 NumPy matrices; individual rows are converted to tensors on access.
    """

    def __init__(self, data_file):
        """
        Read ``data_file`` (JSON Lines) and cache both embedding columns.

        Args:
            data_file: Path of the JSONL file holding the embeddings.
        """
        frame = pd.read_json(data_file, lines=True)
        self.data = frame

        self.input_embeddings = self._as_float32_matrix(frame["variation_embedding"])
        self.description_embeddings = self._as_float32_matrix(frame["description_embedding"])

    @staticmethod
    def _as_float32_matrix(column):
        """Turn a column of per-row lists into a single float32 NumPy array."""
        return np.array(column.tolist(), dtype=np.float32)

    def __len__(self):
        """Number of embedding pairs in the dataset."""
        return len(self.input_embeddings)

    def __getitem__(self, idx):
        """
        Return the ``(input_embedding, description_embedding)`` pair at ``idx``.

        Both items are fresh float32 tensors that do not require gradients.
        """
        rows = (self.input_embeddings[idx], self.description_embeddings[idx])
        return tuple(
            torch.tensor(row, dtype=torch.float32, requires_grad=False)
            for row in rows
        )

# === Model Definition with Attention and Fully Connected Layer ===
class AlignmentModel(pl.LightningModule):
    """
    Shared projection head that pulls paired embeddings towards each other.

    The input (variation) embedding and the description embedding are passed
    through the same layers, ``Linear -> GELU -> Dropout -> Linear``, and the
    training objective is ``1 - mean(cosine_similarity)`` between the two
    projected vectors.

    Args:
        dropout_rate: Dropout probability applied after the hidden layer.
        hidden_size: Width of the hidden layer. Earlier revisions overwrote
            this argument with a hard-coded 1024, so the default is now 1024:
            constructing the model without arguments yields the same network
            as before, while an explicit value is finally honoured.
        num_heads: Reserved for the multi-head attention stage, which is
            currently commented out. Accepted for interface compatibility but
            not used.
        input_size: Dimensionality of the incoming embeddings; the output has
            the same size.
        lr: Learning rate handed to AdamW.

    NOTE(review): the loss only ever sees matching pairs. A projection that
    maps every input onto the same direction also drives it to zero, so a very
    small ``val_loss`` alone does not show that the embeddings stay useful;
    confirm with a retrieval-style evaluation.
    """

    def __init__(self, dropout_rate=0.2, hidden_size=1024, num_heads=8, input_size=1024, lr=1e-5):
        super().__init__()

        # Record the constructor arguments in checkpoints so a model trained
        # with a non-default hidden_size can be rebuilt when it is reloaded.
        self.save_hyperparameters()
        self.lr = lr

        # Hidden projection shared by both sides of the pair.
        self.fc = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout_rate),
        )

        # Attention stage (disabled). If it is re-enabled, hidden_size must be
        # divisible by num_heads.
        # self.attn = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads)
        # self.layer_norm = nn.LayerNorm(hidden_size)

        # Only used by the disabled attention stage; kept so the module layout
        # stays the same as before.
        self.dropout = nn.Dropout(dropout_rate)

        # Project back to the embedding size so outputs are comparable to inputs.
        self.fc_out = nn.Linear(hidden_size, input_size)

    def forward(self, input_embeddings, description_embeddings):
        """
        Project both embeddings through the shared layers.

        Args:
            input_embeddings: Tensor whose last dimension is ``input_size``.
            description_embeddings: Tensor whose last dimension is ``input_size``.

        Returns:
            Tuple ``(projected_input, projected_description)``; each keeps the
            shape of the corresponding argument.
        """
        hidden_input = self.fc(input_embeddings)
        hidden_description = self.fc(description_embeddings)

        # The former concatenate -> (disabled attention) -> split round trip
        # returned the two tensors unchanged, so it has been removed. Restore
        # it together with the attention layers if they are brought back.

        projected_input = self.fc_out(hidden_input)
        projected_description = self.fc_out(hidden_description)
        return projected_input, projected_description

    def _shared_step(self, batch, stage):
        """
        Compute and log the losses for one batch.

        Logs ``{stage}_loss`` (projected embeddings) and ``{stage}_raw_loss``
        (untouched embeddings, as a baseline) and returns the former.
        """
        input_embeddings, description_embeddings = batch
        projected_input, projected_description = self(input_embeddings, description_embeddings)

        loss = self.cosine_similarity_loss(projected_input, projected_description)
        self.log(f"{stage}_loss", loss, prog_bar=True)

        # Baseline: how far apart the pair is before any projection.
        raw_loss = self.cosine_similarity_loss(input_embeddings, description_embeddings)
        self.log(f"{stage}_raw_loss", raw_loss, prog_bar=True)

        return loss

    def training_step(self, batch, batch_idx):
        """Run one training batch; logs ``train_loss`` and ``train_raw_loss``."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Run one validation batch; logs ``val_loss`` and ``val_raw_loss``."""
        return self._shared_step(batch, "val")

    def cosine_similarity_loss(self, embeddings1, embeddings2):
        """
        Return ``1 - mean(cosine_similarity)`` over the last dimension.

        The result is a scalar tensor: 0 when every pair points in the same
        direction, 2 when every pair points in opposite directions.
        """
        cosine_sim = F.cosine_similarity(embeddings1, embeddings2, dim=-1)
        return 1 - cosine_sim.mean()

    def configure_optimizers(self):
        """Create the AdamW optimizer over all parameters using ``self.lr``."""
        return torch.optim.AdamW(self.parameters(), lr=self.lr)


# === Objective Function for Optuna ===
def objective(trial):
    """
    Optuna objective: train one AlignmentModel and report its validation loss.

    Samples ``batch_size``, ``dropout_rate``, ``hidden_size`` and ``num_heads``
    from the trial, trains on an 80/20 split of the precomputed-embedding
    dataset with early stopping, and returns the best ``val_loss`` seen.

    Args:
        trial: The ``optuna.trial.Trial`` supplying the hyperparameters.

    Returns:
        The lowest ``val_loss`` recorded by the checkpoint callback, as a float.
    """
    batch_size = trial.suggest_int("batch_size", 8, 64, step=8)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5, step=0.1)
    hidden_size = trial.suggest_int("hidden_size", 256, 1024, step=256)
    # NOTE(review): num_heads is sampled and forwarded, but the attention layer
    # in AlignmentModel is commented out, so it currently has no effect.
    num_heads = trial.suggest_int("num_heads", 2, 8, step=2)

    # Initialize model with the sampled hyperparameters
    model = AlignmentModel(dropout_rate=dropout_rate, hidden_size=hidden_size, num_heads=num_heads)

    # Load the precomputed embeddings directly from the JSONL file
    data_file = "data/us_gaap_tags_with_variations_and_embeddings.jsonl"
    full_dataset = TextDataset(data_file)

    # 80% training / 20% validation. The split uses its own fixed-seed
    # generator so every trial is scored on the same validation subset;
    # otherwise the val_loss values of different trials are not comparable.
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    split_generator = torch.Generator().manual_seed(42)
    train_dataset, val_dataset = torch.utils.data.random_split(
        full_dataset, [train_size, val_size], generator=split_generator
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Give each trial its own checkpoint directory so checkpoints from
    # different trials do not pile up under the same "best_model" name.
    trial_dir = os.path.join(OUTPUT_PATH, trial.study.study_name, f"trial_{trial.number}")

    # Callbacks for early stopping and model checkpointing
    early_stop_callback = EarlyStopping(monitor="val_loss", patience=3, verbose=True, mode="min")
    model_checkpoint = ModelCheckpoint(
        dirpath=trial_dir,
        filename="best_model",
        monitor="val_loss",
        mode="min",
        save_top_k=1,
        verbose=True
    )

    # TensorBoard logs go under OUTPUT_PATH/tb_logs
    logger = TensorBoardLogger(OUTPUT_PATH, name="tb_logs")

    trainer = pl.Trainer(
        max_epochs=200,
        callbacks=[early_stop_callback, model_checkpoint],
        logger=logger,
        accelerator="auto",
        devices=1,
        gradient_clip_val=1.0  # Optional, set as needed
    )

    trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

    # Remember where this trial's best checkpoint lives.
    trial.set_user_attr("best_model_path", model_checkpoint.best_model_path)

    # Report the best validation loss, not the last one: with early stopping
    # the final epochs are by construction no better than the best epoch.
    best_score = model_checkpoint.best_model_score
    if best_score is not None:
        return best_score.item()
    return trainer.callback_metrics["val_loss"].item()

# === Run Optuna Optimization ===
if __name__ == "__main__":
    seed_everything(42)  # Seed Python, NumPy, PyTorch and Lightning

    # Create (or resume) the study backed by the SQLite file.
    # - load_if_exists=True only has an effect when study_name is given;
    #   without a name every run starts a brand-new "no-name-..." study.
    # - The sampler has its own random state and is not covered by
    #   seed_everything(), so it is seeded explicitly.
    study = optuna.create_study(
        study_name="gaap_alignment_hpo",
        direction="minimize",
        storage=f"sqlite:///{OPTUNA_DB_PATH}",
        load_if_exists=True,
        sampler=optuna.samplers.TPESampler(seed=42),
    )

    # Run the hyperparameter search; when resuming, these trials are added to
    # the ones already stored in the study.
    study.optimize(objective, n_trials=20)

    # Print the best hyperparameters found during the study
    print(f"Best Hyperparameters: {study.best_params}")

Using device: mps


Seed set to 42
[I 2025-04-07 20:23:30,571] A new study created in RDB with name: no-name-d877c87f-3d29-4b04-aaf1-dbe3cccc36bb
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Volumes/2TB Storage Vault/rust-sec-fetcher/python/venv/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier exists and is not empty.

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | fc      | Sequential | 1.0 M  | train
1 | dropout | Dropout    | 0      | train
2 | fc_out  | Linear     | 1.0 M  | train
-----------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.397     Total estimated model params size (MB)
6         Modules in train mode
0         Modules in eval mode


Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/Volumes/2TB Storage Vault/rust-sec-fetcher/python/venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Volumes/2TB Storage Vault/rust-sec-fetcher/python/venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Training: |                                               | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.003
Epoch 0, global step 484: 'val_loss' reached 0.00348 (best 0.00348), saving model to '/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier/best_model.ckpt' as top 1


Validation: |                                             | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: 0.001
Epoch 1, global step 968: 'val_loss' reached 0.00145 (best 0.00145), saving model to '/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier/best_model.ckpt' as top 1


Validation: |                                             | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.001
Epoch 2, global step 1452: 'val_loss' reached 0.00084 (best 0.00084), saving model to '/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier/best_model.ckpt' as top 1


Validation: |                                             | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.001
Epoch 3, global step 1936: 'val_loss' reached 0.00055 (best 0.00055), saving model to '/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier/best_model.ckpt' as top 1


Validation: |                                             | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Epoch 4, global step 2420: 'val_loss' reached 0.00039 (best 0.00039), saving model to '/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier/best_model.ckpt' as top 1


Validation: |                                             | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Epoch 5, global step 2904: 'val_loss' reached 0.00028 (best 0.00028), saving model to '/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier/best_model.ckpt' as top 1


Validation: |                                             | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Epoch 6, global step 3388: 'val_loss' reached 0.00021 (best 0.00021), saving model to '/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier/best_model.ckpt' as top 1


Validation: |                                             | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Epoch 7, global step 3872: 'val_loss' reached 0.00016 (best 0.00016), saving model to '/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier/best_model.ckpt' as top 1


Validation: |                                             | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Epoch 8, global step 4356: 'val_loss' reached 0.00012 (best 0.00012), saving model to '/Volumes/2TB Storage Vault/rust-sec-fetcher/python/data/fine_tuned_gaap_classifier/best_model.ckpt' as top 1



Detected KeyboardInterrupt, attempting graceful shutdown ...
[W 2025-04-07 20:24:15,498] Trial 0 failed with parameters: {'batch_size': 48, 'dropout_rate': 0.4, 'hidden_size': 256, 'num_heads': 8} because of the following error: NameError("name 'exit' is not defined").
Traceback (most recent call last):
  File "/Volumes/2TB Storage Vault/rust-sec-fetcher/python/venv/lib/python3.12/site-packages/pytorch_lightning/trainer/call.py", line 48, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Volumes/2TB Storage Vault/rust-sec-fetcher/python/venv/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/Volumes/2TB Storage Vault/rust-sec-fetcher/python/venv/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
    results = self._run_stage()
              ^^^^^^^^^^^^^^^^^
  File "/Volumes/2TB Storage Vault/r

NameError: name 'exit' is not defined

In [None]:
# # TODO: Refactor as necessary

# import torch
# import pytorch_lightning as pl
# from torch import nn
# from torch.utils.data import Dataset, DataLoader
# from transformers import AutoTokenizer, AutoModel
# import optuna
# import pandas as pd
# from sklearn.metrics.pairwise import cosine_similarity
# import random
# import numpy as np
# from tqdm import tqdm
# from utils.pytorch import seed_everything, get_device

# # === Setup for BGE model ===
# MODEL_NAME = "BAAI/bge-large-en-v1.5"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# device = get_device()
# encoder = AutoModel.from_pretrained(MODEL_NAME)
# encoder = encoder.to(device)


# # Cache embeddings for each text individually
# embedding_cache = {}

# def generate_embeddings(texts):
#     embeddings = []
#     for text in texts:
#         if text in embedding_cache:  # Check if the embedding is cached
#             embeddings.append(embedding_cache[text])
#         else:
#             inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
#             with torch.no_grad():
#                 outputs = encoder(**inputs)
#             text_embedding = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token embedding
#             embedding_cache[text] = text_embedding  # Cache the embedding
#             embeddings.append(text_embedding)
#     return torch.stack(embeddings)

# # Cosine Similarity Loss
# def cosine_similarity_loss(embeddings1, embeddings2):
#     cosine_sim = cosine_similarity(embeddings1.cpu().numpy(), embeddings2.cpu().numpy())
#     return torch.tensor(1 - cosine_sim.mean(), device=device)

# # === Model Definition with Attention and Fully Connected Layer ===
# class AlignmentModel(pl.LightningModule):
#     def __init__(self, dropout_rate=0.2, hidden_size=256, num_heads=2):
#         super(AlignmentModel, self).__init__()

#         # Fully connected layer to transform embeddings
#         self.fc = nn.Sequential(
#             nn.Linear(encoder.config.hidden_size, hidden_size),
#             nn.ReLU(),  # Non-linearity
#             nn.Dropout(dropout_rate)
#         )
        
#         # Attention layer that operates on combined embeddings
#         self.attn = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads)
        
#         # Layer normalization after attention
#         self.layer_norm = nn.LayerNorm(hidden_size)
        
#         # Output layer to restore embedding size (optional)
#         self.fc_out = nn.Linear(hidden_size, encoder.config.hidden_size)
        
#         # Dropout after attention
#         self.dropout = nn.Dropout(dropout_rate)

#     def forward(self, input_texts, description_texts):
#         # Generate embeddings for input and description texts
#         input_embeddings = generate_embeddings(input_texts)
#         description_embeddings = generate_embeddings(description_texts)
        
#         # Apply dropout to the embeddings
#         input_embeddings = self.fc(input_embeddings)
#         description_embeddings = self.fc(description_embeddings)
        
#         # Concatenate the embeddings to allow attention to focus on the relationships
#         combined_embeddings = torch.cat((input_embeddings, description_embeddings), dim=0)
        
#         # Apply attention to focus on the relationships between the input and description embeddings
#         attn_output, _ = self.attn(combined_embeddings, combined_embeddings, combined_embeddings)
        
#         # Residual connection after attention (adding the original embeddings back)
#         attn_output = attn_output + combined_embeddings
        
#         # Apply layer normalization after attention
#         attn_output = self.layer_norm(attn_output)
        
#         # Apply dropout to the attention output
#         attn_output = self.dropout(attn_output)
        
#         # Split the output back into two sets: input and description embeddings
#         input_embeddings, description_embeddings = torch.split(attn_output, input_embeddings.size(0), dim=0)
        
#         # Output layer to adjust embedding size
#         input_embeddings = self.fc_out(input_embeddings)
#         description_embeddings = self.fc_out(description_embeddings)

#         return input_embeddings, description_embeddings

#     def training_step(self, batch, batch_idx):
#         input_texts, description_texts = batch
#         input_embeddings, description_embeddings = self(input_texts, description_texts)
        
#         # Calculate cosine similarity loss
#         loss = cosine_similarity_loss(input_embeddings, description_embeddings)
        
#         self.log("train_loss", loss, prog_bar=True)
#         return loss

#     def validation_step(self, batch, batch_idx):
#         input_texts, description_texts = batch
#         input_embeddings, description_embeddings = self(input_texts, description_texts)
        
#         # Calculate cosine similarity loss for validation
#         loss = cosine_similarity_loss(input_embeddings, description_embeddings)
        
#         self.log("val_loss", loss, prog_bar=True)
#         return loss

#     def configure_optimizers(self):
#         return torch.optim.AdamW(self.parameters(), lr=1e-5)

# # === Data Loading ===
# def load_data(file_path):
#     df = pd.read_json(file_path, lines=True)
#     return df

# # === Objective Function for Optuna ===
# def objective(trial):
#     batch_size = trial.suggest_int("batch_size", 8, 64, step=8)
#     dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5, step=0.1)
#     hidden_size = trial.suggest_int("hidden_size", 256, 1024, step=256)
#     num_heads = trial.suggest_int("num_heads", 2, 8, step=2)

#     # Initialize model with hyperparameters
#     model = AlignmentModel(dropout_rate=dropout_rate, hidden_size=hidden_size, num_heads=num_heads)

#     # Load datasets (Assume the datasets are already loaded into `train_data` and `val_data`)
#     train_data = load_data("data/train.jsonl")  # Replace with your train dataset path
#     val_data = load_data("data/val.jsonl")  # Replace with your val dataset path

#     train_texts = list(train_data["input_text"])
#     train_descriptions = list(train_data["us_gaap_description"])
#     val_texts = list(val_data["input_text"])
#     val_descriptions = list(val_data["us_gaap_description"])

#     # Create DataLoader for training and validation
#     train_dataset = TextDataset(train_texts, train_descriptions)
#     val_dataset = TextDataset(val_texts, val_descriptions)
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

#     # Trainer setup for Optuna
#     trainer = pl.Trainer(max_epochs=5, gpus=1, progress_bar_refresh_rate=20)
#     trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

#     return trainer.callback_metrics["val_loss"].item()

# # === Run Optuna Optimization ===
# if __name__ == "__main__":
#     # Start the Optuna study to optimize hyperparameters
#     study = optuna.create_study(direction="minimize")
#     study.optimize(objective, n_trials=20)
    
#     print(f"Best Hyperparameters: {study.best_params}")

In [None]:
# import torch

# class FocalLoss(torch.nn.Module):
#     def __init__(self, alpha=1, gamma=2, reduction='mean'):
#         super(FocalLoss, self).__init__()
#         self.alpha = alpha  # Balancing factor
#         self.gamma = gamma  # Focusing parameter
#         self.reduction = reduction

#     def forward(self, inputs, targets):
#         # Sigmoid activation for multi-label classification (logits to probabilities)
#         inputs = torch.sigmoid(inputs)
        
#         # Ensure the targets are in the same shape as inputs
#         targets = targets.float()

#         # Cross entropy part
#         cross_entropy = -targets * torch.log(inputs + 1e-8) - (1 - targets) * torch.log(1 - inputs + 1e-8)

#         # Focal loss
#         focal_loss = self.alpha * (1 - inputs) ** self.gamma * cross_entropy

#         # Reduce (mean or sum)
#         if self.reduction == 'mean':
#             return torch.mean(focal_loss)
#         elif self.reduction == 'sum':
#             return torch.sum(focal_loss)
#         else:
#             return focal_loss


In [None]:
# import torch

# class DiceLoss(torch.nn.Module):
#     def __init__(self, smooth=1e-6):
#         super(DiceLoss, self).__init__()
#         self.smooth = smooth

#     def forward(self, preds, target):
#         # Flatten the input and target tensors
#         preds = preds.view(-1)
#         target = target.view(-1)

#         intersection = (preds * target).sum()
#         union = preds.sum() + target.sum()

#         # Dice coefficient (with smoothing)
#         dice_coeff = (2. * intersection + self.smooth) / (union + self.smooth)

#         return 1 - dice_coeff


In [None]:
# import random
# import os
# import json
# import torch
# import optuna
# import numpy as np
# from torch.utils.data import Dataset, DataLoader
# from transformers import AutoTokenizer, AutoModel
# import pytorch_lightning as pl
# from pytorch_lightning.callbacks import EarlyStopping
# from pytorch_lightning.loggers import TensorBoardLogger
# from sklearn.metrics.pairwise import cosine_similarity


# # === SEED ===
# SEED = 42
# def seed_everything(seed: int):
#     """
#     This function sets the seed for various libraries to ensure reproducibility.
#     It seeds Python's built-in random module, NumPy, PyTorch (CPU and GPU), and PyTorch Lightning.
#     """
#     random.seed(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     if torch.cuda.is_available():
#         torch.cuda.manual_seed_all(seed)
#     pl.seed_everything(seed, workers=True)

# # Ensure it's called!
# seed_everything(SEED)


# # === CONFIG ===
# TRAIN_JSONL_PATH = "data/train.jsonl"  # Path to your training dataset
# VAL_JSONL_PATH = "data/val.jsonl"  # Path to your validation dataset
# MODEL_NAME = "BAAI/bge-large-en-v1.5"  # Base model
# OUTPUT_PATH = "data/fine_tuned_gaap_classifier"  # Directory for saving outputs
# os.makedirs(OUTPUT_PATH, exist_ok=True)
# OPTUNA_DB_PATH = os.path.join(OUTPUT_PATH, "optuna_study.db")
# EPOCHS = 10
# PATIENCE = 5

# # Define the device (MPS or CPU)
# device = "mps" if torch.backends.mps.is_available() else "cpu"
# print(f"Using device: {device}")


# # === Load Data from JSONL files ===
# def load_jsonl(filepath):
#     with open(filepath, "r") as f:
#         return [json.loads(line) for line in f]

# train_data = load_jsonl(TRAIN_JSONL_PATH)
# val_data = load_jsonl(VAL_JSONL_PATH)

# # === Dynamically determine the number of possible categories ===
# all_categories = set()
# for entry in train_data + val_data:
#     all_categories.update(entry["labels"])

# num_labels = max(all_categories)  # Dynamically find the highest category label number
# print(f"Number of categories: {num_labels}")


# # === Initialize Tokenizer and Encoder ===
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# encoder = AutoModel.from_pretrained(MODEL_NAME).to(device)


# # === Prepare Label Embeddings ===
# def cache_label_embeddings(label_descriptions, model_name="BAAI/bge-large-en-v1.5", save_path="data/label_embeddings.pt"):
#     if os.path.exists(save_path):
#         print("Loading cached label embeddings...")
#         return torch.load(save_path)
    
#     print("Generating new label embeddings...")
#     label_embeddings = {}
#     for label_id, label_text in label_descriptions.items():
#         label_embeddings[label_id] = generate_embeddings([label_text])[0]  # Generate embedding for each label
    
#     label_embeddings_tensor = torch.stack(list(label_embeddings.values()))
#     torch.save(label_embeddings_tensor, save_path)
#     print(f"Label embeddings saved to {save_path}")
    
#     return label_embeddings_tensor


# def generate_embeddings(texts):
#     """
#     This function generates embeddings for a given list of text descriptions.
#     It uses the base model to generate the embeddings, specifically the [CLS] token representation.
#     """
#     if isinstance(texts, str):
#         texts = [texts]  # Convert single string to a list of strings
#     elif not isinstance(texts, list):
#         raise ValueError("Input must be a string or a list of strings.")
    
#     texts = [str(text) if not isinstance(text, str) else text for text in texts]
#     inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
#     with torch.no_grad():
#         outputs = encoder(**inputs)
    
#     embeddings = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token embedding
#     return embeddings


# # === Dataset Class ===
# class MultiLabelDataset(Dataset):
#     def __init__(self, data, tokenizer, encoder):
#         self.samples = []
#         self.tokenizer = tokenizer
#         self.encoder = encoder
#         for d in data:
#             input_text = d["input_text"]
#             if not isinstance(input_text, str):
#                 input_text = str(input_text)
#             self.samples.append(input_text)

#     def __len__(self):
#         return len(self.samples)

#     def __getitem__(self, idx):
#         text = self.samples[idx]
#         if not isinstance(text, str):
#             text = str(text)
        
#         inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(self.encoder.device)
#         with torch.no_grad():
#             outputs = self.encoder(**inputs)
#         embedding = outputs.last_hidden_state[:, 0, :]
#         embedding.requires_grad_()  # Make sure gradients are tracked for embeddings
#         return embedding


# # === GAAP Classifier ===
# class GAAPClassifier(pl.LightningModule):
#     def __init__(self, model_name, dropout_rate, batch_size, lr, gradient_clip, weight_decay, label_embeddings, label_descriptions):
#         super().__init__()
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.encoder = AutoModel.from_pretrained(model_name).to(device)
#         self.label_embeddings = label_embeddings
#         self.label_descriptions = label_descriptions
#         self.batch_size = batch_size
#         self.lr = lr
#         self.gradient_clip = gradient_clip
#         self.weight_decay = weight_decay
#         self.save_hyperparameters()

#         # Enable gradient tracking for model parameters
#         for param in self.encoder.parameters():
#             param.requires_grad = True  # Ensure all parameters in the encoder require gradients

#     def forward(self, inputs):
#         outputs = self.encoder(**inputs)
#         embeddings = outputs.last_hidden_state[:, 0, :]
#         embeddings.requires_grad_()  # Ensure gradients are tracked for embeddings
#         return embeddings

#     def compute_loss(self, outputs, labels):
#         outputs = outputs.detach().cpu().numpy()
#         labels = self.label_embeddings.detach().cpu().numpy()
#         similarities = cosine_similarity(outputs, labels)
#         loss = 1 - similarities.mean()
#         return torch.tensor(loss, device=self.device)

#     def training_step(self, batch, batch_idx):
#         texts = batch
#         if isinstance(texts, torch.Tensor):
#             texts = [str(t) for t in texts]
#         inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(self.device)
#         outputs = self(inputs)
#         loss = self.compute_loss(outputs, self.label_embeddings)
#         self.log("train/loss", loss, prog_bar=True)
#         return loss

#     def validation_step(self, batch, batch_idx):
#         texts = batch
#         if isinstance(texts, torch.Tensor):
#             texts = [str(t) for t in texts]
#         inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(self.device)
#         outputs = self(inputs)
#         loss = self.compute_loss(outputs, self.label_embeddings)
#         self.log("val/loss", loss, prog_bar=True)
#         return loss
    
#     def configure_optimizers(self):
#         return torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)


# # === Cache Label Embeddings ===
# label_descriptions = {label: f"Description for label {label}" for label in all_categories}
# label_embeddings_tensor = cache_label_embeddings(label_descriptions)


# # === Training Setup ===
# def objective(trial):
#     batch_size = trial.suggest_int("batch_size", 8, 64, step=8)
#     lr = trial.suggest_float("lr", 1e-6, 1e-3, log=True)
#     dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5, step=0.1)
#     gradient_clip = trial.suggest_float("gradient_clip", 0.0, 0.8, step=0.1)
#     weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-4, log=True)

#     train_loader = DataLoader(MultiLabelDataset(train_data, tokenizer, encoder), batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(MultiLabelDataset(val_data, tokenizer, encoder), batch_size=batch_size, shuffle=False)

#     model = GAAPClassifier(MODEL_NAME, dropout_rate, batch_size, lr, gradient_clip, weight_decay, label_embeddings_tensor, label_descriptions)

#     trainer = pl.Trainer(
#         max_epochs=EPOCHS,
#         callbacks=[EarlyStopping(monitor="val/loss", patience=PATIENCE)],
#         logger=TensorBoardLogger(OUTPUT_PATH),
#         accelerator="auto",
#         devices=1,
#         gradient_clip_val=gradient_clip
#     )

#     trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
#     return trainer.callback_metrics["val/loss"].item()

# # === Optuna Optimization ===
# study = optuna.create_study(direction="minimize", storage=f"sqlite:///{OPTUNA_DB_PATH}", load_if_exists=True)
# study.optimize(objective, n_trials=50)

# # Best Params
# print("Best params:", study.best_params)
# best_trial = study.best_trial
# print(f"Best trial value: {best_trial.value}")
# for k, v in best_trial.params.items():
#     print(f"    {k}: {v}")
