In [None]:
# Clone the project repository and switch the notebook's working directory into it.
!git clone https://github.com/krislette/bach-or-bot.git
%cd bach-or-bot

In [None]:
 pip install  torch llm2vec librosa pandas soundfile torchaudio peft timm pyyaml torchao

In [None]:
# Mount Google Drive under /content/drive; the dataset paths used later in this
# notebook live beneath that mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import pandas as pd

# Location of the dataset index CSV on the mounted Drive.
csv_path = "/content/drive/MyDrive/data/external/1k_dataset.csv"
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: The file was not found at {csv_path}")
    # Re-raise: later cells need `df`, so continuing would either fail with an
    # unrelated NameError or silently reuse a stale `df` from an earlier run.
    raise
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")
    raise
else:
    # Only report success (and preview the frame) when the read really worked.
    print("CSV file loaded successfully.")
    display(df.head())

In [None]:
import os

# Base directory on the mounted Drive that holds the raw data folders ('01' to '05').
# Entries from the CSV's 'directory' column are joined onto this path.
base_directory = '/content/drive/MyDrive/data/raw'

def check_file_existence(directory):
    """Report whether `directory`, joined onto `base_directory`, exists on disk.

    Args:
        directory: Path fragment appended to the module-level `base_directory`.

    Returns:
        bool: True when the joined path exists, False otherwise.
    """
    return os.path.exists(os.path.join(base_directory, directory))

# Flag every listed entry with whether its path exists under the base directory.
df['file_exists'] = df['directory'].apply(check_file_existence)

# Tally found vs. missing entries.
files_found = df['file_exists'].sum()
files_missing = len(df) - files_found

# Summary, one figure per line.
print(
    f"Total files listed: {len(df)}",
    f"Files found: {files_found}",
    f"Files missing: {files_missing}",
    sep="\n",
)

# Show the offending rows only when something is actually missing.
if files_missing > 0:
    print("\nMissing files:")
    display(df.loc[~df['file_exists']])

In [None]:
from llm2vec import LLM2Vec
from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import PeftModel
from torchao.quantization import quantize_, Int8WeightOnlyConfig
import torch

import os

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# instead of being embedded in the notebook, so the secret never ends up in the
# saved file or its history. When the variable is unset, None is passed and the
# library falls back to its default authentication handling.
access_token = os.environ.get("HF_TOKEN")

model_id = "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp"

# The same token is passed to every hub download for consistency.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    padding=True,
    truncation=True,
    max_length=512,
    token=access_token,
)
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True, token=access_token)

if torch.cuda.is_available():
    print("Using GPU")
    # GPU path: use bf16 for speed
    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=config,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
        token=access_token,
    )
else:
    print("Using CPU")
    # CPU path: load in float32 first, then quantize
    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=config,
        torch_dtype=torch.float32,   # quantization requires fp32
        device_map="cpu",
        token=access_token,
    )

    try:
        # NOTE(review): torchao is also imported at the top of this cell, so a
        # missing install fails there first; this fallback only becomes
        # reachable if that top-level import is made optional.
        from torchao.quantization import quantize_, Int8WeightOnlyConfig
        print("[INFO] Applying torchao quantization for CPU...")
        quant_config = Int8WeightOnlyConfig(group_size=None)
        print("[INFO] Applying torchao quantization with Int8WeightOnlyConfig...")
        quantize_(model, quant_config)

    except ImportError:
        print("[WARNING] torchao not installed. Run: pip install torchao")
        print("[WARNING] Falling back to non-quantized CPU model.")

# Wrap the backbone and tokenizer in LLM2Vec with mean pooling over at most 512 tokens.
l2v = LLM2Vec(model, tokenizer, pooling_mode="mean", max_length=512)


In [None]:
import sys
import os

# Make the cloned repository's src directory importable.
sys.path.append('/content/bach-or-bot/src')

# Work from the project root so the relative paths used below
# (such as "models/fusion/..." and "pca_checkpoints") resolve inside the repository.
os.chdir('/content/bach-or-bot')

# Import the necessary functions from the script
from src.preprocessing.preprocessor import dataset_read, bulk_preprocessing
from src.spectttra.spectttra_trainer import spectttra_train
from pathlib import Path
from src.utils.config_loader import DATASET_NPZ, PCA_MODEL, RAW_DATASET_NPZ
from src.utils.dataset import dataset_scaler

import numpy as np
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
import pickle
import gc
import joblib

# Initialize PCA and StandardScaler globally for training
_pca_trainer = None

# Initialize PCA and StandardScaler globally for training
_pca_trainer = None

class SimplePCATrainer:
    """
    A simple PCA trainer that uses IncrementalPCA to fit data in batches.
    It saves checkpoints every 5 batches and can save the final model.

    The first batch passed to ``process_batch`` fits the StandardScaler and
    fixes the number of PCA components; later batches reuse that scaler and
    update the PCA incrementally.

    Attributes:
        pca: The IncrementalPCA model (None until the first batch is processed).
        scaler: StandardScaler for normalizing data (fitted on the first batch only).
        fitted: Boolean indicating if the model has been initialized.
        batch_count_pca: Counter for the number of batches processed.

    Methods:
        process_batch(vectors): Processes a batch of vectors, fits the PCA model incrementally.
        save_final(model_path): Saves the final PCA model to the specified path.
    """

    # Initialize the trainer
    def __init__(self):
        self.pca = None
        self.scaler = StandardScaler()
        self.fitted = False
        self.batch_count_pca = 0

    def _determine_optimal_components(self, vectors):
        """
        Determine the number of PCA components needed to retain 95% variance.

        The result is capped at half the feature count and is never below 1.

        Args:
            vectors: The input data to analyze (2-D, samples x features).
        Returns:
            n_components: The chosen number of components as a plain int.
        """
        temp_pca = IncrementalPCA()
        temp_pca.fit(vectors)
        cumsum_var = np.cumsum(temp_pca.explained_variance_ratio_)

        reached = cumsum_var >= 0.95
        if reached.any():
            # argmax on a boolean array gives the first index where the threshold is met.
            n_comp_95 = int(np.argmax(reached)) + 1
        else:
            # Threshold never reached: keep every available component instead of
            # letting argmax's 0 silently collapse the model to one component.
            n_comp_95 = len(cumsum_var)

        # Guard the cap so single-feature input does not yield 0 components.
        max_components = max(1, vectors.shape[1] // 2)
        return min(n_comp_95, max_components)

    def process_batch(self, vectors):
        """
        Process a batch of vectors, fitting the PCA model incrementally.

        Args:
            vectors: The input data batch to process.
        Returns:
            reduced_vectors: The PCA-transformed data.

        Note: This method saves a checkpoint every 5 batches.
        """
        if not self.fitted:
            # First batch - initialize everything. The scaler is fitted first so
            # the component count is chosen on the same scaled data that the
            # PCA itself is fitted on.
            self.scaler.fit(vectors)
            vectors_scaled = self.scaler.transform(vectors)
            n_components = self._determine_optimal_components(vectors_scaled)
            self.pca = IncrementalPCA(n_components=n_components, batch_size=500)
            self.fitted = True
            print(f"Initialized PCA with {n_components} components")
        else:
            vectors_scaled = self.scaler.transform(vectors)

        # Process batch
        self.pca.partial_fit(vectors_scaled)
        reduced_vectors = self.pca.transform(vectors_scaled)

        self.batch_count_pca += 1

        # Save checkpoint every 5 batches
        if self.batch_count_pca % 5 == 0:
            os.makedirs("pca_checkpoints", exist_ok=True)
            with open(f"pca_checkpoints/checkpoint_batch_{self.batch_count_pca}.pkl", 'wb') as f:
                pickle.dump({'pca': self.pca, 'scaler': self.scaler}, f)
            print(f"Saved checkpoint at batch {self.batch_count_pca}")

        print(f"Processed batch {self.batch_count_pca}, shape: {vectors.shape} -> {reduced_vectors.shape}")
        return reduced_vectors

    def save_final(self, model_path):
        """
        Save the final PCA model to the specified path.

        The file contains a pickled dict with the keys 'pca' and 'scaler'.

        Args:
            model_path: The file path to save the PCA model.

        Returns:
            None

        Raises:
            RuntimeError: If no batch has been processed yet (nothing to save).

        Note: Change the model path as needed in the data_config.yml file.
        """
        if self.pca is None:
            # Fail before touching the filesystem instead of writing an empty
            # model and then crashing on the variance summary.
            raise RuntimeError("PCA model has not been fitted; call process_batch() before save_final().")

        # A bare file name has an empty dirname, and os.makedirs("") raises.
        target_dir = os.path.dirname(model_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with open(model_path, 'wb') as f:
            pickle.dump({'pca': self.pca, 'scaler': self.scaler}, f)
        print(f"Final model saved to {model_path}. Total variance explained: {np.sum(self.pca.explained_variance_ratio_):.4f}")

## For Single Input
def load_pca_model(vectors, model_path="models/fusion/pca.pkl"):
    """
    Load a pre-trained PCA model and transform the input vectors.

    Two on-disk layouts are accepted:
      * a dict with the keys 'pca' and 'scaler', as written by
        SimplePCATrainer.save_final - the scaler (when present) is applied
        before the PCA;
      * a bare estimator exposing ``transform``, as dumped by train_pipeline.

    Args:
        vectors: The input data to transform.
        model_path: The file path of the pre-trained PCA model.

    Returns:
        output: The PCA-transformed data.

    Note: Change the model path as needed in the data_config.yml file (or set the path file as shown above). Can be used for the main program.
    """
    model_path = Path(model_path)
    # NOTE: joblib.load unpickles arbitrary objects - only load files you trust.
    loaded = joblib.load(model_path)

    if isinstance(loaded, dict):
        # Dict payload: replay the same scale-then-project order used in training.
        scaler = loaded.get('scaler')
        if scaler is not None:
            vectors = scaler.transform(vectors)
        return loaded['pca'].transform(vectors)

    return loaded.transform(vectors)

def l2vec_single_train(l2v, lyrics):
    """
    Embed one lyric string with the given LLM2Vec model.

    The string is wrapped in a one-element list before encoding, and the
    encoder's output is detached, moved to the CPU and converted to NumPy.

    Args:
        l2v: The LLM2Vec model used for encoding.
        lyrics: The lyric text to encode.

    Returns:
        vectors: NumPy array produced from the encoder output for this lyric.
    """
    embedding = l2v.encode([lyrics])
    return embedding.detach().cpu().numpy()

# For Batch Processing
def l2vec_train(l2v, lyrics_list):
    """
    Embed a batch of lyric strings with the given LLM2Vec model.

    Encoding runs with gradient tracking disabled.

    Args:
        l2v: The LLM2Vec model used for encoding.
        lyrics_list: List of lyric strings to encode.
    Returns:
        vectors: Whatever ``l2v.encode`` returns for the batch.

    Note: Only encoding happens here; no PCA reduction is applied. The PCA reduction can be applied separately in the train.py module.
    """
    with torch.no_grad():
        return l2v.encode(lyrics_list)

def train_pipeline():
    """
    Build (or load) the fused audio + lyrics training dataset.

    If the file at DATASET_NPZ already exists it is simply loaded. Otherwise
    the dataset is read in batches, SpecTTTra audio features and LLM2Vec lyric
    features are extracted per batch, the lyric features are reduced to 256
    dimensions with IncrementalPCA, and the concatenated features are saved
    to DATASET_NPZ in .npz format.

    Side effects on the processing path:
      * unscaled features and labels are written to RAW_DATASET_NPZ;
      * the fitted IncrementalPCA is written to "models/fusion/incremental_pca.pkl".

    The module-level ``l2v`` LLM2Vec instance is used for lyric encoding.

    Parameters
    ----------
    None

    Returns
    -------
    None
    """

    # Instantiate X and Y vectors
    X, Y = None, None

    dataset_path = Path(DATASET_NPZ)

    if dataset_path.exists():
        print("Training dataset already exists. Loading file...")

        loaded_data = np.load(DATASET_NPZ)
        X = loaded_data["X"]
        Y = loaded_data["Y"]
    else:
        print("Training dataset does not exist. Processing data...")
        # Get batches from dataset and return full Y labels
        batches, Y = dataset_read(batch_size=2)
        batch_count = 1

        # Use the LLM2Vec instance created earlier in the notebook
        llm2vec_model = l2v

        # Preallocate spaces for both audio and lyric vectors to reduce memory overhead
        audio_vectors = np.zeros((len(Y), 384), dtype=np.float32)
        lyric_vectors = np.zeros((len(Y), 2048), dtype=np.float32)

        start_idx = 0
        for batch in batches:
            print(f"Bulk Preprocessing - Batch {batch_count}.")
            audio, lyrics = bulk_preprocessing(batch, batch_count)
            batch_count += 1

            # Call the train methods for both SpecTTTra and LLM2Vec
            print("Starting SpecTTTra feature extraction...")
            audio_features = spectttra_train(audio)

            print("Starting LLM2Vec feature extraction...")
            lyrics_features = l2vec_train(llm2vec_model, lyrics)

            batch_size = audio_features.shape[0]
            end_idx = start_idx + batch_size

            # Store the results on preallocated spaces
            audio_vectors[start_idx:end_idx, :] = audio_features
            lyric_vectors[start_idx:end_idx, :] = lyrics_features

            # Advance the write cursor; without this every batch would
            # overwrite the first rows and the rest would stay zero.
            start_idx = end_idx

            # Delete stored instance for next batch to remove overhead
            del audio, lyrics, audio_features, lyrics_features

        if start_idx != len(Y):
            # Rows beyond start_idx were never written and no longer line up with Y.
            print(f"[WARNING] Extracted features for {start_idx} samples but {len(Y)} labels were returned; remaining rows are zero-filled.")

        # Save the raw (unscaled) audio/lyric features and labels for later reuse
        print("Saving dataset for future testing...")
        np.savez(RAW_DATASET_NPZ, audio=audio_vectors, lyrics=lyric_vectors, labels=Y)

        # Run standard scaling on audio and lyrics separately
        print("Running standard scaling for audio and lyrics...")
        audio_vectors, lyric_vectors = dataset_scaler(audio_vectors, lyric_vectors)

        # Run PCA in chunks to keep memory usage bounded
        ipca = IncrementalPCA(n_components=256)
        pca_batch_size = 1000  # Adjust depending on memory

        # Fit IPCA in batches
        for i in range(0, lyric_vectors.shape[0], pca_batch_size):
            ipca.partial_fit(lyric_vectors[i:i + pca_batch_size])

        # Transform in batches
        lyric_vectors_reduced = np.zeros((lyric_vectors.shape[0], 256), dtype=np.float32)
        for i in range(0, lyric_vectors.shape[0], pca_batch_size):
            lyric_vectors_reduced[i:i + pca_batch_size, :] = ipca.transform(lyric_vectors[i:i + pca_batch_size])

        # Save IncrementalPCA model (create the folder first so the dump cannot
        # fail on a missing directory after all the expensive work above)
        Path("models/fusion").mkdir(parents=True, exist_ok=True)
        joblib.dump(ipca, "models/fusion/incremental_pca.pkl")
        lyric_vectors = lyric_vectors_reduced

        # Re-scale the PCA-reduced lyrics; the audio result of this call is discarded
        print("Running standard scaling for audio and lyrics...")
        _, lyric_vectors = dataset_scaler(audio_vectors, lyric_vectors)

        # Concatenate audio features and reduced lyrics features
        X = np.concatenate([audio_vectors, lyric_vectors], axis=1)
        print(f"Audio and Lyrics Concatenated. Final features shape: {X.shape}")

        # Convert label list into np.array
        Y = np.array(Y)

        # Save both X and Y to an .npz file for easier loading
        print("Saving dataset for future testing...")
        np.savez(DATASET_NPZ, X=X, Y=Y)

# Run the pipeline when this cell executes; the prints bracket the run in the cell output.
print("Executing train_pipeline...")
train_pipeline()
print("train_pipeline execution finished.")

In [None]:
import torch
import gc

def flush_gpu_cache():
    """Call torch.cuda.empty_cache() when CUDA is available and print the outcome."""
    # Guard clause: nothing to flush without a CUDA device.
    if not torch.cuda.is_available():
        print("No GPU available to flush cache.")
        return

    torch.cuda.empty_cache()
    print("GPU cache flushed.")

# Example usage: flush the cache once when this cell is executed.
flush_gpu_cache()