In [None]:
!pip install tensorflow>=2.13.0 transformers matplotlib seaborn scikit-learn

# Connect to my google drive
from google.colab import drive
drive.mount('/content/drive')

# copy files from google drive to project
!cp '/content/drive/MyDrive/Uni/BD/part-00000-a1634db0-8c09-4cb9-b1ca-466599ba8394-c000.snappy.parquet' '/content/'
!cp '/content/drive/MyDrive/Uni/BD/ppart-00001-a1634db0-8c09-4cb9-b1ca-466599ba8394-c000.snappy.parque' '/content/'
!cp '/content/drive/MyDrive/Uni/BD/part-00002-a1634db0-8c09-4cb9-b1ca-466599ba8394-c000.snappy.parquet' '/content/'
!cp '/content/drive/MyDrive/Uni/BD/part-00003-a1634db0-8c09-4cb9-b1ca-466599ba8394-c000.snappy.parquet' '/content/'

Mounted at /content/drive
cp: cannot stat '/content/drive/MyDrive/Uni/BD/ppart-00001-a1634db0-8c09-4cb9-b1ca-466599ba8394-c000.snappy.parque': No such file or directory


### Previous

In [None]:
"""
Deep Learning Models for Sentiment Analysis - GPU/TPU Optimized for Google Colab
Implements LSTM, CNN, and Transformer models with CUDA/TPU acceleration
Optimized for Google Colab environment with mixed precision training

Author: AI Assistant
Date: 2025-01-20
Dependencies: tensorflow>=2.13.0, transformers, pandas, numpy
Environment: Google Colab with GPU/TPU runtime
"""

import os
import numpy as np
import pandas as pd
import logging
import time
from typing import Dict, List, Tuple, Optional, Union
import json
import warnings
from pathlib import Path

# TensorFlow imports with GPU optimization
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks, optimizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.mixed_precision import LossScaleOptimizer

# Additional ML libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Optional Hugging Face Transformers support (import is guarded; see flag below)
try:
    from transformers import TFAutoModel, AutoTokenizer
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    logging.warning("Transformers library not available. Transformer models will be disabled.")

# Silence FutureWarning messages
warnings.filterwarnings('ignore', category=FutureWarning)

# Root logging setup for the notebook: INFO level, timestamped records,
# emitted through a single stream handler so they appear inline in Colab
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)

# Module-level logger used by every class and function below
logger = logging.getLogger(__name__)

class GPUEnvironmentManager:
    """
    Manages GPU/TPU environment setup and optimization for Google Colab
    Handles device detection, memory management, and mixed precision setup

    Attributes set by ``__init__``:
        device_type: 'TPU', 'GPU' or 'CPU', as returned by ``_detect_accelerator``
        strategy: the tf.distribute strategy chosen for ``device_type``
        mixed_precision_enabled: True once the mixed_float16 policy was applied
    """

    def __init__(self):
        """Initialize GPU environment manager with optimal settings."""
        self.device_type = self._detect_accelerator()
        self.strategy = self._setup_distribution_strategy()
        self.mixed_precision_enabled = False

        # Log environment info
        self._log_environment_info()

        # Setup mixed precision if GPU available
        if self.device_type == 'GPU':
            self._setup_mixed_precision()

    def _detect_accelerator(self) -> str:
        """
        Detect available accelerator (GPU/TPU/CPU) in Google Colab.

        A TPU is probed first; if resolving/initializing it fails with
        ValueError or RuntimeError, GPUs are checked, and CPU is the fallback.

        Returns:
            str: Device type ('GPU', 'TPU', or 'CPU')
        """
        try:
            # Check for TPU
            tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            logger.info("TPU detected and initialized")
            return 'TPU'
        except (ValueError, RuntimeError):
            pass

        # Check for GPU
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            logger.info(f"GPU detected: {len(gpus)} device(s)")
            # Configure GPU memory growth to prevent OOM
            for gpu in gpus:
                try:
                    tf.config.experimental.set_memory_growth(gpu, True)
                except RuntimeError as e:
                    # Memory growth can only be changed before the GPU runtime
                    # is initialized; re-running this in a live session must
                    # not abort accelerator detection.
                    logger.warning(f"Could not set memory growth for {gpu.name}: {e}")
            return 'GPU'

        logger.warning("No accelerator detected, using CPU")
        return 'CPU'

    def _setup_distribution_strategy(self):
        """
        Setup appropriate distribution strategy based on available hardware.

        Returns:
            tf.distribute.Strategy: Configured strategy for training
        """
        if self.device_type == 'TPU':
            tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
            strategy = tf.distribute.TPUStrategy(tpu)
            logger.info(f"Using TPU strategy with {strategy.num_replicas_in_sync} replicas")
        elif self.device_type == 'GPU':
            strategy = tf.distribute.MirroredStrategy()
            logger.info(f"Using MirroredStrategy with {strategy.num_replicas_in_sync} replicas")
        else:
            strategy = tf.distribute.get_strategy()  # Default strategy for CPU
            logger.info("Using default strategy (CPU)")

        return strategy

    def _setup_mixed_precision(self):
        """Setup mixed precision training for faster GPU training.

        Failure is non-fatal: a warning is logged and
        ``mixed_precision_enabled`` stays False.
        """
        try:
            # Enable mixed precision for faster training on modern GPUs
            policy = tf.keras.mixed_precision.Policy('mixed_float16')
            tf.keras.mixed_precision.set_global_policy(policy)
            self.mixed_precision_enabled = True
            logger.info("Mixed precision training enabled (float16)")
        except Exception as e:
            logger.warning(f"Could not enable mixed precision: {e}")
            self.mixed_precision_enabled = False

    def _log_environment_info(self):
        """Log comprehensive environment information for debugging."""
        logger.info("=== GPU/TPU Environment Information ===")
        logger.info(f"TensorFlow version: {tf.__version__}")
        logger.info(f"Detected accelerator: {self.device_type}")

        if self.device_type == 'GPU':
            gpus = tf.config.list_physical_devices('GPU')
            for i, gpu in enumerate(gpus):
                logger.info(f"GPU {i}: {gpu}")
                # Get GPU memory info
                try:
                    # '/physical_device:GPU:0' -> 'GPU:0'
                    memory_info = tf.config.experimental.get_memory_info(gpu.name.replace('/physical_device:', ''))
                    logger.info(f"  Memory - Current: {memory_info['current']//1024//1024}MB, "
                              f"Peak: {memory_info['peak']//1024//1024}MB")
                except Exception:
                    # Best-effort diagnostics only; never let KeyboardInterrupt
                    # or SystemExit be swallowed here.
                    logger.info("  Memory info not available")

        logger.info(f"Available logical devices: {len(tf.config.list_logical_devices())}")
        logger.info("=" * 50)

    def get_optimal_batch_size(self, base_batch_size: int = 32) -> int:
        """
        Calculate optimal batch size based on available hardware.

        Args:
            base_batch_size: Base batch size for CPU/single GPU

        Returns:
            int: Optimized batch size
                - TPU: ``max(128, base_batch_size * 8)``
                - GPU: ``base_batch_size`` times the number of replicas
                - CPU: ``max(16, base_batch_size // 2)``
        """
        if self.device_type == 'TPU':
            # TPU works best with larger batch sizes (multiple of 8)
            return max(128, base_batch_size * 8)
        elif self.device_type == 'GPU':
            # GPU can handle moderate batch sizes
            return base_batch_size * max(1, self.strategy.num_replicas_in_sync)
        else:
            # CPU - smaller batch sizes
            return max(16, base_batch_size // 2)


class OptimizedDataLoader:
    """
    Optimized data loading for Google Colab with GPU/TPU acceleration
    Handles efficient data preprocessing and batching
    """

    def __init__(self, env_manager: GPUEnvironmentManager):
        """
        Initialize data loader with environment-specific optimizations.

        Args:
            env_manager: GPU environment manager instance
        """
        self.env_manager = env_manager
        self.tokenizer = None

    def prepare_colab_data(self, df: pd.DataFrame,
                          text_col: str = 'text',
                          label_col: str = 'sentiment',
                          max_words: int = 10000,
                          max_length: int = 100,
                          validation_split: float = 0.2) -> Tuple[tf.data.Dataset, tf.data.Dataset, Tokenizer]:
        """
        Prepare data optimized for Colab GPU/TPU training.

        Texts are tokenized, converted to integer sequences, post-padded /
        post-truncated to ``max_length`` and split (stratified on the label)
        into train and validation sets.

        Note: the training dataset is built with ``repeat=True`` and therefore
        repeats indefinitely; training code needs to pass an explicit
        ``steps_per_epoch``.

        Args:
            df: Input DataFrame with text and labels
            text_col: Name of text column
            label_col: Name of label column
            max_words: Maximum vocabulary size
            max_length: Maximum sequence length
            validation_split: Fraction for validation set

        Returns:
            Tuple of (train_dataset, val_dataset, tokenizer)

        Raises:
            ValueError: If ``text_col`` or ``label_col`` is missing from ``df``
        """
        logger.info("Preparing data for GPU/TPU training...")
        logger.info(f"Dataset shape: {df.shape}")

        # Ensure we have the required columns
        if text_col not in df.columns or label_col not in df.columns:
            raise ValueError(f"Required columns {text_col} and {label_col} not found")

        # Extract text and labels (missing texts become empty strings)
        texts = df[text_col].fillna('').astype(str).values
        labels = df[label_col].values

        logger.info(f"Processing {len(texts)} text samples")

        # Initialize and fit tokenizer
        self.tokenizer = Tokenizer(
            num_words=max_words,
            oov_token='<OOV>',
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        )

        logger.info("Fitting tokenizer on text data...")
        self.tokenizer.fit_on_texts(texts)

        # Convert texts to sequences
        logger.info("Converting texts to sequences...")
        sequences = self.tokenizer.texts_to_sequences(texts)

        # Pad sequences
        logger.info(f"Padding sequences to max length {max_length}...")
        X = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
        y = labels.astype(np.float32)

        # Split data
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=validation_split, random_state=42, stratify=y
        )

        logger.info(f"Train set: {X_train.shape[0]} samples")
        logger.info(f"Validation set: {X_val.shape[0]} samples")

        # Calculate optimal batch size
        batch_size = self.env_manager.get_optimal_batch_size()
        logger.info(f"Using batch size: {batch_size}")

        # Create TensorFlow datasets with optimization
        train_dataset = self._create_optimized_dataset(
            X_train, y_train, batch_size, shuffle=True, repeat=True
        )
        val_dataset = self._create_optimized_dataset(
            X_val, y_val, batch_size, shuffle=False, repeat=False
        )

        return train_dataset, val_dataset, self.tokenizer

    def _create_optimized_dataset(self, X: np.ndarray, y: np.ndarray,
                                batch_size: int, shuffle: bool = True,
                                repeat: bool = False) -> tf.data.Dataset:
        """
        Create optimized TensorFlow dataset for training.

        Pipeline order is cache -> shuffle -> repeat -> batch -> prefetch.
        The cache must sit before ``shuffle``/``repeat``: caching after them
        would freeze a single shuffle order and, for a repeated dataset, try
        to cache an endless stream. ``prefetch`` is kept as the final stage so
        input preparation overlaps with training.

        Args:
            X: Input sequences
            y: Labels
            batch_size: Batch size
            shuffle: Whether to shuffle data
            repeat: Whether to repeat dataset

        Returns:
            tf.data.Dataset: Optimized dataset
        """
        dataset = tf.data.Dataset.from_tensor_slices((X, y))

        if self.env_manager.device_type in ['GPU', 'TPU']:
            # Additional optimizations for accelerators: cache the (finite)
            # element stream once, before any per-epoch randomness
            dataset = dataset.cache()

        if shuffle:
            # Use larger buffer for better shuffling
            buffer_size = min(10000, len(X))
            dataset = dataset.shuffle(buffer_size)

        if repeat:
            dataset = dataset.repeat()

        dataset = dataset.batch(batch_size)

        # Optimization for performance: prefetch last
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

        return dataset


class OptimizedDeepLearningModel:
    """
    Base class for GPU/TPU optimized deep learning models
    Includes mixed precision training, model checkpointing, and advanced callbacks

    Subclasses are expected to assign ``self.model`` in their ``build_model``
    and then call ``compile_model``.
    """

    def __init__(self, env_manager: GPUEnvironmentManager,
                 max_words: int = 10000, max_length: int = 100):
        """
        Initialize optimized deep learning model.

        Args:
            env_manager: GPU environment manager
            max_words: Maximum vocabulary size
            max_length: Maximum sequence length
        """
        self.env_manager = env_manager
        self.max_words = max_words
        self.max_length = max_length
        self.model = None
        self.history = None
        self.training_time = 0

    def _get_optimizer(self, learning_rate: float = 0.001):
        """
        Get optimized optimizer for the current hardware setup.

        Args:
            learning_rate: Learning rate for optimizer (doubled on TPU)

        Returns:
            tf.keras.optimizers.Optimizer: Configured optimizer, wrapped in a
            LossScaleOptimizer when mixed precision is enabled
        """
        if self.env_manager.device_type == 'TPU':
            # TPU works well with higher learning rates
            optimizer = optimizers.Adam(learning_rate=learning_rate * 2)
        else:
            optimizer = optimizers.Adam(learning_rate=learning_rate)

        # Wrap with loss scale optimizer if mixed precision is enabled
        if self.env_manager.mixed_precision_enabled:
            optimizer = LossScaleOptimizer(optimizer)
            logger.info("Using mixed precision optimizer")

        return optimizer

    def _get_callbacks(self, model_name: str, patience: int = 5) -> List[callbacks.Callback]:
        """
        Get optimized callbacks for training.

        The list contains early stopping, learning-rate reduction on plateau,
        best-model checkpointing (all monitoring ``val_loss``) and a per-epoch
        progress logger.

        Args:
            model_name: Name of the model for checkpointing
            patience: Patience for early stopping

        Returns:
            List of configured callbacks
        """
        callback_list = []

        # Early stopping
        early_stop = callbacks.EarlyStopping(
            monitor='val_loss',
            patience=patience,
            restore_best_weights=True,
            verbose=1,
            mode='min'
        )
        callback_list.append(early_stop)

        # Learning rate reduction
        reduce_lr = callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=max(2, patience // 2),
            min_lr=1e-7,
            verbose=1
        )
        callback_list.append(reduce_lr)

        # Model checkpointing (save to Colab's temporary storage)
        checkpoint_path = f'/content/best_{model_name}_model.h5'
        model_checkpoint = callbacks.ModelCheckpoint(
            filepath=checkpoint_path,
            monitor='val_loss',
            save_best_only=True,
            save_weights_only=False,
            verbose=1
        )
        callback_list.append(model_checkpoint)

        # Progress tracking for Colab
        class ColabProgressCallback(callbacks.Callback):
            def __init__(self):
                super().__init__()
                # Reference point for the cumulative "Time" shown per epoch
                self.start_time = time.time()

            def on_epoch_end(self, epoch, logs=None):
                elapsed_time = time.time() - self.start_time
                logs = logs or {}
                logger.info(f"Epoch {epoch + 1} - Loss: {logs.get('loss', 0):.4f} - "
                          f"Val Loss: {logs.get('val_loss', 0):.4f} - "
                          f"Accuracy: {logs.get('accuracy', 0):.4f} - "
                          f"Time: {elapsed_time:.1f}s")

        callback_list.append(ColabProgressCallback())

        return callback_list

    def compile_model(self):
        """Compile model with appropriate settings for the hardware.

        The optimizer and metric objects own variables, so they are created
        and compiled inside the same distribution-strategy scope in which the
        subclasses build the model.

        Raises:
            ValueError: If no model has been built yet
        """
        if self.model is None:
            raise ValueError("Model not built yet. Call build_model() first.")

        with self.env_manager.strategy.scope():
            optimizer = self._get_optimizer()

            # Configure loss and metrics
            loss = 'binary_crossentropy'
            metrics = ['accuracy']

            # Add AUC metric for better evaluation
            if hasattr(tf.keras.metrics, 'AUC'):
                metrics.append(tf.keras.metrics.AUC(name='auc'))

            self.model.compile(
                optimizer=optimizer,
                loss=loss,
                metrics=metrics
            )

        logger.info(f"Model compiled with {optimizer.__class__.__name__} optimizer")

    def train_model(self, train_dataset: tf.data.Dataset,
                   val_dataset: tf.data.Dataset,
                   epochs: int = 10,
                   steps_per_epoch: Optional[int] = None,
                   validation_steps: Optional[int] = None) -> Dict:
        """
        Train model with GPU/TPU optimization.

        Args:
            train_dataset: Training dataset
            val_dataset: Validation dataset
            epochs: Number of epochs
            steps_per_epoch: Steps per epoch, forwarded unchanged to ``fit``
            validation_steps: Validation steps, forwarded unchanged to ``fit``

        Returns:
            Dict: Training history

        Raises:
            ValueError: If no model has been built yet
        """
        if self.model is None:
            raise ValueError("Model not built. Call build_model() first.")

        logger.info(f"Starting training for {epochs} epochs...")

        # Get callbacks (the concrete class name labels the checkpoint file)
        callbacks_list = self._get_callbacks(self.__class__.__name__)

        # Record training start time
        start_time = time.time()

        # Train model within distribution strategy scope
        with self.env_manager.strategy.scope():
            self.history = self.model.fit(
                train_dataset,
                epochs=epochs,
                validation_data=val_dataset,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps,
                callbacks=callbacks_list,
                verbose=1
            )

        # Record training time
        self.training_time = time.time() - start_time
        logger.info(f"Training completed in {self.training_time:.2f} seconds")

        return self.history.history

    def evaluate_model(self, test_dataset: tf.data.Dataset) -> Dict:
        """
        Evaluate model performance.

        Args:
            test_dataset: Test dataset

        Returns:
            Dict: Evaluation metrics keyed by metric name, plus
            ``'training_time'`` (seconds spent in the last ``train_model``)

        Raises:
            ValueError: If no model has been built yet
        """
        if self.model is None:
            raise ValueError("Model not trained yet.")

        logger.info("Evaluating model...")

        # Ask Keras for a name -> value mapping directly instead of pairing
        # `metrics_names` with a positional result list, whose layout is not
        # stable across Keras versions.
        eval_results = self.model.evaluate(test_dataset, verbose=1, return_dict=True)

        # Create results dictionary
        metrics = dict(eval_results)

        # Add training time
        metrics['training_time'] = self.training_time

        logger.info("Evaluation Results:")
        for metric, value in metrics.items():
            logger.info(f"  {metric}: {value:.4f}")

        return metrics


class OptimizedLSTMModel(OptimizedDeepLearningModel):
    """
    GPU/TPU optimized LSTM model for sentiment analysis
    Includes bidirectional LSTM layers and advanced regularization
    """

    def build_model(self, embedding_dim: int = 128,
                   lstm_units: int = 64,
                   dropout_rate: float = 0.3,
                   recurrent_dropout_rate: Optional[float] = None) -> models.Model:
        """
        Build optimized LSTM model architecture.

        Args:
            embedding_dim: Dimension of word embeddings
            lstm_units: Number of LSTM units
            dropout_rate: Dropout rate for regularization
            recurrent_dropout_rate: Recurrent dropout used by both LSTM layers.
                ``None`` (default) keeps the historical value
                ``dropout_rate * 0.5``. Pass ``0.0`` to make the layers
                eligible for the fused cuDNN kernel on GPU, which Keras only
                uses when recurrent dropout is zero.

        Returns:
            Compiled Keras model
        """
        logger.info("Building optimized LSTM model...")

        if recurrent_dropout_rate is None:
            recurrent_dropout_rate = dropout_rate * 0.5

        with self.env_manager.strategy.scope():
            model = models.Sequential([
                # Explicit input spec (replaces the deprecated
                # `Embedding(input_length=...)` argument) so the model is
                # built and `summary()` can report shapes
                layers.Input(shape=(self.max_length,), dtype='int32', name='input'),

                # Embedding layer with mask_zero for padding
                layers.Embedding(
                    input_dim=self.max_words,
                    output_dim=embedding_dim,
                    mask_zero=True,
                    name='embedding'
                ),

                # Spatial dropout for embedding regularization
                layers.SpatialDropout1D(dropout_rate * 0.5),

                # Bidirectional LSTM for better context understanding
                layers.Bidirectional(
                    layers.LSTM(
                        lstm_units,
                        dropout=dropout_rate,
                        recurrent_dropout=recurrent_dropout_rate,
                        return_sequences=True
                    ),
                    name='bi_lstm_1'
                ),

                # Second LSTM layer for deeper representation
                layers.Bidirectional(
                    layers.LSTM(
                        lstm_units // 2,
                        dropout=dropout_rate,
                        recurrent_dropout=recurrent_dropout_rate,
                        return_sequences=False
                    ),
                    name='bi_lstm_2'
                ),

                # Dense layers with batch normalization
                layers.Dense(64, activation='relu', name='dense_1'),
                layers.BatchNormalization(),
                layers.Dropout(dropout_rate),

                layers.Dense(32, activation='relu', name='dense_2'),
                layers.BatchNormalization(),
                layers.Dropout(dropout_rate * 0.5),

                # Output layer; pinned to float32 so the sigmoid/loss stay
                # numerically stable when the mixed_float16 policy is active
                # (no effect under the default float32 policy)
                layers.Dense(1, activation='sigmoid', dtype='float32', name='output')
            ])

        self.model = model
        self.compile_model()

        # Log model summary
        logger.info("LSTM Model Architecture:")
        self.model.summary(print_fn=logger.info)

        return model


class OptimizedCNNModel(OptimizedDeepLearningModel):
    """
    GPU/TPU optimized CNN model for sentiment analysis
    Uses multiple filter sizes for better n-gram capture
    """

    def build_model(self, embedding_dim: int = 128,
                   filter_sizes: Optional[List[int]] = None,
                   num_filters: int = 100,
                   dropout_rate: float = 0.5) -> models.Model:
        """
        Build optimized CNN model with multiple filter sizes.

        Args:
            embedding_dim: Dimension of word embeddings
            filter_sizes: List of filter sizes for convolution
                (defaults to ``[3, 4, 5]`` when omitted)
            num_filters: Number of filters per filter size
            dropout_rate: Dropout rate for regularization

        Returns:
            Compiled Keras model

        Raises:
            ValueError: If ``filter_sizes`` is empty
        """
        logger.info("Building optimized CNN model...")

        # Resolve the default here instead of using a mutable default argument
        if filter_sizes is None:
            filter_sizes = [3, 4, 5]
        if len(filter_sizes) == 0:
            raise ValueError("filter_sizes must contain at least one filter size")

        with self.env_manager.strategy.scope():
            # Input layer
            inputs = layers.Input(shape=(self.max_length,), name='input')

            # Embedding layer (sequence length comes from the Input layer, so
            # the deprecated `input_length` argument is not needed)
            embedding = layers.Embedding(
                input_dim=self.max_words,
                output_dim=embedding_dim,
                mask_zero=False,  # CNN doesn't work well with masking
                name='embedding'
            )(inputs)

            embedding = layers.SpatialDropout1D(dropout_rate * 0.3)(embedding)

            # Multiple CNN branches with different filter sizes
            conv_blocks = []
            for filter_size in filter_sizes:
                conv = layers.Conv1D(
                    filters=num_filters,
                    kernel_size=filter_size,
                    activation='relu',
                    padding='valid',
                    name=f'conv1d_{filter_size}'
                )(embedding)

                # Batch normalization for training stability
                conv = layers.BatchNormalization()(conv)

                # Global max pooling to capture most important features
                conv = layers.GlobalMaxPooling1D(name=f'maxpool_{filter_size}')(conv)

                conv_blocks.append(conv)

            # Concatenate all conv blocks (a single branch is used as-is)
            if len(conv_blocks) > 1:
                merged = layers.Concatenate(name='concat')(conv_blocks)
            else:
                merged = conv_blocks[0]

            # Dense layers for classification
            dense = layers.Dense(128, activation='relu', name='dense_1')(merged)
            dense = layers.BatchNormalization()(dense)
            dense = layers.Dropout(dropout_rate)(dense)

            dense = layers.Dense(64, activation='relu', name='dense_2')(dense)
            dense = layers.BatchNormalization()(dense)
            dense = layers.Dropout(dropout_rate * 0.5)(dense)

            # Output layer; pinned to float32 so the sigmoid/loss stay
            # numerically stable when the mixed_float16 policy is active
            # (no effect under the default float32 policy)
            outputs = layers.Dense(1, activation='sigmoid', dtype='float32', name='output')(dense)

            model = models.Model(inputs=inputs, outputs=outputs, name='OptimizedCNN')

        self.model = model
        self.compile_model()

        # Log model summary
        logger.info("CNN Model Architecture:")
        self.model.summary(print_fn=logger.info)

        return model


class OptimizedTransformerModel(OptimizedDeepLearningModel):
    """
    GPU/TPU optimized Transformer model for sentiment analysis
    Uses multi-head attention with positional encoding
    """

    def build_model(self, embedding_dim: int = 128,
                   num_heads: int = 8,
                   ff_dim: int = 256,
                   num_transformer_blocks: int = 2,
                   dropout_rate: float = 0.1) -> models.Model:
        """
        Build optimized Transformer model architecture.

        Args:
            embedding_dim: Dimension of word embeddings
            num_heads: Number of attention heads
            ff_dim: Feed-forward network dimension
            num_transformer_blocks: Number of transformer blocks
            dropout_rate: Dropout rate for regularization

        Returns:
            Compiled Keras model
        """
        logger.info("Building optimized Transformer model...")

        with self.env_manager.strategy.scope():
            # Input layer
            inputs = layers.Input(shape=(self.max_length,), name='input')

            # Embedding layer (sequence length comes from the Input layer, so
            # the deprecated `input_length` argument is not needed)
            embedding = layers.Embedding(
                input_dim=self.max_words,
                output_dim=embedding_dim,
                mask_zero=True,
                name='embedding'
            )(inputs)

            # Positional encoding, cast to the embedding's dtype: under the
            # mixed_float16 policy the embedding output is float16 and adding
            # a float32 constant to it is a dtype mismatch
            position_encoding = tf.cast(
                self._get_positional_encoding(self.max_length, embedding_dim),
                embedding.dtype
            )

            # Add positional encoding to embeddings
            # NOTE(review): the padding mask produced by `mask_zero=True` is
            # probably not carried through this raw `+`; confirm whether the
            # attention/pooling layers below are expected to see it.
            x = embedding + position_encoding
            x = layers.Dropout(dropout_rate)(x)

            # Transformer blocks
            for i in range(num_transformer_blocks):
                x = self._transformer_block(
                    x, embedding_dim, num_heads, ff_dim, dropout_rate, f'transformer_{i}'
                )

            # Global average pooling
            x = layers.GlobalAveragePooling1D()(x)

            # Dense layers for classification
            x = layers.Dense(128, activation='relu', name='dense_1')(x)
            x = layers.BatchNormalization()(x)
            x = layers.Dropout(dropout_rate)(x)

            x = layers.Dense(64, activation='relu', name='dense_2')(x)
            x = layers.Dropout(dropout_rate * 0.5)(x)

            # Output layer; pinned to float32 so the sigmoid/loss stay
            # numerically stable when the mixed_float16 policy is active
            # (no effect under the default float32 policy)
            outputs = layers.Dense(1, activation='sigmoid', dtype='float32', name='output')(x)

            model = models.Model(inputs=inputs, outputs=outputs, name='OptimizedTransformer')

        self.model = model
        self.compile_model()

        # Log model summary
        logger.info("Transformer Model Architecture:")
        self.model.summary(print_fn=logger.info)

        return model

    def _get_positional_encoding(self, seq_len: int, d_model: int):
        """
        Create positional encoding for transformer.

        Args:
            seq_len: Sequence length
            d_model: Model dimension

        Returns:
            tf.Tensor: float32 positional encoding of shape
            (1, seq_len, d_model)
        """
        pos = np.arange(seq_len)[:, np.newaxis]      # (seq_len, 1)
        i = np.arange(d_model)[np.newaxis, :]        # (1, d_model)

        # Each (even, odd) column pair shares one frequency: 2 * (i // 2)
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        angle_rads = pos * angle_rates               # (seq_len, d_model)

        # Apply sin to even indices and cos to odd indices
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        # Leading axis of size 1 so the encoding broadcasts over the batch
        pos_encoding = angle_rads[np.newaxis, ...]

        return tf.cast(pos_encoding, dtype=tf.float32)

    def _transformer_block(self, x, embed_dim: int, num_heads: int,
                          ff_dim: int, dropout_rate: float, name: str):
        """
        Create a transformer block.

        The block is self-attention followed by a two-layer feed-forward
        network, each wrapped in dropout, a residual add and layer
        normalization.

        Args:
            x: Input tensor
            embed_dim: Embedding dimension
            num_heads: Number of attention heads
            ff_dim: Feed-forward dimension
            dropout_rate: Dropout rate
            name: Block name (used as prefix for the layer names)

        Returns:
            tf.Tensor: Output tensor
        """
        # Multi-head self-attention (query and value are both `x`)
        # NOTE(review): `key_dim` is the per-head size; passing the full
        # `embed_dim` rather than `embed_dim // num_heads` makes every head
        # embed_dim wide — confirm this is intended.
        attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
            name=f'{name}_attention'
        )(x, x)

        attention = layers.Dropout(dropout_rate)(attention)

        # Add & Norm
        attention = layers.Add(name=f'{name}_add_1')([x, attention])
        attention = layers.LayerNormalization(name=f'{name}_norm_1')(attention)

        # Feed-forward network
        ff = layers.Dense(ff_dim, activation='relu', name=f'{name}_ff_1')(attention)
        ff = layers.Dropout(dropout_rate)(ff)
        ff = layers.Dense(embed_dim, name=f'{name}_ff_2')(ff)
        ff = layers.Dropout(dropout_rate)(ff)

        # Add & Norm
        output = layers.Add(name=f'{name}_add_2')([attention, ff])
        output = layers.LayerNormalization(name=f'{name}_norm_2')(output)

        return output


def evaluate_all_models_gpu(data_path: str = '/content/data.csv',
                           sample_fraction: float = 1.0,
                           epochs: int = 10) -> Dict:
    """
    Train and evaluate the LSTM, CNN and Transformer models on one CSV dataset.

    Each model is built, trained, evaluated on the validation dataset and
    saved to ``/content/<name>_model.h5``. A failure in one model is logged and
    recorded as an ``{'error': ...}`` metrics entry; the remaining models still
    run. When at least one model succeeds, a summary table is logged, stored
    under ``results['summary']`` and the whole result dict is written to
    ``/content/model_evaluation_results.json``.

    Args:
        data_path: Path to CSV data file
        sample_fraction: Fraction of rows to keep (values below 1.0 trigger a
            seeded random sample)
        epochs: Number of training epochs per model

    Returns:
        Dict: Per-model ``metrics`` / ``history`` / ``model_params`` entries,
        plus a ``summary`` list when any model succeeded

    Raises:
        Exception: Whatever ``pd.read_csv`` raised if the data cannot be loaded
    """
    logger.info("Starting GPU/TPU optimized model evaluation...")

    # Hardware detection happens first so its log output precedes data loading.
    env_manager = GPUEnvironmentManager()

    logger.info(f"Loading data from {data_path}")
    try:
        df = pd.read_csv(data_path)
        logger.info(f"Loaded dataset with shape: {df.shape}")
    except Exception as e:
        logger.error(f"Failed to load data: {e}")
        raise

    # Optional down-sampling (fixed seed keeps runs comparable).
    if sample_fraction < 1.0:
        df = df.sample(frac=sample_fraction, random_state=42)
        logger.info(f"Using {sample_fraction*100}% of data: {df.shape[0]} samples")

    data_loader = OptimizedDataLoader(env_manager)
    train_dataset, val_dataset, tokenizer = data_loader.prepare_colab_data(df)

    # (display name, model class, keyword arguments for build_model)
    model_specs = [
        ('LSTM', OptimizedLSTMModel,
         {'embedding_dim': 128, 'lstm_units': 64}),
        ('CNN', OptimizedCNNModel,
         {'embedding_dim': 128, 'filter_sizes': [3, 4, 5], 'num_filters': 100}),
        ('Transformer', OptimizedTransformerModel,
         {'embedding_dim': 128, 'num_heads': 8, 'ff_dim': 256}),
    ]

    results = {}

    for model_name, ModelClass, build_kwargs in model_specs:
        logger.info(f"\n{'='*60}")
        logger.info(f"Training {model_name} Model")
        logger.info(f"{'='*60}")

        try:
            model = ModelClass(env_manager)
            model.build_model(**build_kwargs)

            history = model.train_model(
                train_dataset,
                val_dataset,
                epochs=epochs
            )
            metrics = model.evaluate_model(val_dataset)

            results[model_name] = {
                'metrics': metrics,
                'history': history,
                'model_params': model.model.count_params()
            }
            logger.info(f"{model_name} training completed successfully")

            # Persist the trained model; a failure here also lands in the
            # except branch and replaces the entry with an error record.
            model.model.save(f'/content/{model_name.lower()}_model.h5')
            logger.info(f"Model saved as {model_name.lower()}_model.h5")

        except Exception as e:
            logger.error(f"Failed to train {model_name}: {e}")
            results[model_name] = {
                'metrics': {'error': str(e)},
                'history': {},
                'model_params': 0
            }

    logger.info("\n" + "="*60)
    logger.info("MODEL EVALUATION SUMMARY")
    logger.info("="*60)

    # One summary row per model that finished without an error.
    summary_data = [
        {
            'Model': model_name,
            'Accuracy': result['metrics'].get('accuracy', 0),
            'Loss': result['metrics'].get('loss', 0),
            'AUC': result['metrics'].get('auc', 0),
            'Training Time (s)': result['metrics'].get('training_time', 0),
            'Parameters': result['model_params']
        }
        for model_name, result in results.items()
        if 'error' not in result['metrics']
    ]

    # Nothing succeeded: skip the table and the JSON dump.
    if not summary_data:
        return results

    summary_df = pd.DataFrame(summary_data).sort_values('Accuracy', ascending=False)
    logger.info(f"\n{summary_df.to_string(index=False)}")

    results['summary'] = summary_df.to_dict('records')

    # default=str stringifies any value json cannot encode natively.
    with open('/content/model_evaluation_results.json', 'w') as f:
        json.dump(results, f, indent=2, default=str)

    logger.info("Results saved to /content/model_evaluation_results.json")

    return results


def plot_training_history(results: Dict, save_path: str = '/content/training_plots.png'):
    """
    Draw a 2x2 figure comparing the trained models and save it to disk.

    Top row: per-epoch training/validation loss and accuracy curves for every
    model entry that carries a ``history``. Bottom row (only when ``results``
    has a ``summary`` entry): bar charts of final accuracy and training time.

    Args:
        results: Results dictionary from evaluate_all_models_gpu
        save_path: Path to save the plot
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # (axis, training key, validation key, panel title, y-axis label)
    curve_panels = (
        (axes[0, 0], 'loss', 'val_loss', 'Model Loss', 'Loss'),
        (axes[0, 1], 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    )

    for ax, train_key, val_key, title, y_label in curve_panels:
        for model_name, result in results.items():
            # The 'summary' entry is a table, not a per-model record.
            if model_name == 'summary' or 'history' not in result:
                continue

            history = result['history']
            if train_key not in history:
                continue

            epochs = range(1, len(history[train_key]) + 1)
            ax.plot(epochs, history[train_key], label=f'{model_name} Train')
            if val_key in history:
                ax.plot(epochs, history[val_key], label=f'{model_name} Val', linestyle='--')

        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True)

    if 'summary' in results:
        summary_df = pd.DataFrame(results['summary'])

        # (axis, summary column, panel title, y-axis label)
        bar_panels = (
            (axes[1, 0], 'Accuracy', 'Final Accuracy Comparison', 'Accuracy'),
            (axes[1, 1], 'Training Time (s)', 'Training Time Comparison', 'Time (seconds)'),
        )
        for ax, column, title, y_label in bar_panels:
            ax.bar(summary_df['Model'], summary_df[column], color=['blue', 'green', 'red'])
            ax.set_title(title)
            ax.set_ylabel(y_label)
            ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

    logger.info(f"Training plots saved to {save_path}")


# Script / notebook-cell entry point (Google Colab example run)
if __name__ == "__main__":
    logger.info("Starting GPU/TPU optimized deep learning evaluation...")

    # Train every model on the configured CSV, then plot the comparison.
    # Any failure is logged once here and re-raised so the cell still errors.
    try:
        results = evaluate_all_models_gpu(
            # Point this at the actual dataset location before running.
            data_path='/content/sentiment_data.csv',
            # 1.0 keeps every row; lower it for a quick smoke run.
            sample_fraction=1.0,
            epochs=10,
        )
        plot_training_history(results)
        logger.info("Evaluation completed successfully!")
    except Exception as exc:
        logger.error(f"Evaluation failed: {exc}")
        raise

ERROR:__main__:Failed to load data: [Errno 2] No such file or directory: '/content/sentiment_data.csv'
ERROR:__main__:Evaluation failed: [Errno 2] No such file or directory: '/content/sentiment_data.csv'


FileNotFoundError: [Errno 2] No such file or directory: '/content/sentiment_data.csv'

### Now

In [None]:
"""
Deep Learning Models for Sentiment Analysis - GPU/TPU Optimized for Google Colab
PIPELINE-COMPATIBLE VERSION - Works with existing MICAP preprocessing pipeline
Implements LSTM, CNN, and Transformer models with CUDA/TPU acceleration

Author: AI Assistant
Date: 2025-01-20
Dependencies: tensorflow>=2.13.0, pandas, numpy, scikit-learn, matplotlib, seaborn, pyspark (optional, for data loading)
Environment: Google Colab with GPU/TPU runtime
Pipeline: Reads from preprocessed parquet files created by MICAP pipeline
"""

import os
import numpy as np
import pandas as pd
import logging
import time
from typing import Dict, List, Tuple, Optional, Union
import json
import warnings
from pathlib import Path

# TensorFlow imports with GPU optimization
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks, optimizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.mixed_precision import LossScaleOptimizer

# Additional ML libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# PySpark imports for reading existing pipeline data
try:
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col
    PYSPARK_AVAILABLE = True
except ImportError:
    PYSPARK_AVAILABLE = False
    logging.warning("PySpark not available. Will use pandas for data loading.")

# Process-wide TensorFlow graph-optimizer switches, applied once when this
# cell/module is executed (i.e. before any model below is built).
tf.config.optimizer.set_jit(True)      # XLA compilation
# Request the graph optimizer's automatic mixed-precision rewrite.
# NOTE(review): this is independent of the Keras mixed-precision policy, which
# GPUEnvironmentManager currently leaves disabled - confirm both are intended.
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

# Configure warnings and logging
# FutureWarnings are silenced to keep notebook output readable.
warnings.filterwarnings('ignore', category=FutureWarning)
# Log INFO and above to the stream handler with timestamp, logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
# Module-level logger used by every class and function below.
logger = logging.getLogger(__name__)


class PipelineDataLoader:
    """
    Loads data from MICAP preprocessing pipeline (parquet files)
    Maintains compatibility with existing feature engineering

    Spark is used when PySpark could be imported and a local session could be
    created; otherwise (or on any Spark failure) loading falls back to pandas.
    """

    # Columns that must be present for a frame to count as pipeline output.
    REQUIRED_FEATURES = (
        'text', 'sentiment', 'text_processed',
        'text_length', 'processed_length', 'token_count',
    )

    # Optional but expected columns produced by the pipeline.
    OPTIONAL_FEATURES = (
        'vader_compound', 'vader_positive', 'vader_negative', 'vader_neutral',
        'emoji_sentiment', 'exclamation_count', 'question_count',
        'uppercase_ratio', 'punctuation_density',
        'hour_sin', 'hour_cos', 'is_weekend',
    )

    # Numeric columns stacked, in this order, into the numeric feature matrix.
    NUMERIC_FEATURES = ('text_length', 'token_count') + OPTIONAL_FEATURES

    def __init__(self):
        """Initialize pipeline data loader (creates a Spark session if possible)."""
        self.spark = None
        if PYSPARK_AVAILABLE:
            self._init_spark_session()

    def _init_spark_session(self):
        """
        Initialize a local Spark session for reading parquet files.

        On failure the error is logged and ``self.spark`` stays ``None`` so
        callers transparently use the pandas path.
        """
        try:
            self.spark = (SparkSession.builder
                         .appName("ColabDataLoader")
                         .master("local[*]")
                         .config("spark.driver.memory", "4g")
                         .config("spark.sql.shuffle.partitions", "200")
                         .getOrCreate())
            self.spark.sparkContext.setLogLevel("WARN")
            logger.info("Spark session created for pipeline data loading")
        except Exception as e:
            logger.error(f"Failed to create Spark session: {e}")
            self.spark = None

    def load_pipeline_data(self, data_path: str,
                          sample_fraction: float = 1.0,
                          use_spark: bool = True) -> pd.DataFrame:
        """
        Load preprocessed data from MICAP pipeline.

        Args:
            data_path: Path to processed parquet file or directory
            sample_fraction: Fraction of data to use
            use_spark: Whether to use Spark for loading (fallback to pandas)

        Returns:
            pd.DataFrame: Loaded and sampled data
        """
        logger.info(f"Loading pipeline data from: {data_path}")

        if use_spark and self.spark and PYSPARK_AVAILABLE:
            return self._load_with_spark(data_path, sample_fraction)
        return self._load_with_pandas(data_path, sample_fraction)

    def _load_with_spark(self, data_path: str, sample_fraction: float) -> pd.DataFrame:
        """
        Load data using Spark (maintains original pipeline compatibility).

        Any Spark error is logged and the load is retried with pandas.
        """
        logger.info("Loading data with Spark...")

        try:
            # Read parquet file(s)
            df = self.spark.read.parquet(data_path)

            # Sample if requested
            if sample_fraction < 1.0:
                df = df.sample(sample_fraction, seed=42)

            # Convert to pandas for TensorFlow compatibility
            pandas_df = self._spark_to_pandas_efficient(df)

            logger.info(f"Loaded {len(pandas_df)} records with Spark")
            return pandas_df

        except Exception as e:
            logger.error(f"Spark loading failed: {e}")
            logger.info("Falling back to pandas...")
            return self._load_with_pandas(data_path, sample_fraction)

    def _load_with_pandas(self, data_path: str, sample_fraction: float) -> pd.DataFrame:
        """
        Load data with pandas (fallback method).

        Paths ending in ``.parquet`` and directories are read as parquet;
        anything else is read as CSV.

        Raises:
            Exception: Re-raises whatever the pandas reader raised, after logging
        """
        logger.info("Loading data with pandas...")

        try:
            # Try reading as parquet first
            if data_path.endswith('.parquet') or os.path.isdir(data_path):
                df = pd.read_parquet(data_path)
            else:
                # Fallback to CSV
                df = pd.read_csv(data_path)

            # Sample if requested
            if sample_fraction < 1.0:
                df = df.sample(frac=sample_fraction, random_state=42)

            logger.info(f"Loaded {len(df)} records with pandas")
            return df

        except Exception as e:
            logger.error(f"Failed to load data: {e}")
            raise

    def _spark_to_pandas_efficient(self, spark_df, batch_size: int = 50000) -> pd.DataFrame:
        """
        Convert a Spark DataFrame to pandas, streaming large inputs in chunks.

        Frames with fewer than ``batch_size`` rows are converted directly with
        ``toPandas()``. Larger frames are pulled row by row through
        ``toLocalIterator()`` and assembled ``batch_size`` rows at a time.
        If streaming fails for any reason the direct conversion is used.

        Args:
            spark_df: Spark DataFrame to convert
            batch_size: Row threshold for direct conversion and chunk size
                when streaming

        Returns:
            pd.DataFrame: All rows of ``spark_df``
        """
        from itertools import islice

        try:
            # Try direct conversion for smaller datasets
            if spark_df.count() < batch_size:
                return spark_df.toPandas()

            # Stream large datasets in batches
            logger.info("Streaming large dataset in batches...")
            columns = list(spark_df.columns)
            # toLocalIterator yields individual rows, so group them ourselves.
            row_iter = iter(spark_df.toLocalIterator())
            parts = []
            while True:
                chunk = list(islice(row_iter, batch_size))
                if not chunk:
                    break
                parts.append(pd.DataFrame(chunk, columns=columns))

            if not parts:
                return pd.DataFrame(columns=columns)
            return pd.concat(parts, ignore_index=True)

        except Exception as e:
            logger.warning(f"Streaming failed, using direct conversion: {e}")
            return spark_df.toPandas()

    def validate_pipeline_features(self, df: pd.DataFrame) -> bool:
        """
        Validate that the DataFrame contains expected pipeline features.

        Missing required columns are logged as an error; the number of
        optional columns found is logged for information only.

        Args:
            df: DataFrame to validate

        Returns:
            bool: True if valid pipeline data
        """
        # Check required features
        missing_required = [f for f in self.REQUIRED_FEATURES if f not in df.columns]
        if missing_required:
            logger.error(f"Missing required pipeline features: {missing_required}")
            return False

        # Log available optional features
        available_optional = [f for f in self.OPTIONAL_FEATURES if f in df.columns]
        logger.info(f"Available pipeline features: {len(available_optional)}/{len(self.OPTIONAL_FEATURES)}")

        return True

    def prepare_features_for_training(
            self, df: pd.DataFrame
    ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray], Dict]:
        """
        Prepare features from pipeline data for deep learning training.

        Args:
            df: DataFrame with pipeline features

        Returns:
            Tuple of (texts, labels, numeric_array, feature_info), where
            ``numeric_array`` has one column per available numeric pipeline
            feature (missing values replaced by 0) or is ``None`` when no such
            column exists, and ``feature_info`` describes what was extracted.

        Raises:
            ValueError: If required pipeline columns are missing
        """
        logger.info("Preparing pipeline features for training...")

        # Validate data
        if not self.validate_pipeline_features(df):
            raise ValueError("Invalid pipeline data structure")

        # Use processed text for deep learning (already cleaned by pipeline)
        text_column = 'text_processed' if 'text_processed' in df.columns else 'text'
        texts = df[text_column].fillna('').astype(str)
        labels = df['sentiment'].values

        # Keep only the numeric pipeline columns that are actually present.
        feature_names = [name for name in self.NUMERIC_FEATURES if name in df.columns]

        if feature_names:
            # column_stack (rather than DataFrame.to_numpy) so mixed bool/int/float
            # columns are promoted to one numeric dtype.
            numeric_array = np.column_stack(
                [df[name].fillna(0) for name in feature_names]
            )
            logger.info(f"Extracted {len(feature_names)} numeric features: {feature_names}")
        else:
            numeric_array = None
            logger.warning("No numeric features found in pipeline data")

        feature_info = {
            'text_column': text_column,
            'numeric_features': feature_names,
            'numeric_shape': numeric_array.shape if numeric_array is not None else None,
            'text_samples': len(texts),
            'label_distribution': pd.Series(labels).value_counts().to_dict()
        }

        logger.info(f"Feature preparation completed: {feature_info}")

        return texts.values, labels, numeric_array, feature_info


class GPUEnvironmentManager:
    """Detects the available accelerator in Google Colab and configures TensorFlow for it."""

    def __init__(self):
        # Order matters: the strategy is chosen from the detected device type.
        self.device_type = self._detect_accelerator()
        self.strategy = self._setup_distribution_strategy()
        # The Keras mixed-precision policy is not enabled by the constructor;
        # _setup_mixed_precision() has to be called explicitly to turn it on.
        self.mixed_precision_enabled = False
        self._log_environment_info()

    def _detect_accelerator(self) -> str:
        """Return 'TPU', 'GPU' or 'CPU' depending on what TensorFlow can reach."""
        # A TPU takes priority; resolver/initialization errors mean "no TPU".
        try:
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
            tf.config.experimental_connect_to_cluster(resolver)
            tf.tpu.experimental.initialize_tpu_system(resolver)
            logger.info("TPU detected and initialized")
            return 'TPU'
        except (ValueError, RuntimeError):
            pass

        gpu_devices = tf.config.list_physical_devices('GPU')
        if not gpu_devices:
            logger.warning("No accelerator detected, using CPU")
            return 'CPU'

        logger.info(f"GPU detected: {len(gpu_devices)} device(s)")
        # Enable memory growth on every GPU.
        for device in gpu_devices:
            tf.config.experimental.set_memory_growth(device, True)
        return 'GPU'

    def _setup_distribution_strategy(self):
        """Build the tf.distribute strategy matching ``self.device_type``."""
        if self.device_type == 'TPU':
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
            tpu_strategy = tf.distribute.TPUStrategy(resolver)
            logger.info(f"Using TPU strategy with {tpu_strategy.num_replicas_in_sync} replicas")
            return tpu_strategy

        if self.device_type == 'GPU':
            gpu_strategy = tf.distribute.MirroredStrategy()
            logger.info(f"Using MirroredStrategy with {gpu_strategy.num_replicas_in_sync} replicas")
            return gpu_strategy

        default_strategy = tf.distribute.get_strategy()
        logger.info("Using default strategy (CPU)")
        return default_strategy

    def _setup_mixed_precision(self):
        """Switch the global Keras policy to mixed_float16 (best effort)."""
        try:
            tf.keras.mixed_precision.set_global_policy(
                tf.keras.mixed_precision.Policy('mixed_float16')
            )
            self.mixed_precision_enabled = True
            logger.info("Mixed precision training enabled (float16)")
        except Exception as e:
            # Failure is not fatal: the flag simply stays unchanged.
            logger.warning(f"Could not enable mixed precision: {e}")

    def _log_environment_info(self):
        """Log the TensorFlow version, the detected accelerator and any GPUs."""
        logger.info("=== GPU/TPU Environment Information ===")
        logger.info(f"TensorFlow version: {tf.__version__}")
        logger.info(f"Detected accelerator: {self.device_type}")

        if self.device_type == 'GPU':
            for idx, device in enumerate(tf.config.list_physical_devices('GPU')):
                logger.info(f"GPU {idx}: {device}")

        logger.info("=" * 50)

    def get_optimal_batch_size(self, base_batch_size: int = 16384) -> int:
        """
        Scale ``base_batch_size`` for the detected hardware.

        TPU: eight times the base, at least 128. GPU: the base multiplied by
        the number of replicas in the strategy (at least one). CPU: half the
        base, at least 16.
        """
        kind = self.device_type
        if kind == 'TPU':
            return max(128, base_batch_size * 8)
        if kind == 'GPU':
            replicas = max(1, self.strategy.num_replicas_in_sync)
            return base_batch_size * replicas
        return max(16, base_batch_size // 2)


class PipelineOptimizedModel:
    """
    Shared base for the pipeline-compatible GPU/TPU models.

    Holds the tokenizer, the trained Keras model and training bookkeeping, and
    provides data preparation plus the optimizer/callback factories used by
    the concrete model classes.
    """

    def __init__(self, env_manager: GPUEnvironmentManager,
                 max_words: int = 10000, max_length: int = 100):
        # Hardware context and text-encoding limits
        self.env_manager = env_manager
        self.max_words = max_words
        self.max_length = max_length
        # Populated later by prepare_data / build_model / train
        self.model = None
        self.tokenizer = None
        self.history = None
        self.training_time = 0
        self.numeric_features_dim = 0

    def prepare_data(self, texts: np.ndarray, labels: np.ndarray,
                    numeric_features: Optional[np.ndarray] = None,
                    validation_split: float = 0.2) -> Tuple:
        """
        Tokenize the texts and split everything into train/validation parts.

        Args:
            texts: Array of text data
            labels: Array of labels
            numeric_features: Optional array of numeric features from pipeline
            validation_split: Validation split ratio

        Returns:
            Tuple ``(train, val)``. Each part is ``(X_text, X_numeric, y)``
            when numeric features were given, otherwise ``(X_text, y)``.
        """
        logger.info("Preparing data for pipeline-compatible training...")

        # Fit a fresh tokenizer on the supplied texts.
        self.tokenizer = Tokenizer(
            num_words=self.max_words,
            oov_token='<OOV>',
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        )
        self.tokenizer.fit_on_texts(texts)

        # Integer-encode, then pad/truncate at the end to a fixed length.
        X_text = pad_sequences(
            self.tokenizer.texts_to_sequences(texts),
            maxlen=self.max_length,
            padding='post',
            truncating='post',
        )

        # Collect the arrays to split: text first, optional numerics, labels last.
        arrays = [X_text]
        if numeric_features is not None:
            X_numeric = numeric_features.astype(np.float32)
            self.numeric_features_dim = X_numeric.shape[1]
            logger.info(f"Using {self.numeric_features_dim} numeric features from pipeline")
            arrays.append(X_numeric)

        y = labels.astype(np.float32)
        arrays.append(y)

        # train_test_split returns train/val pairs in input order:
        # [a_train, a_val, b_train, b_val, ...]
        pieces = train_test_split(
            *arrays, test_size=validation_split,
            random_state=42, stratify=y
        )
        return tuple(pieces[0::2]), tuple(pieces[1::2])

    def _get_optimizer(self, learning_rate: float = 0.001):
        """Return an Adam optimizer (doubled rate on TPU, loss-scaled under mixed precision)."""
        on_tpu = self.env_manager.device_type == 'TPU'
        effective_rate = learning_rate * 2 if on_tpu else learning_rate
        optimizer = optimizers.Adam(learning_rate=effective_rate)

        if not self.env_manager.mixed_precision_enabled:
            return optimizer
        return LossScaleOptimizer(optimizer)

    def _get_callbacks(self, model_name: str, patience: int = 5):
        """Return early-stopping, LR-reduction and best-checkpoint callbacks, all watching val_loss."""
        early_stopping = callbacks.EarlyStopping(
            monitor='val_loss', patience=patience,
            restore_best_weights=True, verbose=1
        )
        # The learning rate is halved after half the early-stopping patience
        # (but never fewer than two stagnant epochs).
        lr_schedule = callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.5,
            patience=max(2, patience // 2), min_lr=1e-7, verbose=1
        )
        checkpoint = callbacks.ModelCheckpoint(
            filepath=f'/content/best_{model_name}_pipeline_model.h5',
            monitor='val_loss', save_best_only=True, verbose=1
        )
        return [early_stopping, lr_schedule, checkpoint]


class PipelineLSTMModel(PipelineOptimizedModel):
    """
    LSTM model optimized for pipeline features.

    Text goes through an embedding and two bidirectional LSTM layers; when
    ``prepare_data`` saw numeric pipeline features, a small dense branch for
    them is concatenated before the final classifier.
    """

    def build_model(self, embedding_dim: int = 128, lstm_units: int = 64,
                   dropout_rate: float = 0.3):
        """
        Build and compile the LSTM model with optional numeric features integration.

        Call ``prepare_data`` first: ``self.numeric_features_dim`` decides
        whether the model gets a second (numeric) input.

        Args:
            embedding_dim: Size of the token embedding vectors
            lstm_units: Units of the first LSTM layer (the second uses half)
            dropout_rate: Base dropout rate; some layers use half of it

        Returns:
            The compiled Keras model (also stored in ``self.model``)
        """
        logger.info("Building pipeline-compatible LSTM model...")

        # The model, its optimizer and its metrics are all created and compiled
        # inside the strategy scope so their variables belong to the same
        # distribution strategy.
        with self.env_manager.strategy.scope():
            # Text input branch
            text_input = layers.Input(shape=(self.max_length,), name='text_input')

            # Embedding layer
            embedding = layers.Embedding(
                input_dim=self.max_words,
                output_dim=embedding_dim,
                input_length=self.max_length,
                mask_zero=True,
                name='embedding'
            )(text_input)

            embedding = layers.SpatialDropout1D(dropout_rate * 0.5)(embedding)

            # Bidirectional LSTM layers
            lstm1 = layers.Bidirectional(
                layers.LSTM(lstm_units, dropout=dropout_rate,
                            return_sequences=True),
                name='bi_lstm_1'
            )(embedding)

            lstm2 = layers.Bidirectional(
                layers.LSTM(lstm_units // 2, dropout=dropout_rate,
                            return_sequences=False),
                name='bi_lstm_2'
            )(lstm1)

            # Text features
            text_features = layers.Dense(64, activation='relu', name='text_dense')(lstm2)
            text_features = layers.BatchNormalization()(text_features)
            text_features = layers.Dropout(dropout_rate)(text_features)

            # Combine with numeric features if available
            if self.numeric_features_dim > 0:
                # Numeric input branch
                numeric_input = layers.Input(shape=(self.numeric_features_dim,), name='numeric_input')
                numeric_features = layers.Dense(32, activation='relu', name='numeric_dense')(numeric_input)
                numeric_features = layers.BatchNormalization()(numeric_features)
                numeric_features = layers.Dropout(dropout_rate * 0.5)(numeric_features)

                # Combine text and numeric features
                combined = layers.Concatenate(name='combine_features')([text_features, numeric_features])
                inputs = [text_input, numeric_input]
            else:
                combined = text_features
                inputs = text_input

            # Final classification layers
            dense = layers.Dense(32, activation='relu', name='final_dense')(combined)
            dense = layers.Dropout(dropout_rate * 0.5)(dense)

            output = layers.Dense(1, activation='sigmoid', name='output')(dense)

            model = models.Model(inputs=inputs, outputs=output, name='PipelineLSTM')

            # Compile model (still inside the strategy scope)
            optimizer = self._get_optimizer()
            model.compile(
                optimizer=optimizer,
                loss='binary_crossentropy',
                metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
            )

        self.model = model

        logger.info("LSTM Model Architecture:")
        self.model.summary(print_fn=logger.info)

        return model

    def _split_inputs_and_labels(self, data: Tuple):
        """Split a ``prepare_data`` part into (model inputs, labels)."""
        if self.numeric_features_dim > 0:
            x_text, x_num, y = data
            # A tuple (not a list) keeps the two inputs as separate dataset
            # components instead of being stacked into one tensor.
            return (x_text, x_num), y

        x_text, y = data
        return x_text, y

    def train(self, train_data: Tuple, val_data: Tuple, epochs: int = 10):
        """
        Train the LSTM model.

        Args:
            train_data: Training part returned by ``prepare_data``
                (``(X_text, X_numeric, y)`` or ``(X_text, y)``)
            val_data: Validation part in the same layout
            epochs: Maximum number of epochs (early stopping may end sooner)

        Returns:
            dict: The Keras ``History.history`` mapping of per-epoch metrics
        """
        logger.info("Training pipeline LSTM model...")

        callbacks_list = self._get_callbacks('LSTM')

        start_time = time.time()

        # Works for both layouts: with and without numeric features.
        train_inputs, y_train = self._split_inputs_and_labels(train_data)
        val_inputs, y_val = self._split_inputs_and_labels(val_data)

        # Never ask for a batch larger than the training set: combined with
        # drop_remainder that would leave zero training batches.
        batch_size = max(
            1, min(self.env_manager.get_optimal_batch_size(), len(y_train))
        )

        def make_ds(inputs, y, shuffle, drop_remainder):
            ds = tf.data.Dataset.from_tensor_slices((inputs, y))
            if shuffle:
                ds = ds.shuffle(100_000)
            return ds.batch(batch_size, drop_remainder=drop_remainder).prefetch(tf.data.AUTOTUNE)

        # Training keeps fixed-size batches; validation keeps every sample so
        # val_loss (used by all callbacks) covers the whole validation set.
        train_ds = make_ds(train_inputs, y_train, shuffle=True, drop_remainder=True)
        val_ds = make_ds(val_inputs, y_val, shuffle=False, drop_remainder=False)

        with self.env_manager.strategy.scope():
            self.history = self.model.fit(
                train_ds,
                epochs=epochs,
                validation_data=val_ds,
                callbacks=callbacks_list,
                verbose=1
            )

        self.training_time = time.time() - start_time
        logger.info(f"Training completed in {self.training_time:.2f} seconds")

        return self.history.history


def evaluate_pipeline_models_gpu(data_path: str,
                                sample_fraction: float = 1.0,
                                epochs: int = 10) -> Dict:
    """
    Evaluate models using MICAP pipeline data with GPU/TPU optimization.

    For every model class in the evaluation list this loads the data,
    prepares the train/validation splits, builds and trains the model,
    evaluates it on the validation split and saves it under ``/content/``.
    A failure while handling one model is logged and recorded under that
    model's name instead of aborting the whole run.

    Args:
        data_path: Path to processed pipeline data (parquet)
        sample_fraction: Fraction of data to use
        epochs: Number of training epochs

    Returns:
        Dict: Maps each model name to either
            ``{'metrics': ..., 'history': ..., 'feature_info': ...}`` or
            ``{'error': <message>}``. The same dict is also written to
            ``pipeline_model_results.json`` in the working directory.
    """
    logger.info("Starting pipeline-compatible GPU evaluation...")

    # Initialize components
    env_manager = GPUEnvironmentManager()
    data_loader = PipelineDataLoader()

    # Load pipeline data
    df = data_loader.load_pipeline_data(data_path, sample_fraction)

    # Prepare features
    texts, labels, numeric_features, feature_info = data_loader.prepare_features_for_training(df)

    logger.info(f"Loaded pipeline data: {feature_info}")

    # Initialize models
    models_to_evaluate = [
        ('Pipeline_LSTM', PipelineLSTMModel)
    ]

    results = {}

    for model_name, ModelClass in models_to_evaluate:
        logger.info(f"\n{'='*60}")
        logger.info(f"Training {model_name}")
        logger.info(f"{'='*60}")

        try:
            # Initialize model
            model = ModelClass(env_manager)

            # Prepare data
            train_data, val_data = model.prepare_data(
                texts, labels, numeric_features
            )

            # Build model
            model.build_model()

            # Train model
            history = model.train(train_data, val_data, epochs=epochs)

            # Evaluate: val_data is (X_text, X_num, y) with numeric features,
            # otherwise (X_text, y).
            if model.numeric_features_dim > 0:
                val_inputs = [val_data[0], val_data[1]]
                val_labels = val_data[2]
            else:
                val_inputs = val_data[0]
                val_labels = val_data[1]

            # Ask Keras for a name -> value mapping directly. Pairing the
            # positional evaluate() output with model.metrics_names is not
            # reliable across Keras versions and can mislabel or drop metrics.
            eval_results = model.model.evaluate(
                val_inputs, val_labels, verbose=0, return_dict=True
            )

            metrics = dict(eval_results)
            metrics['training_time'] = model.training_time

            results[model_name] = {
                'metrics': metrics,
                'history': history,
                'feature_info': feature_info
            }

            # Save model
            model.model.save(f'/content/{model_name.lower()}_model.h5')
            logger.info(f"Model saved as {model_name.lower()}_model.h5")

        except Exception as e:
            # Per-model boundary: keep going with the remaining models, but
            # log the full traceback so the failure can be diagnosed.
            logger.exception(f"Failed to train {model_name}: {e}")
            results[model_name] = {'error': str(e)}

    # Save results (default=str stringifies anything json cannot encode)
    with open('pipeline_model_results.json', 'w') as f:
        json.dump(results, f, indent=2, default=str)

    logger.info("Pipeline evaluation completed!")
    return results


# Usage examples for Google Colab
if __name__ == "__main__":
    # Example usage in Google Colab:
    #
    # Option 1: Use your existing pipeline data (RECOMMENDED)
    #     results = evaluate_pipeline_models_gpu(
    #         data_path='/content/pipeline_features.parquet',  # Upload your processed parquet file
    #         sample_fraction=1.0,
    #         epochs=10
    #     )
    #
    # Option 2: If you have PySpark setup in Colab (advanced)
    #     First run your preprocessing pipeline to create the parquet file,
    #     then use the above approach.

    # Parquet part files to evaluate, one independent run per file.
    part_files = [
        'part-00000-a1634db0-8c09-4cb9-b1ca-466599ba8394-c000.snappy.parquet',
        'part-00001-a1634db0-8c09-4cb9-b1ca-466599ba8394-c000.snappy.parquet',
        'part-00002-a1634db0-8c09-4cb9-b1ca-466599ba8394-c000.snappy.parquet',
        'part-00003-a1634db0-8c09-4cb9-b1ca-466599ba8394-c000.snappy.parquet',
    ]

    # Each run overwrites both `results` and the JSON file written by
    # evaluate_pipeline_models_gpu, so keep every run's output keyed by its
    # part file. `results` still ends up holding the last run.
    all_results = {}
    for part_file in part_files:
        results = evaluate_pipeline_models_gpu(
            part_file,
            sample_fraction=1.0,
            epochs=5
        )
        all_results[part_file] = results

    # Persist the per-part results together so earlier runs are not lost.
    with open('pipeline_model_results_all_parts.json', 'w') as f:
        json.dump(all_results, f, indent=2, default=str)

    logger.info("Pipeline-compatible GPU deep learning models ready!")
    logger.info("Upload your 'pipeline_features.parquet' file to /content/ and run evaluation.")



Epoch 1/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.5203 - auc: 0.0000e+00 - loss: 0.6988 - learning_rate: 0.0010
Epoch 2/5


  current = self.get_monitor_value(logs)
  callback.on_epoch_end(epoch, logs)
  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.7907 - auc: 0.0000e+00 - loss: 0.5712 - learning_rate: 0.0010
Epoch 3/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.9076 - auc: 0.0000e+00 - loss: 0.4543 - learning_rate: 0.0010
Epoch 4/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.9593 - auc: 0.0000e+00 - loss: 0.3552 - learning_rate: 0.0010
Epoch 5/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.9822 - auc: 0.0000e+00 - loss: 0.2781 - learning_rate: 0.0010




Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5760 - auc: 0.4401 - loss: 0.6584
Epoch 1: val_loss improved from inf to 0.17509, saving model to /content/best_LSTM_pipeline_model.h5




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - accuracy: 0.5875 - auc: 0.4422 - loss: 0.6515 - val_accuracy: 0.9746 - val_auc: 0.4966 - val_loss: 0.1751 - learning_rate: 0.0010
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.8419 - auc: 0.4779 - loss: 0.4773
Epoch 2: val_loss did not improve from 0.17509
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.8461 - auc: 0.4784 - loss: 0.4717 - val_accuracy: 0.9746 - val_auc: 0.4964 - val_loss: 0.1774 - learning_rate: 0.0010
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.9279 - auc: 0.4952 - loss: 0.3413
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 3: val_loss did not improve from 0.17509
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.



Epoch 1/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.4702 - auc: 0.0000e+00 - loss: 0.7274 - learning_rate: 0.0010
Epoch 2/5


  current = self.get_monitor_value(logs)
  callback.on_epoch_end(epoch, logs)
  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.6937 - auc: 0.0000e+00 - loss: 0.6174 - learning_rate: 0.0010
Epoch 3/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.7966 - auc: 0.0000e+00 - loss: 0.5237 - learning_rate: 0.0010
Epoch 4/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.8934 - auc: 0.0000e+00 - loss: 0.4252 - learning_rate: 0.0010
Epoch 5/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.9565 - auc: 0.0000e+00 - loss: 0.3346 - learning_rate: 0.0010




Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.7784 - auc: 0.0000e+00 - loss: 0.5816
Epoch 1: val_loss improved from inf to 0.10028, saving model to /content/best_LSTM_pipeline_model.h5




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - accuracy: 0.7876 - auc: 0.0000e+00 - loss: 0.5752 - val_accuracy: 0.9999 - val_auc: 0.0000e+00 - val_loss: 0.1003 - learning_rate: 0.0010
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.9499 - auc: 0.0000e+00 - loss: 0.3985
Epoch 2: val_loss improved from 0.10028 to 0.09630, saving model to /content/best_LSTM_pipeline_model.h5




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.9520 - auc: 0.0000e+00 - loss: 0.3915 - val_accuracy: 0.9999 - val_auc: 0.0000e+00 - val_loss: 0.0963 - learning_rate: 0.0010
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.9869 - auc: 0.0000e+00 - loss: 0.2464
Epoch 3: val_loss improved from 0.09630 to 0.08983, saving model to /content/best_LSTM_pipeline_model.h5




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.9874 - auc: 0.0000e+00 - loss: 0.2426 - val_accuracy: 0.9999 - val_auc: 0.0000e+00 - val_loss: 0.0898 - learning_rate: 0.0010
Epoch 4/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.9955 - auc: 0.0000e+00 - loss: 0.1559
Epoch 4: val_loss improved from 0.08983 to 0.08560, saving model to /content/best_LSTM_pipeline_model.h5




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.9957 - auc: 0.0000e+00 - loss: 0.1534 - val_accuracy: 1.0000 - val_auc: 0.0000e+00 - val_loss: 0.0856 - learning_rate: 0.0010
Epoch 5/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.9985 - auc: 0.0000e+00 - loss: 0.0989
Epoch 5: val_loss did not improve from 0.08560
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.9985 - auc: 0.0000e+00 - loss: 0.0972 - val_accuracy: 1.0000 - val_auc: 0.0000e+00 - val_loss: 0.0858 - learning_rate: 0.0010
Restoring model weights from the end of the best epoch: 4.


