In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
import librosa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import pywt
from multiprocessing import Pool, cpu_count
from functools import partial
import time

# Define paths and constants
TRAIN_DATA_PATH = '../dataset/'  # training root: one sub-folder per raga, folder name is the class label
TEST_DATA_PATH = '../Test/'  # flat folder of test files; the raga is the filename text before the first '_'
SAMPLE_RATE = 22050  # sample rate (Hz) every file is resampled to on load
DURATION = 20  # seconds of audio used per file (at most this much is loaded; shorter audio is zero-padded)
HOP_LENGTH = 512  # samples between the starts of consecutive analysis frames
FRAME_SIZE = 2048  # samples per analysis frame
WINDOW_SIZE = 80  # number of frames per window for LSTM input
HOP_FRAMES = 40  # frames between the starts of consecutive windows (50% overlap with WINDOW_SIZE = 80)
N_MFCC = 13  # number of MFCC coefficients
BATCH_SIZE = 32  # batch size for the training, validation and test datasets
MAX_EPOCHS = 50  # upper bound on epochs; the early-stopping callback may end training sooner
USE_MIXED_PRECISION = True  # request mixed_float16 training (only applied when a GPU is detected)
CACHE_FEATURES = True  # Cache extracted features to disk
FEATURES_CACHE_DIR = './features_cache_lstm/'  # Directory holding the cached .npz feature files

# Function definitions must be at the module level for proper pickling
def extract_sequential_features(y, sr):
    """Extract one feature vector per analysis frame for LSTM processing.

    The signal is cut into frames of ``FRAME_SIZE`` samples whose start
    positions are ``HOP_LENGTH`` samples apart. Only complete frames are used;
    samples after the last complete frame are ignored. For every frame the
    librosa features below are computed, averaged over the time axis of the
    librosa output, and concatenated in this order: MFCC, chroma, spectral
    centroid, spectral contrast, spectral rolloff, zero-crossing rate.

    Args:
        y: 1-D audio signal (a NumPy array in this file).
        sr: Sample rate of ``y`` in Hz; forwarded to librosa.

    Returns:
        ``np.ndarray`` of shape ``(n_frames, n_features)``. When ``y`` is
        shorter than one frame the result has shape ``(0, n_features)`` so
        callers can still read the feature width.
    """
    # Passed explicitly to librosa (the values are librosa's documented
    # defaults) so the feature width is known even when no frame is produced.
    n_chroma = 12
    n_bands = 6  # spectral_contrast yields n_bands + 1 rows
    n_features = N_MFCC + n_chroma + 1 + (n_bands + 1) + 1 + 1

    # NOTE(review): each librosa call below runs on a single short frame and
    # the result is averaged, which is very slow for long recordings.
    # Computing every feature once over the whole signal with
    # n_fft=FRAME_SIZE / hop_length=HOP_LENGTH would presumably be much
    # faster, but it changes the feature values, so it is not done here.
    features = []

    # "+ 1" keeps the frame that ends exactly on the last sample; without it
    # that frame is dropped whenever the length lines up with the hop.
    for start in range(0, len(y) - FRAME_SIZE + 1, HOP_LENGTH):
        frame = y[start:start + FRAME_SIZE]
        frame_features = []

        # 1. MFCC (tonal characteristics)
        mfcc = librosa.feature.mfcc(y=frame, sr=sr, n_mfcc=N_MFCC)
        frame_features.extend(np.mean(mfcc, axis=1))

        # 2. Chroma (pitch class distribution)
        chroma = librosa.feature.chroma_stft(y=frame, sr=sr, n_chroma=n_chroma)
        frame_features.extend(np.mean(chroma, axis=1))

        # 3. Spectral Centroid (brightness of sound)
        centroid = librosa.feature.spectral_centroid(y=frame, sr=sr)
        frame_features.append(np.mean(centroid))

        # 4. Spectral Contrast (difference between peaks and valleys)
        contrast = librosa.feature.spectral_contrast(y=frame, sr=sr, n_bands=n_bands)
        frame_features.extend(np.mean(contrast, axis=1))

        # 5. Spectral Rolloff (frequency below which most energy exists)
        rolloff = librosa.feature.spectral_rolloff(y=frame, sr=sr)
        frame_features.append(np.mean(rolloff))

        # 6. Zero Crossing Rate
        zcr = librosa.feature.zero_crossing_rate(frame)
        frame_features.append(np.mean(zcr))

        features.append(frame_features)

    if not features:
        # Keep the result two-dimensional even when there are no frames.
        return np.empty((0, n_features))

    return np.array(features)

def create_windowed_sequences(features, window_size=WINDOW_SIZE, hop_size=HOP_FRAMES):
    """Slice a per-frame feature matrix into fixed-length, overlapping windows.

    Args:
        features: Array-like of shape ``(n_frames, n_features)``.
        window_size: Number of frames in every returned window.
        hop_size: Number of frames between the starts of consecutive windows.

    Returns:
        ``np.ndarray`` of shape ``(n_windows, window_size, n_features)`` with
        at least one window:

        * enough frames: every complete window; trailing frames that do not
          fill a whole window are dropped;
        * fewer than ``window_size`` frames: one window, zero-padded at the end;
        * no frames: one all-zero window. Its feature width is taken from
          ``features`` when that is 2-D, and is 1 otherwise.
    """
    features = np.asarray(features)
    n_frames = len(features)

    # The range only yields start positions of complete windows, so no
    # per-window length check is needed.
    starts = range(0, n_frames - window_size + 1, hop_size)
    if len(starts) > 0:
        return np.array([features[start:start + window_size] for start in starts])

    if n_frames > 0:
        # Too short for a single window: pad with silent (all-zero) frames.
        padding = np.zeros((window_size - n_frames, features.shape[1]))
        return np.array([np.vstack([features, padding])])

    # No frames at all: keep the caller's feature width when it is known so the
    # placeholder window can be stacked with windows from other files.
    n_features = features.shape[1] if features.ndim == 2 else 1
    return np.zeros((1, window_size, n_features))

def process_audio_file_lstm(audio_path, cache_dir=None):
    """Turn one audio file into LSTM input windows, optionally using a disk cache.

    Args:
        audio_path: Path of the audio file to load.
        cache_dir: Directory for cached ``.npz`` results, or ``None`` to
            disable caching. It is created if it does not exist.

    Returns:
        The array produced by ``create_windowed_sequences`` for this file, or
        ``None`` when the file could not be loaded or processed (the error is
        printed).
    """
    cache_path = None
    if cache_dir is not None:
        # Key the cache on "<parent folder>_<file name>": the training set has
        # one folder per raga, so the file name alone can repeat across
        # folders and would return another file's features.
        parent = os.path.basename(os.path.dirname(os.path.abspath(audio_path)))
        cache_key = f"{parent}_{os.path.basename(audio_path)}".replace('.', '_')
        cache_path = os.path.join(cache_dir, cache_key + '_lstm.npz')

        if os.path.exists(cache_path):
            try:
                # The cache holds a plain numeric array, so pickle support is
                # not needed; the context manager closes the archive handle.
                with np.load(cache_path, allow_pickle=False) as cached_data:
                    return cached_data['sequences']
            except Exception as e:
                # A truncated or corrupt cache entry (e.g. from an interrupted
                # run) is treated as a miss and recomputed below.
                print(f"Ignoring unreadable cache {cache_path}: {e}")

    try:
        # Load audio file with resampling
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, duration=DURATION, res_type='kaiser_fast')

        # Zero-pad audio that is shorter than the expected duration
        target_len = DURATION * SAMPLE_RATE
        if len(y) < target_len:
            y = np.pad(y, (0, target_len - len(y)), 'constant')

        features = extract_sequential_features(y, sr)
        sequences = create_windowed_sequences(features)
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

    if cache_path is not None:
        try:
            os.makedirs(cache_dir, exist_ok=True)
            # Write to a temporary file and rename it so an interrupted run
            # never leaves a half-written entry under the final name. The
            # temporary name ends in ".npz" so NumPy does not append a suffix.
            tmp_path = cache_path + '.tmp.npz'
            np.savez_compressed(tmp_path, sequences=sequences)
            os.replace(tmp_path, cache_path)
        except OSError as e:
            # Caching is best-effort: the computed features are still returned.
            print(f"Could not write cache {cache_path}: {e}")

    return sequences

def load_training_data_lstm(data_path, cache_dir=None):
    """Load every training file and return its windows with integer labels.

    ``data_path`` must contain one sub-folder per raga. Folder names, sorted
    alphabetically, define the class ids. Files ending in ``.wav`` or ``.mp3``
    (any letter case) are processed one after another with
    ``process_audio_file_lstm``.

    Args:
        data_path: Root folder containing the raga sub-folders.
        cache_dir: Feature cache directory forwarded to
            ``process_audio_file_lstm``; ``None`` disables caching.

    Returns:
        Tuple ``(sequences, labels, label_map)``: ``sequences`` is the array of
        all windows, ``labels`` holds the class id of each window, and
        ``label_map`` maps raga folder name to class id.
    """
    # Sort folders and files so the sample order (and therefore any seeded
    # split done by the caller) does not depend on the file system's listing
    # order.
    raga_folders = sorted(
        entry for entry in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, entry))
    )
    label_map = {raga: class_id for class_id, raga in enumerate(raga_folders)}

    print(f"Found {len(raga_folders)} ragas: {raga_folders}")

    # Collect every audio file together with the class id of its folder
    audio_paths = []
    audio_labels = []
    for raga in raga_folders:
        raga_path = os.path.join(data_path, raga)
        print(f"Finding files for raga: {raga}")

        for audio_file in sorted(os.listdir(raga_path)):
            if audio_file.lower().endswith(('.wav', '.mp3')):
                audio_paths.append(os.path.join(raga_path, audio_file))
                audio_labels.append(label_map[raga])

    print(f"Processing {len(audio_paths)} files...")

    # Files are processed sequentially (no multiprocessing)
    start_time = time.time()
    results = [process_audio_file_lstm(path, cache_dir) for path in audio_paths]
    end_time = time.time()
    print(f"Feature extraction completed in {end_time - start_time:.2f} seconds")

    # Flatten the per-file windows; every window inherits its file's label
    sequences = []
    labels = []
    failed = 0
    for label, file_windows in zip(audio_labels, results):
        if file_windows is None:
            failed += 1
            continue
        sequences.extend(file_windows)
        labels.extend([label] * len(file_windows))

    if failed:
        print(f"Skipped {failed} files that could not be processed")

    sequences = np.array(sequences)
    labels = np.array(labels)

    print(f"Created {len(sequences)} sequence windows from {len(audio_paths)} audio files")
    print(f"Sequence shape: {sequences.shape}")

    return sequences, labels, label_map

def load_test_data_lstm(test_path, label_map, cache_dir=None):
    """Load the test files and keep track of which windows belong to which file.

    The raga of a test file is the part of its name before the first ``_``.
    Files whose raga is not a key of ``label_map`` are skipped with a message,
    as are files that fail to process.

    Args:
        test_path: Folder containing the test audio files (``.wav``/``.mp3``,
            any letter case).
        label_map: Mapping from raga name to class id.
        cache_dir: Feature cache directory forwarded to
            ``process_audio_file_lstm``; ``None`` disables caching.

    Returns:
        Tuple ``(sequences, labels, file_names, file_sequences)``:

        * ``sequences``: array of all windows of all loaded files;
        * ``labels``: class id of each window;
        * ``file_names``: names of the files that were loaded successfully;
        * ``file_sequences``: flat list ``[start_0, end_0, start_1, end_1, ...]``
          where ``sequences[start_i:end_i]`` are the windows of
          ``file_names[i]``.
    """
    sequences = []
    labels = []
    file_names = []
    file_sequences = []

    # Sorted so the order of files in the results is reproducible
    for file_name in sorted(os.listdir(test_path)):
        if not file_name.lower().endswith(('.wav', '.mp3')):
            continue

        # Raga name is the filename prefix up to the first underscore
        raga_name = file_name.split('_')[0]
        if raga_name not in label_map:
            print(f"Skipping {file_name}: raga '{raga_name}' is not a known label")
            continue

        file_windows = process_audio_file_lstm(os.path.join(test_path, file_name), cache_dir)
        if file_windows is None:
            continue

        # Record the name only for files that produced windows, so that
        # file_names stays in step with the (start, end) pairs below.
        file_names.append(file_name)
        file_sequences.append(len(sequences))  # start index
        sequences.extend(file_windows)
        labels.extend([label_map[raga_name]] * len(file_windows))
        file_sequences.append(len(sequences))  # end index (exclusive)

    return np.array(sequences), np.array(labels), file_names, file_sequences

def build_lstm_model(input_shape, num_classes):
    """Build and compile the bidirectional-LSTM classifier.

    Args:
        input_shape: ``(time_steps, features)`` of one input window.
        num_classes: Number of output classes.

    Returns:
        A compiled ``Sequential`` model that outputs class probabilities and
        is trained with sparse categorical cross-entropy on integer labels.
    """
    model = models.Sequential([
        # Explicit input layer instead of passing input_shape to the first layer
        layers.Input(shape=input_shape),

        # Bidirectional LSTM layers
        layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        layers.Bidirectional(layers.LSTM(64)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        # Dense layers for classification
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),

        # Output layer. Forced to float32 so the softmax stays numerically
        # stable when a mixed_float16 policy is active; under the default
        # float32 policy this changes nothing.
        layers.Dense(num_classes, activation='softmax', dtype='float32')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

def _plot_training_history(history):
    """Plot train/validation accuracy and loss and save 'lstm_training_history.png'."""
    plt.figure(figsize=(12, 4))

    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
    for position, (metric, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label='Train')
        plt.plot(history.history[f'val_{metric}'], label='Validation')
        plt.title(f'LSTM Model {title}')
        plt.xlabel('Epoch')
        plt.ylabel(title)
        plt.legend()

    plt.tight_layout()
    plt.savefig('lstm_training_history.png')
    plt.show()


def _majority_vote_per_file(pred_classes, true_labels, file_names, file_sequences):
    """Reduce per-window predictions to one majority-vote prediction per file.

    ``file_sequences`` is a flat list ``[start_0, end_0, start_1, end_1, ...]``
    of window index ranges. Files without any window are left out. Ties go to
    the smallest class id. Assumes ``file_names[i]`` belongs to the i-th
    (start, end) pair -- TODO confirm against the test-data loader.

    Returns three equally long lists: file names, true labels, predictions.
    """
    voted_names = []
    voted_true = []
    voted_pred = []

    spans = zip(file_sequences[0::2], file_sequences[1::2])
    for file_idx, (start_idx, end_idx) in enumerate(spans):
        window_preds = pred_classes[start_idx:end_idx]
        if len(window_preds) == 0:
            continue

        classes, counts = np.unique(window_preds, return_counts=True)
        voted_pred.append(classes[np.argmax(counts)])
        # Every window of a file carries the same true label
        voted_true.append(true_labels[start_idx])
        # Index by file position (not by position in the output) so names do
        # not shift when a file without windows is skipped.
        voted_names.append(file_names[file_idx])

    return voted_names, voted_true, voted_pred


def train_and_evaluate_lstm():
    """Train the LSTM raga classifier and evaluate it on the test folder.

    Steps: load (optionally cached) training windows, split them 80/20 into
    training and validation sets, train with early stopping, learning-rate
    reduction and checkpointing, save the model, then report window-level and
    file-level (majority vote) accuracy on the test files.

    Files written to the working directory: 'best_raga_lstm_model.h5',
    'lstm_training_history.png', 'raga_classification_lstm_model.h5' and,
    when test data is available, 'lstm_confusion_matrix.png' and
    'lstm_test_predictions.csv'.

    Returns:
        Tuple ``(model, label_map)``: the trained model and the mapping from
        raga name to class id.

    Raises:
        ValueError: If no training windows could be loaded.
    """
    cache_dir = FEATURES_CACHE_DIR if CACHE_FEATURES else None

    # Load training data
    start_time = time.time()
    X_train_sequences, y_train, label_map = load_training_data_lstm(
        TRAIN_DATA_PATH, cache_dir)

    print(f"Data loading completed in {time.time() - start_time:.2f} seconds")
    print(f"Training sequences shape: {X_train_sequences.shape}")

    if len(X_train_sequences) == 0:
        raise ValueError(f"No training sequences could be loaded from {TRAIN_DATA_PATH!r}")

    # NOTE(review): the split is made over windows, and windows of the same
    # recording overlap, so closely related windows can land on both sides.
    # Validation accuracy is therefore presumably optimistic; the test-set
    # figures below are the independent estimate.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_sequences, y_train, test_size=0.2, random_state=42, stratify=y_train)

    print(f"Training set shape: {X_train.shape}")
    print(f"Validation set shape: {X_val.shape}")

    # Build and compile the LSTM model
    input_shape = (X_train.shape[1], X_train.shape[2])  # (time_steps, features)
    model = build_lstm_model(input_shape, num_classes=len(label_map))
    model.summary()

    # Create TF datasets; shuffle comes after cache so every epoch is reshuffled
    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_ds = train_ds.cache().shuffle(buffer_size=1000).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))
    val_ds = val_ds.cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            patience=8, restore_best_weights=True, monitor='val_accuracy'),
        tf.keras.callbacks.ReduceLROnPlateau(
            factor=0.5, patience=4, min_lr=0.00001, monitor='val_loss'),
        tf.keras.callbacks.ModelCheckpoint(
            'best_raga_lstm_model.h5', save_best_only=True, monitor='val_accuracy')
    ]

    # Train the model
    training_start = time.time()
    history = model.fit(
        train_ds,
        epochs=MAX_EPOCHS,
        validation_data=val_ds,
        callbacks=callbacks
    )
    training_time = time.time() - training_start
    print(f"Model training completed in {training_time:.2f} seconds ({training_time/60:.2f} minutes)")

    _plot_training_history(history)

    # Save right after training so a problem during evaluation cannot lose
    # the trained model.
    model.save('raga_classification_lstm_model.h5')
    print("Model saved as 'raga_classification_lstm_model.h5'")

    # Evaluate on test data
    X_test_sequences, y_test, file_names, file_sequences = load_test_data_lstm(
        TEST_DATA_PATH, label_map, cache_dir)

    if len(X_test_sequences) == 0:
        print(f"No usable test files found in {TEST_DATA_PATH!r}; skipping evaluation")
        return model, label_map

    test_ds = tf.data.Dataset.from_tensor_slices((X_test_sequences, y_test)).batch(BATCH_SIZE)

    test_loss, test_acc = model.evaluate(test_ds)
    print(f"Test accuracy on sequence level: {test_acc:.4f}")

    # Predicted class of every window
    y_pred_seq = model.predict(X_test_sequences)
    y_pred_seq_classes = np.argmax(y_pred_seq, axis=1)

    # One prediction per file by majority vote over its windows
    voted_names, file_true_labels, file_predictions = _majority_vote_per_file(
        y_pred_seq_classes, y_test, file_names, file_sequences)

    file_acc = np.mean(np.array(file_predictions) == np.array(file_true_labels))
    print(f"Test accuracy on file level (majority voting): {file_acc:.4f}")

    # Class ids and names in id order. Passing the full id list to the metric
    # functions keeps matrix and report aligned with these names even when a
    # class does not occur in the test set.
    id_to_raga = {v: k for k, v in label_map.items()}
    class_ids = list(range(len(label_map)))
    class_names = [id_to_raga[i] for i in class_ids]

    # Confusion matrix at file level
    cm = confusion_matrix(file_true_labels, file_predictions, labels=class_ids)
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix (File Level)')
    plt.tight_layout()
    plt.savefig('lstm_confusion_matrix.png')
    plt.show()

    # Classification report at file level
    print("\nClassification Report (File Level):")
    class_report = classification_report(
        file_true_labels, file_predictions,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0
    )

    # Convert to DataFrame for better visualization
    df_report = pd.DataFrame(class_report).transpose()
    print(df_report)

    # Save file-level predictions to CSV
    results_df = pd.DataFrame({
        'File': voted_names,
        'True Raga': [id_to_raga[i] for i in file_true_labels],
        'Predicted Raga': [id_to_raga[i] for i in file_predictions],
        'Correct': np.array(file_true_labels) == np.array(file_predictions)
    })
    results_df.to_csv('lstm_test_predictions.csv', index=False)
    print("Prediction results saved to 'lstm_test_predictions.csv'")

    # Print performance summary
    print("\nPerformance Summary:")
    print(f"Total data processing and training time: {(time.time() - start_time)/60:.2f} minutes")
    print(f"Model training time: {training_time/60:.2f} minutes")
    print(f"Final test accuracy (sequence level): {test_acc:.4f}")
    print(f"Final test accuracy (file level): {file_acc:.4f}")

    return model, label_map

# Additional utility function to visualize LSTM attention on audio features
def visualize_sequence_attention(model, audio_path, label_map):
    """Placeholder for an attention visualization; currently does nothing.

    A real implementation needs a model that contains an attention mechanism,
    and its details depend on the attention layer that is used. None of the
    arguments are read at the moment.

    Returns:
        None.
    """
    return None

# Script entry point: configure the runtime, then train and evaluate the model.
# Kept under the __main__ guard so importing this module has no side effects.
if __name__ == "__main__":
    # Use the 'spawn' start method for multiprocessing (better compatibility,
    # especially on Windows and macOS)
    try:
        import multiprocessing
        multiprocessing.set_start_method('spawn')
    except RuntimeError:
        # The start method might already be set
        pass

    # Configure GPU memory growth first: it has to be set before the GPUs are
    # initialized, so it comes ahead of any other TensorFlow configuration.
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print(f"GPU(s) detected: {len(gpus)}")
        except RuntimeError as e:
            print(e)

    # Enable mixed precision training if a GPU is available
    if USE_MIXED_PRECISION and gpus:
        policy = tf.keras.mixed_precision.Policy('mixed_float16')
        tf.keras.mixed_precision.set_global_policy(policy)
        print("Mixed precision training enabled")

    # Create the cache directory; exist_ok avoids a separate existence check
    if CACHE_FEATURES:
        os.makedirs(FEATURES_CACHE_DIR, exist_ok=True)

    # Run the training process
    model, label_map = train_and_evaluate_lstm()

Found 10 ragas: ['Abhogi', 'Ahir Bhairav', 'Bageshree', 'Bhairavi', 'Bhoopali', 'Jog', 'Malhar', 'Shree', 'Todi', 'Yaman']
Finding files for raga: Ahir Bhairav
Finding files for raga: Malhar
Finding files for raga: Bageshree
Finding files for raga: Jog
Finding files for raga: Yaman
Finding files for raga: Bhoopali
Finding files for raga: Bhairavi
Finding files for raga: Todi
Finding files for raga: Shree
Finding files for raga: Abhogi
Processing 4144 files...


  return pitch_tuning(


KeyboardInterrupt: 