# In [None]:
import os
import numpy as np
import librosa
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.optimizers import Adam

# ---------------------- Parameters ----------------------
# Number of MFCC coefficients requested from librosa.feature.mfcc for each clip.
n_mfcc = 40
# Fixed number of MFCC frames kept per clip: extract_features zero-pads shorter
# clips on the right and truncates longer ones to this many frames.
max_pad_len = 174

# ---------------------- Feature Extraction ----------------------
def extract_features(file_path):
    """Return a fixed-width MFCC matrix for one audio file, or None on failure.

    The file is loaded with librosa (``kaiser_fast`` resampling) and converted
    to ``n_mfcc`` MFCC coefficients.  The second axis of the result is then
    forced to exactly ``max_pad_len`` columns: shorter outputs are right-padded
    with zeros, longer ones are cut off.

    Any exception raised along the way is reported on stdout and turned into a
    ``None`` return so the caller can skip the offending file.
    """
    try:
        signal, sr = librosa.load(file_path, res_type='kaiser_fast')
        coeffs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
        missing_frames = max_pad_len - coeffs.shape[1]
        if missing_frames > 0:
            # Too short: append zero-valued columns on the right.
            return np.pad(
                coeffs,
                pad_width=((0, 0), (0, missing_frames)),
                mode='constant',
            )
        # Long enough: keep only the leading max_pad_len columns.
        return coeffs[:, :max_pad_len]
    except Exception as e:
        print(f"❌ Error extracting features from {file_path}: {e}")
        return None

# ---------------------- Load and Split Data ----------------------
def load_and_split_data(folder_path):
    """Load MFCC features from a class-per-subfolder dataset and split 60/20/20.

    Every sub-directory of ``folder_path`` is treated as one class.  The class
    name is the directory name with any trailing ``Test...`` suffix removed.
    Each ``.wav`` file (extension matched case-insensitively) is passed to
    ``extract_features``; files for which it returns ``None`` are skipped.

    Args:
        folder_path: Directory containing one sub-directory per class.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, label_encoder)``.
        The ``X_*`` arrays are laid out as (samples, timesteps, features) and
        standardised with the mean/std of the *training* split only.  The
        ``y_*`` arrays hold integer class ids produced by ``label_encoder``.

    Raises:
        ValueError: If no audio file could be loaded, or (from scikit-learn)
            if a class has too few samples for a stratified split.
    """
    features = []
    labels = []
    print(f"\n📂 Loading data from: {folder_path}")

    # os.listdir order is platform-dependent; sorting keeps the sample order,
    # and therefore the seeded split below, reproducible across machines.
    for label in sorted(os.listdir(folder_path)):
        class_folder = os.path.join(folder_path, label)
        if not os.path.isdir(class_folder):
            continue
        clean_label = re.sub(r'\s*Test.*$', '', label).strip()
        for filename in sorted(os.listdir(class_folder)):
            # Accept ".WAV" etc. as well; such names are common on Windows.
            if not filename.lower().endswith(".wav"):
                continue
            mfcc = extract_features(os.path.join(class_folder, filename))
            if mfcc is not None:
                features.append(mfcc)
                labels.append(clean_label)

    print(f"✅ Loaded {len(features)} audio files from {folder_path}")
    if not features:
        # Fail with a clear message instead of NaN warnings followed by an
        # opaque transpose error on an empty array.
        raise ValueError(f"No usable .wav files found under {folder_path!r}")

    # Reshape for LSTM: (samples, timesteps, features)
    X = np.array(features).transpose((0, 2, 1))
    y = np.array(labels)

    # Encode labels
    label_encoder = LabelEncoder()
    y_enc = label_encoder.fit_transform(y)

    # Train-Val-Test split (60/20/20)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y_enc, test_size=0.4, stratify=y_enc, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
    )

    # Normalize with statistics from the training split only, so that no
    # information from the validation/test data leaks into preprocessing.
    mean = X_train.mean()
    std = X_train.std()
    if std == 0:
        # Constant features: avoid dividing by zero, just centre the data.
        std = 1.0
    X_train = (X_train - mean) / std
    X_val = (X_val - mean) / std
    X_test = (X_test - mean) / std

    return X_train, X_val, X_test, y_train, y_val, y_test, label_encoder

# ---------------------- Build Model ----------------------
def build_model(input_shape, num_classes):
    """Build and compile the stacked bidirectional-LSTM classifier.

    Architecture, in order: BiLSTM(128, returning sequences) -> BatchNorm ->
    Dropout(0.4) -> BiLSTM(64) -> BatchNorm -> Dropout(0.4) -> Dense(64, ReLU)
    -> Dropout(0.3) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one sample, passed to the first layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled model (Adam with learning rate 0.0005, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    stack = [
        # Recurrent block 1: emits the full sequence for the next LSTM.
        Bidirectional(LSTM(128, return_sequences=True), input_shape=input_shape),
        BatchNormalization(),
        Dropout(0.4),
        # Recurrent block 2: emits only the final state.
        Bidirectional(LSTM(64)),
        BatchNormalization(),
        Dropout(0.4),
        # Classification head.
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(stack)

    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=0.0005),
        metrics=['accuracy'],
    )
    return network

# ---------------------- Train and Evaluate ----------------------
def train_model(dataset_folder):
    """Train the classifier on ``dataset_folder`` and return the fitted model.

    Loads and splits the dataset with ``load_and_split_data``, builds the
    network with ``build_model``, fits it for 40 epochs (batch size 16) while
    monitoring the validation split, then prints the accuracy measured on the
    held-out test split.
    """
    print("📦 Loading and splitting data...")
    splits = load_and_split_data(dataset_folder)
    X_train, X_val, X_test, y_train, y_val, y_test, label_encoder = splits

    classifier = build_model(
        input_shape=X_train.shape[1:],
        num_classes=len(label_encoder.classes_),
    )

    print("\n🚀 Starting training...")
    classifier.fit(
        X_train,
        y_train,
        epochs=40,
        batch_size=16,
        validation_data=(X_val, y_val),
    )

    # Evaluate; the loss value is returned too but only accuracy is reported.
    print("\n📊 Evaluating on test data...")
    _test_loss, test_accuracy = classifier.evaluate(X_test, y_test)
    print(f"\n✅ Final test accuracy: {test_accuracy * 100:.2f}%")

    return classifier

# ---------------------- Run ----------------------
# NOTE(review): hard-coded absolute path on one Windows machine; change it
# before running anywhere else.
dataset_folder = r"C:\Users\Lakshay Aggarwal\Downloads\LSTM-20250508T035831Z-1-001\LSTM\dataset"
# Runs the whole load/train/evaluate pipeline as soon as this code executes
# and keeps the trained model in the module-level name `model`.
model = train_model(dataset_folder)