In [2]:
import librosa
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

## Loading the Dataset
We will use librosa to extract audio features (e.g., MFCCs) and pandas to handle the CSV labels.

In [3]:
# Where the audio files live and which CSV carries their labels
audio_folder = "../dataset_normalized"
labels_csv = "audio_durations_labels.csv"

# Read the label table into a DataFrame
labels_df = pd.read_csv(filepath_or_buffer=labels_csv)

# Function to extract audio features (MFCCs) from the audio file
def extract_mfcc(file_path, n_mfcc=13, max_len=400):
    """Load an audio file and return a fixed-length MFCC matrix.

    Args:
        file_path: Path of the audio file, passed straight to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute per frame.
        max_len: Number of frames in the result; shorter clips are
            zero-padded at the end, longer clips are cut off.

    Returns:
        Array of shape ``(max_len, n_mfcc)`` (frames first, coefficients second).
    """
    # sr=None keeps the file's own sampling rate instead of resampling
    signal, sample_rate = librosa.load(file_path, sr=None)
    coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

    # Number of frames missing to reach max_len (<= 0 means truncate)
    frames_missing = max_len - coeffs.shape[1]
    if frames_missing > 0:
        pad_spec = ((0, 0), (0, frames_missing))  # pad the frame axis only
        coeffs = np.pad(coeffs, pad_spec, mode='constant')
    else:
        coeffs = coeffs[:, :max_len]

    # librosa returns (n_mfcc, frames); flip to (frames, n_mfcc)
    return coeffs.T

# Map string labels to integers
label_map = {'short': 0, 'medium': 1, 'long': 2}

# Fail fast on labels that label_map cannot encode. Checking up front avoids
# decoding audio files only to hit a KeyError on a later row.
unknown_labels = set(labels_df['label']) - set(label_map)
if unknown_labels:
    raise KeyError(f"Labels missing from label_map: {sorted(unknown_labels, key=str)}")

# Extract MFCC features for every file and encode its label.
# Iterating over the two columns directly avoids the per-row Series that
# DataFrame.iterrows() builds.
features = []
labels = []
for file_name, label in zip(labels_df['file_name'], labels_df['label']):
    features.append(extract_mfcc(f"{audio_folder}/{file_name}"))
    labels.append(label_map[label])

# Stack into arrays: X holds one MFCC matrix per file, y the integer class ids
X = np.array(features)
y = np.array(labels)

# Train-test split: 30% of the samples are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Define the RNN Model in TensorFlow
We'll define the RNN model in TensorFlow using a single LSTM layer followed by a softmax output layer. Each input sample has shape (max_len, n_mfcc), i.e. (400, 13) with the defaults used by `extract_mfcc`.

In [5]:
# Model hyperparameters. They are defined in this cell so that it runs on a
# fresh kernel (Restart & Run All) instead of depending on names left over
# from a cell that is no longer in the notebook.
max_len, input_size = X_train.shape[1:]  # frames per sample, MFCC coefficients per frame
num_classes = len(label_map)             # one output unit per entry in label_map
hidden_size = 64  # NOTE(review): the original value is not recorded in this notebook; confirm

# Seed Python, NumPy and TensorFlow before the layers are created so that
# weight initialization and batch shuffling start from the same state each run.
tf.keras.utils.set_random_seed(42)

# Define the RNN model: a single LSTM over the MFCC sequence, then a softmax classifier
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(shape=(max_len, input_size)),
    # return_sequences=False: only the output at the last time step is passed on
    tf.keras.layers.LSTM(hidden_size, return_sequences=False),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model. The sparse loss is used because the labels are integer
# class ids (see label_map), not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model architecture
model.summary()


## Train the Model
Train the model on the training dataset (X_train, y_train) for a few epochs.

In [6]:
# Training hyperparameters
epochs = 10
batch_size = 16

# Fit on the training split; the held-out split is scored after every epoch
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Epoch 1/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 120ms/step - accuracy: 0.3476 - loss: 1.1819 - val_accuracy: 0.6432 - val_loss: 0.8475
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 119ms/step - accuracy: 0.6801 - loss: 0.7829 - val_accuracy: 0.8000 - val_loss: 0.6195
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 115ms/step - accuracy: 0.8199 - loss: 0.5621 - val_accuracy: 0.8432 - val_loss: 0.4953
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 115ms/step - accuracy: 0.8206 - loss: 0.4690 - val_accuracy: 0.8324 - val_loss: 0.4654
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 118ms/step - accuracy: 0.8094 - loss: 0.4667 - val_accuracy: 0.8432 - val_loss: 0.4344
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 117ms/step - accuracy: 0.8664 - loss: 0.3772 - val_accuracy: 0.8432 - val_loss: 0.4176
Epoch 7/10
[1m27/27[0m [3

## Save the Model
Once the model is trained, we export it in TensorFlow's SavedModel format so it can be converted to TensorFlow.js.

In [12]:
# Export the trained model in TensorFlow SavedModel format. The result is a
# directory named "rnn_model_tf" (not a single file) that exposes a 'serve'
# endpoint for inference.
model.export("rnn_model_tf")

INFO:tensorflow:Assets written to: rnn_model_tf/assets


INFO:tensorflow:Assets written to: rnn_model_tf/assets


Saved artifact at 'rnn_model_tf'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 400, 13), dtype=tf.float32, name='keras_tensor_3')
Output Type:
  TensorSpec(shape=(None, 3), dtype=tf.float32, name=None)
Captures:
  13762444128: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13762444480: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13762442368: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13762443424: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13762439904: TensorSpec(shape=(), dtype=tf.resource, name=None)
