In [1]:
import librosa
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import pickle
from sklearn.model_selection import train_test_split

In [2]:
def extract_mfcc(filepath, n_mfcc=13, hop_length=512, n_fft=2048):
    """Return the time-averaged MFCC vector of a ``.wav`` file.

    Args:
        filepath: Path to the audio file, given as a ``str`` or any
            ``os.PathLike`` object (e.g. ``pathlib.Path``).
        n_mfcc: Number of MFCC coefficients to compute.
        hop_length: Hop length forwarded to ``librosa.feature.mfcc``.
        n_fft: FFT window size forwarded to ``librosa.feature.mfcc``.

    Returns:
        The mean of each MFCC coefficient taken over all frames, or
        ``None`` when ``filepath`` does not end in ``.wav``.
    """
    # Normalise path-like objects to a plain string so the suffix check
    # below works for pathlib.Path inputs as well as str.
    path = os.fspath(filepath)
    if not path.endswith('.wav'):
        return None

    # sr=None asks librosa to keep the file's native sampling rate.
    audio, sr = librosa.load(path, sr=None)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft)

    # Transpose so frames run along axis 0, then average over the frames to
    # collapse a variable-length clip into one fixed-size vector.
    return np.mean(mfcc.T, axis=0)

In [3]:
train_folder_path = '/content/drive/MyDrive/Assignment 2/train'
mfcc_features_list = []
file_names_list = []

# os.listdir returns entries in arbitrary order. Sorting keeps the row order
# of the feature matrix (and of the filename list that mirrors it) stable
# across runs, so anything derived from row positions is reproducible.
for filename in sorted(os.listdir(train_folder_path)):
    filepath = os.path.join(train_folder_path, filename)
    if filepath.endswith('.wav'):
        mfcc = extract_mfcc(filepath)
        if mfcc is not None:
            # Append to both lists together so index i always refers to
            # the same clip in each of them.
            mfcc_features_list.append(mfcc)
            file_names_list.append(filename)

# Convert the MFCC features list to a NumPy array (one row per clip)
mfcc_features_array = np.array(mfcc_features_list)

In [4]:
# Hold out 20% of the clips for validation. Features and filenames are split
# in the same call so the two stay aligned; the fixed random_state makes the
# shuffle repeatable.
mfcc_train, mfcc_val, filenames_train, filenames_val = train_test_split(
    mfcc_features_array, file_names_list, test_size=0.2, random_state=42
)

In [5]:
# Persist the full filename list (same order as the rows of
# mfcc_features_array) as a pickle file.
with open('/content/drive/MyDrive/train_filenames.pkl', 'wb') as filenames_file:
    pickle.dump(file_names_list, filenames_file)

In [6]:
# Sanity check: show the averaged MFCC vector of the first clip.
print(mfcc_features_array[0])

[-1.8044212e+02  1.3999611e+02 -8.1985598e+00  3.1531612e+01
 -4.2815495e+00  2.0610485e+01 -1.0084117e+01  1.3115306e+01
  5.2859783e-01  2.1011593e+00  1.3066685e-01  3.4481905e+00
  5.4831678e-01]


In [7]:
# Sanity check: rows are clips, columns are MFCC coefficients.
print(f"MFCC Features Array Shape: {mfcc_features_array.shape}")

MFCC Features Array Shape: (220, 13)


In [8]:
# Number of features per clip (columns of the MFCC matrix).
input_shape = mfcc_features_array.shape[1]

# Encoder: compress each feature vector down to a 32-unit bottleneck.
# `shape` must be a tuple; `(input_shape)` without the trailing comma is just
# an int, which newer Keras versions reject.
inputs = Input(shape=(input_shape,))
encoded = Dense(128, activation="relu")(inputs)
encoded = Dense(64, activation="relu")(encoded)
# Named so the bottleneck can be looked up later with get_layer().
encoded = Dense(32, activation="relu", name="encoder_layer")(encoded)

# Decoder: mirror the encoder back up to the input width.
decoded = Dense(64, activation="relu")(encoded)
decoded = Dense(128, activation="relu")(decoded)
# No activation on the output: MFCC values are unbounded and can be negative.
# The width follows input_shape so the model keeps working if the feature
# size changes.
decoded = Dense(input_shape)(decoded)

autoencoder = Model(inputs, decoded)

In [9]:
# Configure training: Adam optimizer, mean-squared reconstruction error.
autoencoder.compile(loss='mse', optimizer='adam')

# Fit the autoencoder. The inputs double as the targets because the network
# learns to reconstruct its own input; the held-out split is scored the same way.
autoencoder.fit(
    x=mfcc_train,
    y=mfcc_train,
    batch_size=32,
    epochs=100,
    validation_data=(mfcc_val, mfcc_val),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7ff5606d8e80>

In [10]:
# Build the encoder as a sub-model that shares the trained layers from the
# autoencoder's input up to the bottleneck layer.
encoder = Model(
    autoencoder.input,
    autoencoder.get_layer('encoder_layer').output,
)

# Map every row of the full feature matrix to its latent representation.
latent_representations = encoder.predict(mfcc_features_array)



In [11]:
# Write the latent representations (LRDB) to a pickle file.
with open('/content/drive/MyDrive/LRDB.pkl', 'wb') as lrdb_file:
    pickle.dump(latent_representations, lrdb_file)

In [12]:
# Save the encoder sub-model to a .keras file.
encoder.save('/content/drive/MyDrive/encoder_model.keras')