In [188]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Layer, Dense, Dropout
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, GlobalAveragePooling2D
import pickle
import numpy as np
from tensorflow.keras import backend as K
from sklearn.metrics import f1_score
import os


## Model Architecture Notes
- need to handle variable length inputs
- Input spectrograms or MFCCs
- use tf.data.Dataset

In [29]:
file_path = r"C:\Users\proba\OneDrive\Documents\Booz Training\generative-ai-audio-enhancement-classification\data\features\fold1\spectrograms.pickle"
with open(file_path, 'rb') as f:
    data = pickle.load(f)

data[0]

{'file': '101415-3-0-2.wav',
 'fold': 'fold1',
 'class': 3,
 'spectrogram': array([[7.5957165e+00, 1.1212744e+01, 8.0402737e+00, ..., 4.4006630e+01,
         1.2173186e+02, 6.7787331e+01],
        [7.7117748e+00, 7.3635635e+00, 7.2426586e+00, ..., 2.2728676e+01,
         5.1560234e+01, 1.8138807e+01],
        [4.9771862e+00, 1.8851723e+01, 2.3887272e+01, ..., 7.8501983e+00,
         1.3240112e+01, 1.4655676e+01],
        ...,
        [1.4012260e-02, 3.4747527e-03, 4.4790872e-12, ..., 3.9292393e-12,
         2.6693215e-07, 5.3465010e-06],
        [1.3874512e-02, 3.4418404e-03, 3.7548515e-12, ..., 4.9936058e-12,
         1.7793683e-07, 3.6044783e-06],
        [1.3787241e-02, 3.4209301e-03, 4.3137828e-12, ..., 1.9792165e-12,
         1.2118500e-07, 2.4911822e-06]], dtype=float32)}

In [144]:
spectrograms = []
labels = []
for row in data:
    spectrograms.append(row['spectrogram'])
    labels.append(row['class'])

target_shape = (128, 238)

# Pad each numpy array in each tuple
padded_data = []
for item in spectrograms:
    padded_item = tuple(
        pad_sequences(
            [array], maxlen=target_shape[1], dtype='float32', padding='post', truncating='post', value=0.0
        )[0]
        for array in item
    )
    padded_data.append(padded_item)




In [156]:
# Convert to TensorFlow dataset
one_hot_labels = tf.one_hot(labels, depth=10)

dataset = tf.data.Dataset.from_tensor_slices((padded_data, one_hot_labels.numpy()))

# Optionally batch the dataset
batch_size = 32
dataset = dataset.batch(batch_size=batch_size)

In [184]:
class ExtractPatches(Layer):
    def __init__(self, **kwargs):
        super(ExtractPatches, self).__init__(**kwargs)

    def call(self, inputs):
        shape = tf.shape(inputs)
        reshaped_data = tf.reshape(inputs, (-1, shape[1], shape[2], 1))
        patches = tf.image.extract_patches(reshaped_data, sizes=[1, 16, 16, 1], strides=[1, 8, 8, 1], rates=[1,1,1,1], padding='SAME')
        return patches

def embed(data):
   patches = ExtractPatches()(data)
   dense = Dense(128, activation='relu')(patches)
   return dense

# Build the Transformer Block

def transformer_block(x):
    # MultiHead Attention (add only one MultiHeadAttention with 2 heads, add droput and LayerNormalization)
    attention = MultiHeadAttention(2, key_dim = 128) (x, x)
    attention = Dropout(.2)(attention)
    out1 = LayerNormalization(epsilon=1e-6) (x + attention)

    # Feed Forward Network (add fully connected layers)
    ffn_output = Dense(128, activation = 'relu')(out1)
    ffn_output = Dropout(.2)(ffn_output)

    return LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

# Assemble full model

def build_model():
    input = Input(shape = (128,238))
    embedding_layer = embed(input)

    x = transformer_block(embedding_layer)
    x = GlobalAveragePooling2D()(x)
    x =  Dropout(0.1)(x)# add droput layer
    outputs = Dense(10, activation='sigmoid')(x)#dende layer

    model = Model(input, outputs) # define inputs and outputs
    model.compile(loss='categorical_crossentropy', optimizer=Adam(0.0002, 0.5), metrics=['accuracy', 'f1_score']) # define params
    return model

In [189]:
model = build_model()
# display architecture

In [191]:
os.chdir(r"C:\Users\proba\OneDrive\Documents\Booz Training\generative-ai-audio-enhancement-classification\data\features")
data_folds = []
label_folds = []
for root, dirs, files in os.walk("."):  
    for file in files:
        if "spectrograms" in file:
            folder = os.path.basename(root)
            i = int(folder[-1])
            relative_path = os.path.join(folder, file)
            with open(relative_path, 'rb') as f:
                data = pickle.load(f)

            specs = []
            lbls = []
            for row in data:
                specs.append(row['spectrogram'])
                lbls.append(row['class'])

            padded_data = []
            target_shape = (128, 238)
            specs = specs[0:50]
            lbls = lbls[0:50]
            for item in specs:
                padded_item = tuple(
                    pad_sequences(
                        [array], maxlen=target_shape[1], dtype='float32', padding='post', truncating='post', value=0.0
                    )[0]
                    for array in item
                )
                padded_data.append(padded_item)

            one_hot_labels = tf.one_hot(lbls, depth=10)
            data_folds.append(padded_data)
            label_folds.append(one_hot_labels)


In [194]:
num_folds = 10

fold_losses = []
fold_accuracies = []
fold_f1_scores = []
for fold in range(num_folds):
    train_data, val_data = [], []
    train_labels, val_labels = [], []
    for i, (data_fold, label_fold) in enumerate(zip(data_folds, label_folds)):
        if i == fold:
            val_data.extend(data_fold)
            val_labels.extend(label_fold)
        else:
            train_data.extend(data_fold)
            train_labels.extend(label_fold)

    # Convert lists to numpy arrays if needed
    train_data = np.array(train_data)
    val_data = np.array(val_data)
    train_labels = np.array(train_labels)
    val_labels = np.array(val_labels)

    

    train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
    # Optionally batch the dataset
    batch_size = 32
    train_dataset = train_dataset.batch(batch_size=batch_size)

    validation_dataset = tf.data.Dataset.from_tensor_slices((val_data, val_labels)).batch(batch_size=batch_size).shuffle(buffer_size=100)


    # Train your model on train_data and train_labels
    model.fit(train_dataset, epochs=10) #define params
    # Evaluate your model on val_data and val_labels
    results = model.evaluate(validation_dataset)
    fold_loss = results[0]
    fold_accuracy = results[1]

    fold_losses.append(fold_loss)
    fold_accuracies.append(fold_accuracy)
    fold_predictions = model.predict(val_data)
    val_labels_indices = np.argmax(val_labels, axis=1)
    fold_f1 = f1_score(val_labels_indices, fold_predictions.argmax(axis=1), average='macro')
    fold_f1_scores.append(fold_f1)
    # Calculate performance metric (e.g., accuracy) for this fold

# Calculate average performance metric across all folds
#average_accuracy = ...
avg_loss = sum(fold_losses) / len(fold_losses)
avg_accuracy = sum(fold_accuracies) / len(fold_accuracies)
avg_f1_score = sum(fold_f1_scores) / len(fold_f1_scores)

print("Average Loss:", avg_loss)
print("Average Accuracy:", avg_accuracy)
print("Average F1 Score:", avg_f1_score)


Epoch 1/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 184ms/step - accuracy: 1.0000 - f1_score: 0.8625 - loss: 0.0204
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 178ms/step - accuracy: 1.0000 - f1_score: 0.8625 - loss: 0.0211
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 227ms/step - accuracy: 1.0000 - f1_score: 0.8625 - loss: 0.0198
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 226ms/step - accuracy: 1.0000 - f1_score: 0.8625 - loss: 0.0172
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 250ms/step - accuracy: 1.0000 - f1_score: 0.8625 - loss: 0.0172
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 186ms/step - accuracy: 1.0000 - f1_score: 0.8625 - loss: 0.0152
Epoch 7/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 241ms/step - accuracy: 1.0000 - f1_score: 0.8625 - loss: 0.0145
Epoch 8/10
[1m15/15[0m [