In [1]:
import pandas as pd
import numpy as np
import os
import time
import librosa
import librosa.display
import matplotlib.pyplot as plt
from pydub import AudioSegment
from scipy.ndimage import zoom
from keras.applications import VGG16
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
import cv2 as cv #REMEMBER TO ADD THIS TO REQUIREMENTS!!!!!!
from keras import layers, models, Model
from keras.optimizers import Adam


In [3]:
def create_spectrogram(wav_path, sr=16000):

    y, sr = librosa.load(wav_path)

    # Create mel spectrogram
    mel_spect = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=128,  # Number of mel bands
        fmax=8000    # Maximum frequency
    )

    # Convert to log scale and return
    return np.array(librosa.power_to_db(mel_spect, ref=np.max))

wav_path = '../raw_data/test/0_4753_4846_OffMenu263.wav'
spectrogram = create_spectrogram(wav_path)

In [4]:
spectrogram.shape

(128, 216)

In [6]:
def resize_spectrogram(spectrogram, output_size):
    sp_row, sp_col = spectrogram.shape
    out_row, out_col = output_size
    resized_spec = zoom(spectrogram, (out_row/sp_row, out_col/sp_col))
    return resized_spec

In [7]:
def minmax_scaler(spectrogram):
    min_val = np.min(spectrogram)
    max_val = np.max(spectrogram)

    normalised_spectrogram = (spectrogram - min_val) / (max_val - min_val)

    return normalised_spectrogram

In [19]:
def reshape_spectrogram(spectrogram):
    temp_spectogrram =  np.stack((spectrogram, spectrogram, spectrogram), axis=2)
    # final_spec = np.expand_dims(temp_spectogrram, axis=0)
    return temp_spectogrram

In [20]:
# Function to loop through all clip files and generate spectrograms
def get_features_model (folder_path):
    spectrograms = [] # This will store the spectrograms of each clip
    labels = []  # This will store the labels of each clip
    seconds = []  # Number of seconds to consider for each clip
    durations = []  # Duration of the full audio file
    podcast_names = []  # This will store the podcast names of each clip

    # Iterate over all files in the directory
    file_list = os.listdir(folder_path)
    print(f"Processing files: total {len(file_list)}")
    for filename in file_list:
        # Check if the file is a .wav or .mp3 (you can adjust this as needed)
        if filename.endswith('.wav') or filename.endswith('.mp3'):
            file_path = os.path.join(folder_path, filename)

            # Split the filename by underscore
            filename_parts = filename.split('_')

            # Extract 0 or 1 from the first part of the filename (label: ad or no_ad)
            is_ad = int(filename_parts[0])  # First part is the label

            # Extract the start time in seconds (second part of the filename)
            start_time = int(filename_parts[1])  # Second part is the start time in seconds

            # Extract the total duration (third part of the filename)
            duration = int(filename_parts[2])  # Third part is the total duration of the podcast

             # Extract the podcast name (four part of the filename)
            podcast_name = filename_parts[3].replace('.wav', '')  # Third part is the total duration of the podcast

            # Create spectrogram and convert to numpy array
            spectrogram = create_spectrogram(file_path)
            resized_spectrogram =resize_spectrogram(spectrogram, (96,64))
            scaled_spectrogram = minmax_scaler(resized_spectrogram)
            reshaped_spectrogram = reshape_spectrogram(scaled_spectrogram)

            # Append the numpy array to the list
            spectrograms.append(reshaped_spectrogram)
            labels.append(is_ad)
            seconds.append(start_time)
            durations.append(duration)
            podcast_names.append(podcast_name)

    return spectrograms, labels, seconds, durations, podcast_names

In [21]:
# Data folder path: Change this to the path where your audio clips are stored
folder_path = '../raw_data/OffMenu263'
all_spectrograms = get_features_model(folder_path)

Processing files: total 4800


In [32]:
X = np.array(all_spectrograms[0])
y = np.array(all_spectrograms[1])

In [33]:
np.array(X).shape

(4800, 96, 64, 3)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, test_size=0.2, random_state=42)

In [35]:
X_train

array([[[[0.6409163 , 0.6409163 , 0.6409163 ],
         [0.38479653, 0.38479653, 0.38479653],
         [0.37839317, 0.37839317, 0.37839317],
         ...,
         [0.19931024, 0.19931024, 0.19931024],
         [0.19428426, 0.19428426, 0.19428426],
         [0.57369953, 0.57369953, 0.57369953]],

        [[0.72761166, 0.72761166, 0.72761166],
         [0.6307602 , 0.6307602 , 0.6307602 ],
         [0.67127293, 0.67127293, 0.67127293],
         ...,
         [0.2568575 , 0.2568575 , 0.2568575 ],
         [0.50509113, 0.50509113, 0.50509113],
         [0.6777614 , 0.6777614 , 0.6777614 ]],

        [[0.871038  , 0.871038  , 0.871038  ],
         [0.7902225 , 0.7902225 , 0.7902225 ],
         [0.6498148 , 0.6498148 , 0.6498148 ],
         ...,
         [0.69515383, 0.69515383, 0.69515383],
         [0.69425166, 0.69425166, 0.69425166],
         [0.67825186, 0.67825186, 0.67825186]],

        ...,

        [[0.3666606 , 0.3666606 , 0.3666606 ],
         [0.31119344, 0.31119344, 0.31119344]

VGG16
(96,64) optimal shape

base_model = 
include_top=False (you don't want top tlayers)
model = models,Sequential() *add this layer sigmoid activation at end

X = np.random.rand(samplenum, 96, 64, 1)

In [None]:
def plot_spectrogram(spectrogram_db, sr, hop_length):
    plt.figure(figsize=(20, 10))
    librosa.display.specshow(spectrogram_db, sr=sr, hop_length=hop_length)
    plt.title('Spectrogram')
    plt.tight_layout()
    # plt.savefig(output_path)
    # plt.close()
plot_spectrogram(spectrogram, 16000, 1)
spectrogram.shape

# VGG-16 Model
# Import VGG16 and set the necessary arguments:

In [48]:
base_model = VGG16(weights='imagenet',
                  include_top=False,
                  input_shape=(96,64,3))

Freeze convolutional blocks

In [49]:
for layer in base_model.layers:
    layer.trainable = False
# Make sure you have frozen the correct layers
for i, layer in enumerate(base_model.layers):
    print(i, layer.name, layer.trainable)

0 input_layer_1 False
1 block1_conv1 False
2 block1_conv2 False
3 block1_pool False
4 block2_conv1 False
5 block2_conv2 False
6 block2_pool False
7 block3_conv1 False
8 block3_conv2 False
9 block3_conv3 False
10 block3_pool False
11 block4_conv1 False
12 block4_conv2 False
13 block4_conv3 False
14 block4_pool False
15 block5_conv1 False
16 block5_conv2 False
17 block5_conv3 False
18 block5_pool False


In [None]:
# # x = base_model.output
# model = base_model.Sequential()

# # First convolutional layer
# model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(X.shape[1], X.shape[2], X.shape[3])))
# model.add(layers.MaxPooling2D((2, 2)))

# # Second convolutional layer
# model.add(layers.Conv2D(64, (3, 3), activation='relu'))
# model.add(layers.MaxPooling2D((2, 2)))

# # Third convolutional layer
# model.add(layers.Conv2D(64, (3, 3), activation='relu'))

# # Flatten the output and add Dense layers
# model.add(layers.Flatten())
# model.add(layers.Dense(64, activation='relu'))
# model.add(layers.Dense(1, activation='sigmoid'))

In [38]:
(X_train.shape[1], X_train.shape[2], X_train.shape[3])

(96, 64, 3)

In [132]:
# def load_vgg_model():
#     base_model = tf.keras.applications.VGG16(
#         include_top=False,
#         input_shape=(96, 64, 1),
#         weights=None)

#     # specific layers for ad detection
#     model = models.Sequential([
#         base_model,
#         layers.Flatten(),
#         layers.Dense(128, activation='relu'),
#         layers.Dropout(0.5),
#         layers.Dense(1, activation='sigmoid')  # ad vs. non-ad
#     ])

#     return model

In [None]:
# load_vgg_model()

In [57]:
base_model.input

<KerasTensor shape=(None, 96, 64, 3), dtype=float32, sparse=False, name=keras_tensor_37>

In [58]:
x = base_model.output

# First convolutional layer
x = layers.Conv2D(32,(3, 3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], X_train.shape[3]))(x)
x = layers.MaxPooling2D(2, 2)(x)

# # Second convolutional layer
# x = layer.Conv2D(64, (3, 3), activation='relu')
# x = layer.MaxPooling2D((2, 2))

# # Third convolutional layer
# x = layer.Conv2D(64, (3, 3), activation='relu')

# create basic convo layer + maxpooling
x = layers.Flatten()(x) # Flatten dimensions for use in FC layers
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.5)(x) # Dropout layer to reduce overfitting
# x = layer.Dense(256, activation='relu')(x)
x = layers.Dense(8, activation='sigmoid')(x)
transfer_model = Model(inputs=base_model.input, outputs=x)

# # why is it useful to have pooling layer after convolutional layer?

Compile and fit

In [56]:
# learning_rate= 0.01
transfer_model.compile(loss="categorical_crossentropy",
                       optimizer='adam',
                       metrics=["accuracy"])
history = transfer_model.fit(X_train,
                             y_train,
                             batch_size = 1,
                             epochs=50,
                             validation_data=(X_test,y_test))

Epoch 1/50


ValueError: Exception encountered when calling Conv2D.call().

[1mNegative dimension size caused by subtracting 3 from 2 for '{{node functional_5_1/conv2d_5_1/convolution}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](functional_5_1/block5_pool_1/MaxPool2d, functional_5_1/conv2d_5_1/convolution/ReadVariableOp)' with input shapes: [1,3,2,512], [3,3,512,32].[0m

Arguments received by Conv2D.call():
  • inputs=tf.Tensor(shape=(1, 3, 2, 512), dtype=float32)