In [4]:
#Libraries:
import pandas as pd
import numpy as np
import os
import time
import librosa
import librosa.display
import matplotlib.pyplot as plt
from pydub import AudioSegment
from scipy.ndimage import zoom
from keras.applications import VGG16
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import cv2 as cv
from keras import layers, models, Model
from keras.optimizers import Adam
import tensorflow as tf
from pydub import AudioSegment
import pickle

# DATA PREPARATION:

In [5]:
#------------------Function to create spectrogram from audio file------------------
def create_spectrogram(wav_path, sr=16000):

    y, sr = librosa.load(wav_path)
    # Create mel spectrogram
    mel_spect = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=128,  # Number of mel bands
        fmax=8000    # Maximum frequency
    )
    # Convert to log scale and return
    return np.array(librosa.power_to_db(mel_spect, ref=np.max))

#------------------Function to create spectrogram from audio file------------------
def resize_spectrogram(spectrogram, output_size):
    sp_row, sp_col = spectrogram.shape
    out_row, out_col = output_size
    resized_spec = zoom(spectrogram, (out_row/sp_row, out_col/sp_col))
    return resized_spec

#------------------Function to resize spectrogram------------------
def minmax_scaler(spectrogram):
    min_val = np.min(spectrogram)
    max_val = np.max(spectrogram)

    normalised_spectrogram = (spectrogram - min_val) / (max_val - min_val)

    return normalised_spectrogram
#------------------Function to normalise spectrogram------------------

def reshape_spectrogram(spectrogram):
    temp_spectogrram =  np.stack((spectrogram, spectrogram, spectrogram), axis=2)
    return temp_spectogrram

In [6]:
# Function to loop through all clip files and generate spectrograms
def get_features_model (folder_path):
    spectrograms = [] # This will store the spectrograms of each clip
    labels = []  # This will store the labels of each clip
    seconds = []  # Number of seconds to consider for each clip
    durations = []  # Duration of the full audio file
    podcast_names = []  # This will store the podcast names of each clip

    # Iterate over all files in the directory
    file_list = os.listdir(folder_path)
    print(f"Processing files: total {len(file_list)}")
    for filename in file_list:
        # Check if the file is a .wav or .mp3 (you can adjust this as needed)
        if filename.endswith('.wav') or filename.endswith('.mp3'):
            file_path = os.path.join(folder_path, filename)

            # Split the filename by underscore
            filename_parts = filename.split('_')

            # Extract 0 or 1 from the first part of the filename (label: ad or no_ad)
            is_ad = int(filename_parts[0])  # First part is the label

            # Extract the start time in seconds (second part of the filename)
            start_time = int(filename_parts[1])  # Second part is the start time in seconds

            # Extract the total duration (third part of the filename)
            duration = int(filename_parts[2])  # Third part is the total duration of the podcast

             # Extract the podcast name (four part of the filename)
            podcast_name = filename_parts[3].replace('.wav', '')  # Third part is the total duration of the podcast

            # Create spectrogram and convert to numpy array
            spectrogram = create_spectrogram(file_path)
            resized_spectrogram =resize_spectrogram(spectrogram, (224,224))
            scaled_spectrogram = minmax_scaler(resized_spectrogram)
            reshaped_spectrogram = reshape_spectrogram(scaled_spectrogram)

            # Append the numpy array to the list
            spectrograms.append(reshaped_spectrogram)
            labels.append(is_ad)
            seconds.append(start_time)
            durations.append(duration)
            podcast_names.append(podcast_name)

    return spectrograms, labels, seconds, durations, podcast_names

In [7]:
# List of folder paths
folder_paths = [
    # '../raw_data/5_sec_clips/changesinthebigapple',
    # '../raw_data/5_sec_clips/drewbarrymoreasksaboutboogers',
    # '../raw_data/5_sec_clips/farking&thelyingjester',
    '../raw_data/5_sec_clips/trevornoah',
    '../raw_data/5_sec_clips/ramitsethi',
    '../raw_data/5_sec_clips/glucosegoddess',
    '../raw_data/5_sec_clips/gabriellelyon',
    '../raw_data/5_sec_clips/eyedoctor',
    # '../raw_data/5_sec_clips/borisjohnson',
]

# Initialize a list to store all spectrograms
all_spectrograms = []

# Loop through each folder path
for folder_path in folder_paths:
    # Call the get_features_model function for each folder
    spectrograms = get_features_model(folder_path)

    # Append the results to all_spectrograms
    all_spectrograms.extend(spectrograms)

Processing files: total 9773
Processing files: total 6663
Processing files: total 6010
Processing files: total 6864
Processing files: total 6163


# MODEL

In [8]:
# Define X and y:
X = np.array(all_spectrograms[0])
y = np.array(all_spectrograms[1])

# Print the shape of X and y
print(X.shape)
print(y.shape)

(9773, 224, 224, 3)
(9773,)


In [9]:
# Split the data into training and testing sets: 80% training, 20% testing:
X_train, X_test, y_train, y_test = train_test_split(
    np.array(X),
    y, test_size=0.2,
    random_state=42,
    stratify=y # This will ensure that the distribution of labels is the same in both training and testing sets
    )

In [10]:
# MODEL:
def build_baseline_model(input_shape=(224,224,3), freeze_base=True):
    base_model = tf.keras.applications.VGG16(
        include_top=False,
        input_shape=input_shape,
        weights=None)
    base_model.trainable = freeze_base
    x = base_model.output

    #flatten
    x = layers.Flatten()(x)

    #dense layer for ad detection
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(0.5)(x)

    #output layer for ad detection
    output = layers.Dense(1, activation='sigmoid')(x)
    loss = 'binary_crossentropy'

    #create
    model = models.Model(
        inputs=base_model.input,
        outputs=output
    )

    #compile
    model.compile(
        loss=loss,
        optimizer='adam',
        metrics=['accuracy']
    )
    return model

In [11]:
# Base model with frozen weights:
model = build_baseline_model(
    input_shape=(224,224,3),
    freeze_base=True
)

In [12]:
# Execute the model:
history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=2,
    validation_data=(X_test,y_test))

Epoch 1/2
[1m489/489[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1407s[0m 3s/step - accuracy: 0.9808 - loss: 678.1646 - val_accuracy: 0.9836 - val_loss: 0.0835
Epoch 2/2
[1m489/489[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1419s[0m 3s/step - accuracy: 0.9822 - loss: 0.0969 - val_accuracy: 0.9836 - val_loss: 0.0925


In [None]:
model.save('latest_model.h5')

In [13]:
# Evaluate the model:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc}")

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 2s/step - accuracy: 0.9771 - loss: 0.1278
Test accuracy: 0.9836317300796509


# NEW FILE & OUTPUT FILE:

In [19]:
def detect_ads(podcast_file, model, clip_duration=5):
    """
    This function splits the podcast into clips, creates spectrograms, and passes them to the model to detect ads.
    podcast_file: Path to the podcast audio file (mp3)
    model: The trained model for ad detection
    clip_duration: Duration of each clip in seconds (default 5)
    return: List of ad segments (start_time, end_time) in seconds
    """

    # Load the podcast file
    podcast = AudioSegment.from_file(podcast_file) # Load the new podcast file
    podcast_duration = len(podcast) / 1000  # Duration in seconds

    # List to hold the ad segments
    ad_segments = []

    # Process the podcast in chunks of clip_duration seconds
    for i in range(0, int(podcast_duration), clip_duration):
        start_time = i * 1000  # Convert to milliseconds
        end_time = (i + clip_duration) * 1000

        # Extract the clip from the podcast
        clip = podcast[start_time:end_time]

        # Save the clip as a temporary wav file (for librosa to process)
        clip_file = "temp_clip.wav"
        clip.export(clip_file, format="wav")

        # Create a spectrogram for the clip
        spectrogram = create_spectrogram(clip_file) # We already have this function
        resized_spectrogram =resize_spectrogram(spectrogram, (224,224))
        scaled_spectrogram = minmax_scaler(resized_spectrogram)
        reshaped_spectrogram = reshape_spectrogram(scaled_spectrogram)

        # Convert the spectrogram to a numpy array and pass it to the model
        spectrogram_np = np.expand_dims(reshaped_spectrogram, axis=0)  # Add batch dimension
        prediction = model.predict(spectrogram_np) # Use the model to predict

        # If the model predicts 'ad' it will mark this segment as an ad (1)
        if prediction == 1:
            ad_segments.append((i, i + clip_duration))

        # Clean up the temporary file
        os.remove(clip_file)

    return ad_segments

# # Spectrogram creation function (we already have this)
# def create_spectrogram(audio_file_wav):
#     data, sample_rate = librosa.load(audio_file_wav, sr=None)
#     spectrogram = librosa.stft(data)
#     spectrogram_db = librosa.amplitude_to_db(abs(spectrogram))
#     return spectrogram_db

In [15]:
def remove_ads_from_podcast(podcast_file, ad_segments):
    """
    Removes the ad segments from the podcast and returns an ad-free podcast.
    podcast_file: Path to the podcast audio file
    ad_segments: List of tuples with (start_time, end_time) of ads in seconds
    return: An AudioSegment object, the podcast without ads
    """
    podcast = AudioSegment.from_file(podcast_file) # Load the podcast file
    podcast_duration = len(podcast)

    clean_podcast = AudioSegment.empty() # Create an empty AudioSegment object
    current_time = 0

    for ad_start, ad_end in ad_segments:
        ad_start_ms = ad_start * 1000 # Convert to milliseconds
        ad_end_ms = ad_end * 1000

        clean_podcast += podcast[current_time:ad_start_ms] # Add the non-ad segment to the clean podcast
        current_time = ad_end_ms  # Update the current time

    clean_podcast += podcast[current_time:podcast_duration]  # Add the last segment of the podcast

    return clean_podcast

In [21]:
# Example of using the functions:
podcast_file = "../raw_data/new_podcast_ceo/Boris Johnson - They Were Looking at Engineering The Virus.mp3" # Path to the new podcast file
ad_segments = detect_ads(podcast_file, model)  # Use trained model here
clean_podcast = remove_ads_from_podcast(podcast_file, ad_segments)

# Saving the ad-free podcast:
clean_podcast.export('podcast_without_ads.mp3', format='mp3')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 272ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<_io.BufferedRandom name='podcast_without_ads.mp3'>

In [22]:
model.save('model.h5')


