In [None]:
# import required libraries
import math
import os
import random

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib.patches import Patch
from sklearn.metrics import average_precision_score, PrecisionRecallDisplay
from sklearn.utils import class_weight
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, Dropout, BatchNormalization, SpatialDropout2D
from tensorflow.keras.models import Sequential
tf.get_logger().setLevel('ERROR')

In [None]:
# Step 1: Upload kaggle.json and set permissions
from google.colab import files
files.upload()  # upload kaggle.json

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Step 2: Download dataset
!kaggle datasets download -d gray8ed/audio-dataset-of-low-flying-aircraft-aerosonicdb

# Step 3: Unzip dataset
!unzip audio-dataset-of-low-flying-aircraft-aerosonicdb.zip -d ./audio_data

# Step 4: Use it
import os
DATA_DIR = './audio_data'
print(os.listdir(DATA_DIR))

In [None]:
DATA_DIR = '/content/audio_data'

# take a look at the directory files and structure:
# print the contents of the dataset root and of each nested folder in turn
for sub in ('', '/audio', '/audio/audio', '/env_audio', '/env_audio/env_audio'):
    print(os.listdir(DATA_DIR + sub))

In [None]:
# set a path to the audio/audio directory
# (the class sub-folders "0" and "1" are read from here in later cells)
AUDIO_DIR = os.path.join(DATA_DIR, 'audio/audio')

# set a path to the env_audio/env_audio directory
# NOTE(review): ENV_DIR is not referenced again in the visible cells
ENV_DIR = os.path.join(DATA_DIR, 'env_audio/env_audio')

In [None]:
# list the class folders inside the audio directory, then count the files in each:
# "0" is the negative class, "1" the positive class
print(os.listdir(AUDIO_DIR))

for label in ('0', '1'):
    class_dir = os.path.join(AUDIO_DIR, label)
    n_files = len(os.listdir(class_dir))
    print(f'Class {label} contains {n_files} samples')

In [None]:
# load the sample_meta.csv file (one row of metadata per audio sample) for a look
df = pd.read_csv(os.path.join(DATA_DIR, 'sample_meta.csv'))
# sanity check on the number of samples in each class
# (last expression in the cell, so the notebook displays the counts)
df['class'].value_counts()

In [None]:
# take a look at all of the columns/labels available for each sample
# (later cells rely on the 'filename', 'class' and 'fold' columns)
df.columns

In [None]:
## Fetch a random file from each class
# os.listdir returns entries in arbitrary order, so the listings are sorted before
# sampling; otherwise the fixed seed would not select the same files on every run
random.seed(42)
NEG_FILE = random.sample(sorted(os.listdir(os.path.join(AUDIO_DIR, '0'))), 1)[0]
POS_FILE = random.sample(sorted(os.listdir(os.path.join(AUDIO_DIR, '1'))), 1)[0]
print(NEG_FILE)
print(POS_FILE)

In [None]:
# define a function to build a filepath from a filename and class combination
def get_audio_path(df, filename):
    """Build the on-disk path of a clip from its metadata row.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata table with at least 'filename' and 'class' columns.
    filename : str
        Name of the audio file to look up.

    Returns
    -------
    tuple
        (filepath, fclass): the path under AUDIO_DIR/<class>/ and the class value
        read from the first metadata row that matches the filename.

    Raises
    ------
    IndexError
        If no row of `df` matches `filename`.
    """
    # "fclass" == file class; the class doubles as the name of the sub-folder
    matching_classes = df.loc[df['filename'] == filename, 'class']
    fclass = matching_classes.values[0]
    return os.path.join(AUDIO_DIR, str(fclass), filename), fclass

In [None]:
# check the function above works with our example files
# (each call prints a (filepath, class) tuple)
print(get_audio_path(df=df, filename=POS_FILE))
print(get_audio_path(df=df, filename=NEG_FILE))

In [None]:
# function to load a file, show its waveform and return a player for it
def load_show_audio(filename):
    """Plot a clip's waveform and return an inline audio player for it.

    Relies on `librosa.display` being imported explicitly (see the import cell):
    `import librosa` alone does not expose the submodule on every librosa release.

    Parameters
    ----------
    filename : str
        Name of an audio file listed in the module-level metadata table `df`.

    Returns
    -------
    IPython.display.Audio
        Player widget for the file; shown when it is the last value of a cell.
    """
    # only the path is needed here; the class label is discarded
    path, _ = get_audio_path(df=df, filename=filename)
    signal, sr = librosa.load(path)
    print(f'{filename} sample rate: {sr}')
    plt.figure(figsize=(6, 3))
    librosa.display.waveshow(y=signal, sr=sr)
    plt.show()
    return ipd.Audio(path)

In [None]:
# load and play the positive/aircraft example
# (the returned player is the cell's last expression, so the notebook renders it)
load_show_audio(filename=POS_FILE)

In [None]:
# load and play the negative/silence example
# (the returned player is the cell's last expression, so the notebook renders it)
load_show_audio(filename=NEG_FILE)

In [None]:
# constants shared by feature extraction, training and inference
SR = 22050          # sample rate (Hz) the audio is expected to be loaded at
DURATION = 5        # length of one segment in seconds
SAMPLES_PER_SEGMENT = SR * DURATION  # samples in one full segment

N_FFT = 2048        # FFT window length in samples (bins spaced SR / N_FFT, about 10.8 Hz apart)
HOP_LENGTH = 1024   # samples between successive frames (half a window)
N_MELS = 128        # number of mel frequency bins in the spectrogram

# whole hops that fit in one segment == spectrogram frames expected per segment
EXP_VECTORS_PER_SEGMENT = SAMPLES_PER_SEGMENT // HOP_LENGTH
EXP_INPUT_SHAPE = (N_MELS, EXP_VECTORS_PER_SEGMENT)  # (mel bins, frames)
print('Expected spectrogram shape:', EXP_INPUT_SHAPE)

In [None]:
# function to load a file and chop it into spectrograms equal to the segment length
def audio_to_spectrogram(filename):
    """Cut one clip into fixed-length segments and return a dB mel spectrogram for each.

    Parameters
    ----------
    filename : str
        Name of an audio file listed in the module-level metadata table `df`.

    Returns
    -------
    tuple
        (specs, fclass): a list of arrays of shape (N_MELS, EXP_VECTORS_PER_SEGMENT),
        one per segment, and the class of the file taken from the metadata.

    Raises
    ------
    ValueError
        If the loaded sample rate differs from SR.
    """
    path, fclass = get_audio_path(df=df, filename=filename)
    signal, sr = librosa.load(path)

    if sr != SR:
        raise ValueError('Sample rate mismatch between audio and target')

    # every segment is cut one hop short of SAMPLES_PER_SEGMENT before it is transformed
    segment_len = SAMPLES_PER_SEGMENT - HOP_LENGTH

    # spectrograms collected for this clip
    specs = []

    # one segment starts every SAMPLES_PER_SEGMENT samples, including a partial last one
    for start in range(0, len(signal), SAMPLES_PER_SEGMENT):
        mel = librosa.feature.melspectrogram(y=signal[start:start + segment_len],
                                             sr=sr,
                                             n_fft=N_FFT,
                                             hop_length=HOP_LENGTH,
                                             n_mels=N_MELS,
                                             window='hann')
        db_spec = librosa.power_to_db(mel, ref=0.0)

        n_frames = db_spec.shape[1]
        if n_frames > EXP_VECTORS_PER_SEGMENT:
            # wider than expected: left out of the result
            continue

        if n_frames < EXP_VECTORS_PER_SEGMENT:
            # short segment: zero-pad on the right up to the expected width
            n_short = EXP_VECTORS_PER_SEGMENT - n_frames
            db_spec = np.pad(db_spec, [(0, 0), (0, n_short)], 'constant')

        specs.append(db_spec)

    return (specs, fclass)

In [None]:
# double check the segmentation, spectrogram and padding are working correctly on a single file
specs, fclass = audio_to_spectrogram(POS_FILE)

# squeeze=False keeps `axes` two-dimensional even when the clip yields a single
# segment; without it plt.subplots returns a bare Axes that cannot be indexed
fig, axes = plt.subplots(1, len(specs), sharey='row', figsize=(11, 3), squeeze=False)

# one panel per segment, left to right (the Axes array itself is left untouched)
for ax, spec in zip(axes[0], specs):
    librosa.display.specshow(spec, ax=ax)

plt.show()

In [None]:
# function to apply min-max scaling to squeeze spectrogram values between 0 and 1
def normalise_array(array):
    """Min-max scale an array so its values lie between 0 and 1.

    Parameters
    ----------
    array : array-like
        Values to scale; converted with np.asarray. Must not be empty.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with the minimum mapped to 0 and the maximum to 1.
        An array whose values are all equal is mapped to all zeros.

    Raises
    ------
    ValueError
        If `array` is empty (raised by the min reduction).
    """
    array = np.asarray(array)
    min_val = array.min()
    span = array.max() - min_val

    # A constant array has zero span, and 0 / 0 would fill the result with NaN.
    # Every shifted value is already 0 in that case, so dividing by 1 yields zeros.
    if span == 0:
        span = 1

    norm_array = (array - min_val) / span

    return norm_array

In [None]:
# wrapper function to take a list of files and extract their features
# -> array of features (X) and array of corresponding labels (y)
def preprocess(file_list):
    """Extract normalised spectrogram features and labels for a list of files.

    Parameters
    ----------
    file_list : iterable of str
        Filenames to run through audio_to_spectrogram.

    Returns
    -------
    tuple
        (X, y): X stacks one normalised spectrogram per segment, y holds the class
        of the file each segment came from, in the same order.
    """
    features, labels = [], []

    for file in file_list:
        specs, fclass = audio_to_spectrogram(filename=file)

        # each segment is scaled on its own and inherits its file's label
        features.extend(normalise_array(spec) for spec in specs)
        labels.extend([fclass] * len(specs))

    X = np.asarray(features)
    y = np.asarray(labels)

    return X, y

In [None]:
# split the filenames into training, validation and testing portions by fold label
# (fold labels are compared as strings)
train = df.loc[
    (df['fold'] == '1') | (df['fold'] == '2') | (df['fold'] == '3') | (df['fold'] == '4'),
    'filename',
].reset_index(drop=True)  # folds 1, 2, 3 and 4 for training
val = df.loc[df['fold'] == '5', 'filename'].reset_index(drop=True)  # fold 5 for validation
test = df.loc[df['fold'] == 'test', 'filename'].reset_index(drop=True)  # held-out test set

# report the size of each split
for split_name, split in (('TRAIN', train), ('VALIDATION', val), ('TEST', test)):
    print(f'The "{split_name}" set contains {split.shape[0]} samples.')

In [None]:
# preprocess the train set
X_train, y_train = preprocess(train)

# preprocess the validation set
X_val, y_val = preprocess(val)

# preprocess the held-out test set
X_test, y_test = preprocess(test)

In [None]:
# check the shape of the output equals the expected shape of the spectrogram
# (only the first training sample is compared; the cell displays True or False)
X_train[0].shape == EXP_INPUT_SHAPE

In [None]:
# set a random seed for reproducibility
tf.keras.utils.set_random_seed(42)


# define the model architecture: three conv -> max-pool -> spatial-dropout stages
# (batch normalisation after the first pool only), then a small dense head
model = Sequential()
# the input is the expected spectrogram shape plus a trailing channel axis of size 1,
# derived from EXP_INPUT_SHAPE so it cannot drift from the feature-extraction constants
model.add(Conv2D(16, (3, 3), activation='relu', input_shape=(*EXP_INPUT_SHAPE, 1)))
model.add(MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
model.add(BatchNormalization())
model.add(SpatialDropout2D(0.5))

model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
model.add(SpatialDropout2D(0.5))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
model.add(SpatialDropout2D(0.5))

model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# single sigmoid unit: the output is the predicted probability of class 1
model.add(Dense(1, activation='sigmoid'))

# compile model: binary cross-entropy loss, tracked with area under the precision-recall curve
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='BinaryCrossentropy',
              metrics=[tf.keras.metrics.AUC(curve='PR', name='PR-AUC')])

In [None]:
# train for 50 epochs in mini-batches of BATCH_SIZE, evaluating on the validation split;
# the returned history object is kept in `hist` for the plots below
BATCH_SIZE = 32
hist = model.fit(x=X_train,
                 y=y_train,
                 epochs=50,
                 validation_data=(X_val, y_val),
                 class_weight={0: 3, 1:1},  # class 0 is weighted three times as heavily as class 1
                 verbose=1,
                 batch_size=BATCH_SIZE)

In [None]:
# write the trained model to disk so it can be reloaded for inference
model.save("aircraft_detector_model.h5")

In [None]:
# training (red) vs validation (blue) loss per epoch, labelled so the curves can be told apart
fig, ax = plt.subplots()
ax.plot(hist.history['loss'], 'r', label='train')
ax.plot(hist.history['val_loss'], 'b', label='validation')
ax.set(title='Loss', xlabel='epoch', ylabel='loss')
ax.legend()
plt.show()

In [None]:
# training (red) vs validation (blue) PR-AUC per epoch, labelled so the curves can be told apart
fig, ax = plt.subplots()
ax.plot(hist.history['PR-AUC'], 'r', label='train')
ax.plot(hist.history['val_PR-AUC'], 'b', label='validation')
ax.set(title='PR-AUC', xlabel='epoch', ylabel='PR-AUC')
ax.legend()
plt.show()

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top
from tensorflow.keras.models import load_model

# reload the model from the file written by model.save in the earlier cell
model = load_model('aircraft_detector_model.h5')

In [None]:
import numpy as np
import librosa

def preprocess_audio(file_path):
    """Turn the first segment of an audio file into a model-ready input tensor.

    Follows the same steps as the training features (audio_to_spectrogram followed by
    normalise_array): the segment is cut one hop short, converted to a dB mel
    spectrogram with ref=0.0, zero-padded on the right when short, then min-max scaled.

    Parameters
    ----------
    file_path : str or path-like
        Audio file to load; it is resampled to 22050 Hz on load.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (1, 128, 107, 1): batch, mel bins, frames, channel.

    Raises
    ------
    ValueError
        If the file contains no audio samples.
    """
    SR = 22050
    DURATION = 5
    SAMPLES_PER_SEGMENT = SR * DURATION
    N_FFT = 2048
    HOP_LENGTH = 1024
    N_MELS = 128
    EXP_VECTORS_PER_SEGMENT = int(np.floor(SAMPLES_PER_SEGMENT / HOP_LENGTH))

    y, sr = librosa.load(file_path, sr=SR)

    if len(y) == 0:
        raise ValueError(f'No audio samples found in {file_path}')

    # With librosa's default centred framing a signal of n samples yields
    # 1 + n // HOP_LENGTH frames, so a full 110250-sample segment gives 108 frames,
    # one more than the 107 the model was built for. Dropping the last hop of samples,
    # as the training pipeline does, gives exactly 107 for a full-length segment.
    y = y[:SAMPLES_PER_SEGMENT - HOP_LENGTH]

    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT,
                                              hop_length=HOP_LENGTH, n_mels=N_MELS,
                                              window='hann')
    # ref=0.0 matches training, so the zero padding below means the same thing here
    log_mel_spec = librosa.power_to_db(mel_spec, ref=0.0)

    # a clip shorter than one segment is zero-padded on the right, as in training
    n_short = EXP_VECTORS_PER_SEGMENT - log_mel_spec.shape[1]
    if n_short > 0:
        log_mel_spec = np.pad(log_mel_spec, [(0, 0), (0, n_short)], 'constant')

    # Normalize between 0 and 1; a constant spectrogram has zero span and would
    # otherwise become all-NaN, so it is mapped to zeros instead
    min_val = log_mel_spec.min()
    span = log_mel_spec.max() - min_val
    if span == 0:
        span = 1
    norm_spec = (log_mel_spec - min_val) / span

    # Add batch and channel dimension for model input: (1, height, width, 1)
    input_tensor = norm_spec[np.newaxis, :, :, np.newaxis].astype(np.float32)

    return input_tensor

# Load your model
# NOTE(review): repeats the import and load_model call from the earlier cell;
# `model` is the module-level name that predict_audio_class reads
from tensorflow.keras.models import load_model
model = load_model('aircraft_detector_model.h5')

def predict_audio_class(file_path):
    """Classify one audio file with the module-level `model`.

    Parameters
    ----------
    file_path : str or path-like
        Audio file passed on to preprocess_audio.

    Returns
    -------
    tuple
        (predicted_class, confidence): predicted_class is 1 when the model output
        is at least 0.5 and 0 otherwise; confidence is the raw model output itself
        (the score for class 1), whichever class was predicted.
    """
    batch = preprocess_audio(file_path)
    scores = model.predict(batch)
    # single input and single output unit, so the score sits at [0][0]
    score = scores[0][0]
    if score >= 0.5:
        return 1, score
    return 0, score

# Example usage
# NOTE(review): hard-coded absolute path to one clip in the class "1" folder; it lies
# under the same directory that AUDIO_DIR points to
file_path = '/content/audio_data/audio/audio/1/7C1CE4_2023-05-09_13-01-23_2_1.wav'
pred_class, conf = predict_audio_class(file_path)
print(f'Predicted class: {pred_class} (1=aircraft, 0=no aircraft)')
# conf is the raw model output, printed with four decimals
print(f'Confidence: {conf:.4f}')