In [1]:
from IPython.utils.io import capture_output
with capture_output() as captured:
    ! pip install import_ipynb
    ! pip install tensorflow
    ! pip install boto3
    ! pip install pandas
    ! pip install librosa
    ! pip install soundfile
    ! pip install opencv-contrib-python
    ! pip install tensorflow_probability
    ! pip install scikit-maad
    ! pip install tensorflow_addons
    ! pip install wave

## Import statements

In [1]:
import warnings
warnings.filterwarnings('ignore')
from IPython.utils.io import capture_output
from IPython.display import clear_output

import sys
sys.path.append('/opt/anaconda3/lib/python3.11/site-packages')

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Input
from tensorflow.keras.layers import Dense, Lambda
import tensorflow_probability as tfp
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
from scipy import stats
from PIL import ImageColor, ImageFont
import librosa
import soundfile as sf
import pdb
import glob
import cv2
import boto3
import wave
from tqdm import tqdm
import import_ipynb
import os
import re

In [2]:
def awsKeys(file):
    awsKeys = pd.read_csv(file)
    access_key = awsKeys['Access key ID'][0]
    secret_key = awsKeys['Secret access key'][0]
    return access_key, secret_key

def clientAndBucket(file, region = 'us-west-2'):
    aws_access_key_id, aws_secret_access_key = awsKeys(file)
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region
    )
    bucket_name = 'whale-recordings'
    s3 = boto3.resource('s3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region
    )
    bucket = s3.Bucket(bucket_name)
    return s3_client, bucket

KEYS = "ssundar_accessKeys.csv"
s3_client, bucket = clientAndBucket(KEYS)

%run model_functions.ipynb
warnings.filterwarnings('ignore')

# List to store processed data
processed_data = []
D2 = []
backgroundFiles = []

path = "CPhydrophone/Avila/Deployment 2/selection-tables/"

keys = [obj.key for obj in bucket.objects.all()]
selectionTables = [(obj.split("/")[-1], obj) for obj in keys if path in obj][1:]
len(selectionTables)

finished preprocessing


39

## Download Script

Downloads all **39** audio files and the corresponding annotated .txt files from the S3 bucket

to a folder named "files" 

In [3]:
wavPath = "CPhydrophone/Avila/Deployment 2/wav-files/decimated_files/"
backgroundFiles = []
! find . -type f \( -name "*.wav" -o -name "*-SS.txt" \) -exec rm {} +
FOLDER = "files" # name of folder to download files to
os.makedirs(FOLDER, exist_ok=True)
for item in tqdm(selectionTables):
    try:
        ss = item[0]
        wav = ss.split("-SS.txt")[0] + "_processed.wav"
        if not os.path.exists(ss):
            s3_client.download_file(bucket_name, item[1], f'{FOLDER}/{ss}')
        if not os.path.exists(wav):
            s3_client.download_file(bucket_name, wavPath + wav, f'{FOLDER}/{wav}')
        background_noise = exclude(wav, labels_file=ss)[1]
        backgroundFiles.append(background_noise)
    except:
        continue

100%|██████████| 39/39 [02:32<00:00,  3.90s/it]


## Model training functions

In [5]:
def pad_and_concat_arrays(arrays, pad_value=0):
    # Find the length of the longest array
    max_length = max(len(arr) for arr in arrays)

    # Pad each array and store it in a new list
    padded_arrays = [np.pad(arr, (0, max_length - len(arr)), 'constant', constant_values=pad_value) for arr in arrays]

    # Concatenate the padded arrays
    concatenated_array = np.concatenate(padded_arrays, axis=0)

    return concatenated_array

def exclude(audio_file, labels_file, filtered_filename="concatenatedTrainingData.wav"):
    # Load audio file
    y, sr = sf.read(audio_file,dtype="float32")

    # Load labels file as pandas dataframe
    df = pd.read_csv(labels_file, delimiter='\t')

    # Convert start and end times to sample indices
    start_idx = librosa.time_to_samples(df['Begin Time (s)'], sr=sr)
    end_idx = librosa.time_to_samples(df['End Time (s)'], sr=sr)

    # Create a boolean mask for each frame
    frames = librosa.util.frame(y, frame_length=sr, hop_length=sr).T
    mask = np.ones(frames.shape[0], dtype=bool)

    # Loop over each interval and exclude corresponding frames
    for idx in range(len(start_idx)):
        start_frame = start_idx[idx] // sr
        end_frame = end_idx[idx] // sr
        mask[start_frame:end_frame+1] = False

    # Apply mask to frames
    frames_filtered = frames[mask]

    # Reshape filtered frames into audio signal
    y_filtered = frames_filtered.reshape(-1)
    sf.write(filtered_filename, y_filtered, sr)
    return sr, y_filtered

def train_model_notebook(train_dataset,path=None):
    segments=1
    latent_dim=2000
    vae=build_model(latent_dim,None)
    vae.fit(train_dataset, epochs=5)
    if path is not None:
        # Save model weights to a file
        vae.save(path)  

## Ensemble model

VAE Ensemble implementation to train and predict on audio data.

Default number of units is 10

Linearly partition the data into *n* parts and train a VAE model on each part

**Note**: The training does not retrain the models if they already exist

The predict function takes in a decimated audio file and returns the predictions in a pandas dataframe

In [6]:
import os
import numpy as np
import pandas as pd
import wave
from tqdm import tqdm
import keras

ENDING = 'processed.wav'

class VAEnsemble:
    """Class for training and predicting with VAE models for audio processing."""
    def __init__(self, filePath=".", n=10):
        self.filePath = filePath  # Path to the files
        self.n = n
        self.files = [f for f in os.listdir(filePath) if f.endswith("-SS.txt") or f.endswith(".wav")]
        self.folders = []

    def splitData(self):
        """Split data into n parts for model training."""
        stuff = [f for f in self.files if f.endswith("-SS.txt")]
        return np.array_split(stuff, self.n)

    def concatenate_wav_files(self, prefix, wav_files, output_file):
        """Concatenate multiple wav files into a single wav file.
        :param prefix: The directory containing the wav files.
        :param wav_files: A list of wav file names.
        :param output_file: The name of the output file.
        """
        with wave.open(f'{prefix}/{wav_files[0]}', 'rb') as wf:
            params = wf.getparams()
            frames = wf.readframes(wf.getnframes())

        with wave.open(output_file, 'wb') as output:
            output.setparams(params)
            output.writeframes(frames)
            for file in wav_files[1:]:
                with wave.open(f'{prefix}/{file}', 'rb') as wf:
                    frames = wf.readframes(wf.getnframes())
                    output.writeframes(frames)

    def trainEnsembles(self):
        """Train VAE models in the ensemble"""
        splits = self.splitData() # Split data first
        i = 1
        if self.folders != []: # if we already know the model weight folders, don't retrain
            print("Already trained")
            return
        for data in tqdm(splits):
            # Training loop
            # Create a new folder for each model
            path = f"test_vae_ensemble_mod_pcen_{i}"
            self.folders.append(path)
            if os.path.exists(path):
                i += 1
                continue
            print(f"Building model {i}")
            wavFiles = [f'{file.replace("-SS.txt", "")}_{ENDING}' for file in data]
            outPath = f'files/backgroundNoiseSplit{i}'
            self.concatenate_wav_files('files', wavFiles, outPath)
            # Additional processing and training steps should be implemented here.
            everythingSS = pd.DataFrame()
            for ss in data:
                # Concatenate the SS files necessary for each model
                if "-SS.txt" in ss:
                    df = pd.read_csv(f'files/{ss}', sep = '\t')
                    everythingSS = pd.concat([everythingSS, df])
            everythingSS.to_csv(f'{outPath}_SS.txt', sep = '\t', index = False)
            # Extract the background noise from the concatenated wav file
            background_noise = exclude(outPath, labels_file=f'{outPath}_SS.txt', filtered_filename=f'{outPath}.wav')[1]
            
            dataset_train,sr = process_wav(f'{outPath}.wav', running = True) # Process the concatenated wav file
            train_model_notebook(dataset_train, path, epochs=self.epochs)
            i += 1
            self.finished = len(self.folders) == self.n

    def predict(self, predictingFile):
        """Use trained VAE models to predict on new audio data.
        
        :param predictingFile: The file to predict on.
        :return predictions: A pandas dataframe containing the predictions.
        
        """
        if not os.path.exists("predictions"):
            os.mkdir("predictions")
        dataset_test, sr = process_wav(predictingFile, running=True)
        everything = pd.DataFrame()
        for path in tqdm(self.folders): # Loop through each model in the ensemble and run the predict framework
            try:
                vae = keras.models.load_model(path, custom_objects={"vae_loss_function": vae_loss_function})
                non_normal_scores = error_dataset(vae, dataset_test, False, sr=sr)
                bounding_boxes = run_model(non_normal_scores)
                titles = ["Begin Time (s)", "End Time (s)", "Low Freq (Hz)", "High Freq (Hz)", "Species confidence"]
                predictions_path = predictingFile.split('_')[0] + "_predictions.txt"
                write_array_to_file(bounding_boxes, titles, predictions_path)
                predictions = pd.read_csv(predictions_path, sep='\t')
                predictions['model'] = path
                everything = pd.concat([everything, predictions])
            except Exception as e:
                # If the above code breaks, skip that model and continue
                print(f"Error processing model {path}: {e}")
                continue
            vae = keras.models.load_model(path, custom_objects={"vae_loss_function": vae_loss_function})
            non_normal_scores = error_dataset(vae, dataset_test, False, sr=sr)
            bounding_boxes = run_model(non_normal_scores)
            titles = ["Begin Time (s)", "End Time (s)", "Low Freq (Hz)", "High Freq (Hz)", "Species confidence"]
            predictions_path = predictingFile.split('_')[0] + "_predictions.txt"
            write_array_to_file(bounding_boxes, titles, predictions_path)
            predictions = pd.read_csv(predictions_path, sep='\t')
            predictions['model'] = path
            everything = pd.concat([everything, predictions])
        numbers = re.search(r'\d+\.\d+', predictingFile).group()
        everything.reset_index(drop=True, inplace=True)
        output_file = f"predictions/{numbers}_predictions.txt"
        everything.to_csv(output_file, sep="\t", index=False)
        print(f"Predictions written to {output_file}")

In [7]:
vae = VAEnsemble("files")
vae.folders = [f'test_vae_ensemble_mod_pcen_{i}' for i in range(1, 11)]
vae.trainEnsembles()

Already trained


## Testing the ensemble model predict method

In [4]:
randomFile = np.array([f for f in os.listdir("files") if f.endswith("_processed.wav")])
randomFile = np.random.choice(randomFile, 1)[0]
randomFile = f'files/{randomFile}'
randomFile

'files/6805.230203110826_processed.wav'

In [None]:
print(vae.folders)
vae.predict(randomFile)

# Not written by us

In [9]:
#%run postprocessing_original.ipynb
# file = '6805.230201090825'
# 
# ground = read_boxes(f'files/{file}-SS.txt', False)
# output = read_boxes(f'{file}_predictions.txt', True)
# 
# fOut = nms
# (
#     filterBoxes(
#         filterBoxes(
#             filterBoxes(
#                 output,
#                 dim = 'top',
#                 upper = False,
#                 thresh = 2400
#             ), 
#             thresh = 0.3
#         ),
#         dim = 'dur',
#         thresh = 0.25
# 
#     ), 
#     0.5
# )
# # raw model output metrics
# 
# getBoxSuccess(
#     predicted = fOut,
#     annotated = ground,
#     thresh = 0.6,
#     compare = "ovr"
# )

In [24]:
%run postprocessing_original.ipynb

# Metrics functions

The metrics function takes in a file prefix and returns a pandas dataframe with the metrics for that file

The filtering function takes in the output, confidence, species, and duration thresholds and returns the filtered boxes

The getBoxSuccess function takes in the predicted and annotated boxes and returns the metrics for the predicted boxes

In [25]:
def filtering(output, conf, species, dur, nmsThreshold=0.5):
    return nms(
            filterBoxes(
                filterBoxes(
                    filterBoxes(
                        output,
                        dim = 'top',
                        upper = False,
                        thresh = conf
                    ), 
                    thresh = species
                ),
                dim = 'dur',
                thresh = dur

            ),
        nmsThreshold)

In [36]:
def metrics(prefix, ssFolder='files', everythingFolder='', threshold=0.80, outPath='metrics.csv', conf=None, species=None, dur=None):
    if not os.path.exists(f'predictions/{prefix}_predictions.txt'):
        raise FileNotFoundError("No predictions file")
    ground = os.path.join(ssFolder, f'{prefix}-SS.txt')
    output = os.path.join(everythingFolder, f'predictions/{prefix}_predictions.txt')
    ground = read_boxes(ground, False)
    output = read_boxes(output, True)

    fOut = nms(output, None)
    things = getBoxSuccess(
        predicted = fOut,
        annotated = ground,
        thresh = threshold,
        compare = "ovr"
    )
    df = pd.DataFrame([things])
    df['file'] = prefix
    df.set_index('file', inplace=True)
    #df.to_csv(outPath)
    return df

### Testing the metrics function

In [35]:
everything = pd.DataFrame()
for file in tqdm(os.listdir('combos')):
    prefix = file.replace('_combined.txt', '')
    data = metrics(prefix)
    everything = pd.concat([everything, data])
everything

100%|██████████| 39/39 [00:01<00:00, 26.01it/s]


Unnamed: 0_level_0,numPredicted,numAnnotated,truePositives,falsePositives,falseNegatives,binaryAccuracy,binaryPrecision,BinaryRecall,binaryF1
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6805.230205030826,112,21,2,90,19,0.017857,0.021739,0.095238,0.035398
6805.230201090825,301,50,11,233,39,0.036545,0.045082,0.22,0.07483
6805.230205180826,225,67,10,125,57,0.044444,0.074074,0.149254,0.09901
6805.230204210826,180,72,16,52,56,0.088889,0.235294,0.222222,0.228571
6805.230206030826,4028,336,131,2925,205,0.032522,0.042866,0.389881,0.077241
6805.230202120825,81,6,2,32,4,0.024691,0.058824,0.333333,0.1
6805.230207000827,333,32,10,213,22,0.03003,0.044843,0.3125,0.078431
6805.230203090826,154,15,2,149,13,0.012987,0.013245,0.133333,0.024096
6805.230202030825,254,14,0,254,14,0.0,0.0,0.0,0.0
6805.230205090826,88,2,1,80,1,0.011364,0.012346,0.5,0.024096


In [49]:
m = pd.read_csv('metrics.csv')
m.set_index('file', inplace=True)
everything.index

Index(['6805.230205030826', '6805.230201090825', '6805.230205180826',
       '6805.230204210826', '6805.230206030826', '6805.230202120825',
       '6805.230207000827', '6805.230203090826', '6805.230202030825',
       '6805.230205090826', '6805.230201180825', '6805.230203180826',
       '6805.230206090827', '6805.230202180825', '6805.230203000825',
       '6805.230207120827', '6805.230204003826', '6805.230205183826',
       '6805.230202000825', '6805.230205150826', '6805.230204120826',
       '6805.230204090826', '6805.230201210825', '6805.230207043827',
       '6805.230202100825', '6805.230203210826', '6805.230205000826',
       '6805.230202150825', '6805.230205210826', '6805.230204180826',
       '6805.230203110826', '6805.230206210827', '6805.230203150826',
       '6805.230206233827', '6805.230206100827', '6805.230204030826',
       '6805.230206000826', '6805.230206163827', '6805.230201150825'],
      dtype='object', name='file')

## Running the ensemble on all 39 files

In [None]:
import time
processedWavFiles = [f'files/{f}' for f in os.listdir('files') if f.endswith('_processed.wav')]

for file in tqdm(processedWavFiles):
    with capture_output() as captured:
        vae.predict(file)

  5%|▌         | 2/39 [04:45<1:28:31, 143.56s/it]

## Threshold Tuning

This function tests all thresholds between 0 and 1 to determine the best binary classification metrics.
"Class" is determined by the IOU threshold.

In [13]:
full = pd.DataFrame()
for threshold in np.arange(0.01, 1, 0.01):
    stuff = metrics('6805.230206163827', threshold=threshold, ssFolder='files', everythingFolder='')
    stuff['threshold'] = threshold
    stuff.set_index('threshold', append=True, inplace=True)
    full = pd.concat([full, stuff])
full

Unnamed: 0_level_0,Unnamed: 1_level_0,numPredicted,numAnnotated,truePositives,falsePositives,falseNegatives,binaryAccuracy,binaryPrecision,BinaryRecall,binaryF1,nonbinaryAccuracy
file,threshold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6805.230206163827,0.01,1877,374,164,155,210,0.087373,0.514107,0.438503,0.473304,0.829433
6805.230206163827,0.02,1877,374,163,156,211,0.086841,0.510972,0.435829,0.470418,0.829433
6805.230206163827,0.03,1877,374,163,156,211,0.086841,0.510972,0.435829,0.470418,0.829433
6805.230206163827,0.04,1877,374,163,160,211,0.086841,0.504644,0.435829,0.467719,0.829433
6805.230206163827,0.05,1877,374,162,163,212,0.086308,0.498462,0.433155,0.463519,0.829433
6805.230206163827,...,...,...,...,...,...,...,...,...,...,...
6805.230206163827,0.95,1877,374,100,827,274,0.053277,0.107875,0.267380,0.153728,0.829433
6805.230206163827,0.96,1877,374,98,833,276,0.052211,0.105263,0.262032,0.150192,0.829433
6805.230206163827,0.97,1877,374,98,842,276,0.052211,0.104255,0.262032,0.149163,0.829433
6805.230206163827,0.98,1877,374,96,872,278,0.051145,0.099174,0.256684,0.143070,0.829433


## Testing the filtering function

The filtering function takes in the output, confidence, species, and duration thresholds and returns the filtered boxes

In [13]:
file = 'predictions/6805.230206163827_predictions.txt'
predictions = pd.read_csv(file, sep='\t')
list = read_boxes(file, True)
filtering(list, 2400, 0.7, 0.5)

NameError: name 'output' is not defined

In [5]:
import pandas as pd

# Load the predictions
predictions = pd.read_csv('predictions/6805.230206163827_predictions.txt', sep='\t')

# Filter the predictions
run = [f'test_vae_ensemble_mod_pcen_{i}' for i in range(1, 11)]
predictions = predictions[predictions['model'].isin(run)]

# Define the function to check if two values are similar
def equal(row, other, col, delta=0.5):
    """ Check if two values are approximately equal. 
    :param row: The first value.
    :param other: The second value.
    :param col: The column to compare.
    :param delta: The threshold for equality.
    :return: True if the values are approximately equal, False otherwise.
    """
    return abs(row[col] - other[col]) < delta

condensed = pd.DataFrame()
for _, row in predictions.iterrows():
    for _, other in predictions.iterrows():
        if equal(row, other, 'Begin Time (s)') and equal(row, other, 'End Time (s)') and (equal(row, other, 'Low Freq (Hz)') or equal(row, other, 'High Freq (Hz)')) and not row.equals(other):
            # Combine boxes and create a new box
            newStuff = {}
            newStuff['Begin Time (s)'] = min(row['Begin Time (s)'], other['Begin Time (s)'])
            newStuff['End Time (s)'] = max(row['End Time (s)'], other['End Time (s)'])
            newStuff['Low Freq (Hz)'] = min(row['Low Freq (Hz)'], other['Low Freq (Hz)'])
            newStuff['High Freq (Hz)'] = max(row['High Freq (Hz)'], other['High Freq (Hz)'])
            newStuff['Species confidence'] = max(row['Species confidence'], other['Species confidence'])
            newStuff = pd.DataFrame([newStuff])
            condensed = pd.concat([condensed, newStuff])
            # blacklist boxes with end times and start times within the range of the newStuff box
            predictions = predictions[~((predictions['Begin Time (s)'] >= newStuff['Begin Time (s)'].values[0]) & (predictions['End Time (s)'] <= newStuff['End Time (s)'].values[0]))] 
condensed.drop_duplicates()

Unnamed: 0,Begin Time (s),End Time (s),Low Freq (Hz),High Freq (Hz),Species confidence
0,15.15,15.40,13.179572,204.283361,1.000000
0,15.15,15.35,13.179572,204.283361,1.000000
0,16.80,16.90,13.179572,144.975288,1.000000
0,16.80,16.85,19.769357,144.975288,1.000000
0,18.05,18.15,19.769357,158.154860,1.000000
...,...,...,...,...,...
0,530.85,531.00,13.179572,204.283361,1.000000
0,1683.85,1684.05,6.589786,158.154860,0.845827
0,230.65,231.15,6.589786,151.565074,1.000000
0,530.85,531.30,6.589786,158.154860,1.000000
