In [1]:
from IPython.utils.io import capture_output
with capture_output() as captured:
    ! pip install import_ipynb
    ! pip install tensorflow==2.15.0
    ! pip install boto3
    ! pip install pandas
    ! pip install librosa
    ! pip install soundfile
    ! pip install opencv-contrib-python
    ! pip install tensorflow_probability==0.23.0
    ! pip install scikit-maad
    ! pip install tensorflow_addons==0.23.0
    ! pip install wave

In [3]:
import warnings
warnings.filterwarnings('ignore')
from IPython.utils.io import capture_output
from IPython.display import clear_output

import sys
sys.path.append('/opt/anaconda3/lib/python3.11/site-packages')

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Input
from tensorflow.keras.layers import Dense, Lambda
import tensorflow_probability as tfp
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
from scipy import stats
from PIL import ImageColor, ImageFont
import librosa
import soundfile as sf
import pdb
import glob
import cv2
import boto3
import wave
from tqdm import tqdm
import import_ipynb
import os
import re
tf.config.set_visible_devices([], 'GPU')

2024-05-07 20:02:46.498186: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-07 20:02:46.498223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-07 20:02:46.499584: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-07 20:02:46.507264: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-07 20:02:49.245427: E external/local_

In [4]:
def awsKeys(file):
    awsKeys = pd.read_csv(file)
    access_key = awsKeys['Access key ID'][0]
    secret_key = awsKeys['Secret access key'][0]
    return access_key, secret_key

def clientAndBucket(file, region = 'us-west-2'):
    aws_access_key_id, aws_secret_access_key = awsKeys(file)
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region
    )
    bucket_name = 'whale-recordings'
    s3 = boto3.resource('s3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region
    )
    bucket = s3.Bucket(bucket_name)
    return s3_client, bucket

KEYS = "ssundar_accessKeys.csv"
s3_client, bucket = clientAndBucket(KEYS)

%run model_functions_mel.ipynb
warnings.filterwarnings('ignore')

# List to store processed data
processed_data = []
D2 = []
backgroundFiles = []

path = "CPhydrophone/Avila/Deployment 2/selection-tables/"

keys = [obj.key for obj in bucket.objects.all()]
selectionTables = [(obj.split("/")[-1], obj) for obj in keys if path in obj][1:]
len(selectionTables)

finished preprocessing


39

## when doing predictions do this

In [5]:
#%run model_functions_mel.ipynb

In [6]:
def pad_and_concat_arrays(arrays, pad_value=0):
    # Find the length of the longest array
    max_length = max(len(arr) for arr in arrays)

    # Pad each array and store it in a new list
    padded_arrays = [np.pad(arr, (0, max_length - len(arr)), 'constant', constant_values=pad_value) for arr in arrays]

    # Concatenate the padded arrays
    concatenated_array = np.concatenate(padded_arrays, axis=0)

    return concatenated_array

def exclude(audio_file, labels_file, filtered_filename="concatenatedTrainingData.wav"):
    # Load audio file
    y, sr = sf.read(audio_file,dtype="float32")

    # Load labels file as pandas dataframe
    df = pd.read_csv(labels_file, delimiter='\t')

    # Convert start and end times to sample indices
    start_idx = librosa.time_to_samples(df['Begin Time (s)'], sr=sr)
    end_idx = librosa.time_to_samples(df['End Time (s)'], sr=sr)

    # Create a boolean mask for each frame
    frames = librosa.util.frame(y, frame_length=sr, hop_length=sr).T
    mask = np.ones(frames.shape[0], dtype=bool)

    # Loop over each interval and exclude corresponding frames
    for idx in range(len(start_idx)):
        start_frame = start_idx[idx] // sr
        end_frame = end_idx[idx] // sr
        mask[start_frame:end_frame+1] = False

    # Apply mask to frames
    frames_filtered = frames[mask]

    # Reshape filtered frames into audio signal
    y_filtered = frames_filtered.reshape(-1)
    sf.write(filtered_filename, y_filtered, sr)
    return sr, y_filtered

def train_model_notebook(train_dataset,path=None, epochs=5):
    segments=1
    latent_dim=2000
    vae=build_model(latent_dim,None)
    vae.fit(train_dataset, epochs=epochs)
    if path is not None:
        # Save model weights to a file
        vae.save(path)

In [7]:
import random

ENDING = 'processed.wav'
class VAEnsemble:
    """Assumes the vae_loss_function and error_dataset and run_model functions and write_array_to_file are defined in the model_functions.ipynb file"""
    def __init__(self, filePath = ".", n = 10, trainSplit=0.80, epochs=5):
        """
        
        :param filePath: path to decimated wav files
        :param n: number of units in ensemble
        :param trainSplit: train/test split percentage
        :param epochs: number of epochs
        """
        self.filePath = filePath # path to the files
        self.n = n
        self.files = os.listdir(filePath)
        self.files = [file for file in self.files if file.endswith("-SS.txt") or file.endswith(".wav")]
        self.folders = []
        self.finished = False
        self.epochs = epochs
    
    def splitData(self):
        """
        Splits the data into n parts
        :return: 
        """
        # split data into n parts for model training
        stuff = [f for f in self.files if f.endswith("-SS.txt")]
        return np.array_split(stuff, self.n)
    
    def concatenate_wav_files(self, prefix, wav_files, output_file):
        """Concatenate multiple wav files into a single wav file.
        :param prefix: The directory containing the wav files.
        :param wav_files: A list of wav file names.
        :param output_file: The name of the output file.
        """
        
        # Open the first wav file
        with wave.open(f'{prefix}/{wav_files[0]}', 'rb') as wf:
            params = wf.getparams()
            frames = wf.readframes(wf.getnframes())

        # Create a new wav file
        with wave.open(output_file, 'wb') as output:
            output.setparams(params)
            output.writeframes(frames)

            # Iterate over the rest of the wav files
            for file in wav_files[1:]:
                with wave.open(f'{prefix}/{file}', 'rb') as wf:
                    frames = wf.readframes(wf.getnframes())
                    output.writeframes(frames)
    
    def trainEnsembles(self, check=True):
        """
        Trains the ensemble
        :param check: check if the models are already trained
        :return: None, writes model weights to directories
        """
        splits = self.splitData()
        i = 1
        j = 1
        if check and self.folders != []:
            print("Already trained")
            return
        self.folders = []
        for data in tqdm(splits):
            j = 1
            path = f"test_vae_ensemble_mod_pcen_mel_{i}"
            self.folders.append(path)
            if check and os.path.exists(path):
                i += 1
                continue
            print(f"Building model {i}")
            wavFiles = [f'{i.replace("-SS.txt", "")}_{ENDING}' for i in data if "-SS.txt" in i]
            
            #print(wavFiles)
            outPath = f'files/backgroundNoiseSplit{i}'
            self.concatenate_wav_files('files', wavFiles, outPath)
            everythingSS = pd.DataFrame()
            for ss in data:
                if "-SS.txt" in ss:
                    df = pd.read_csv(f'files/{ss}', sep = '\t')
                    everythingSS = pd.concat([everythingSS, df])
            everythingSS.to_csv(f'{outPath}_SS.txt', sep = '\t', index = False)
            background_noise = exclude(outPath, labels_file=f'{outPath}_SS.txt', filtered_filename=f'{outPath}.wav')[1]
            
            dataset_train,sr = process_wav(f'{outPath}.wav', running = True)
            train_model_notebook(dataset_train, path, epochs=self.epochs)
            i += 1
            self.finished = len(self.folders) == self.n
            
    def predict(self, predictingFile, folder="predictions"):
        """Use trained VAE models to predict on new audio data.
        
        :param predictingFile: The file to predict on.
        :return predictions: A pandas dataframe containing the predictions.
        
        """
        
        # make a directory for the predictions
        if not os.path.exists(folder):
            os.mkdir(folder)
        dataset_test, sr = process_wav(predictingFile, running = True)
        everything = pd.DataFrame()
        for path in tqdm(self.folders):
            try:
                vae=keras.models.load_model(path, custom_objects={"vae_loss_function": vae_loss_function})
                non_normal_scores=error_dataset(vae,dataset_test,False,sr=sr) # len > 0
                bounding_boxes=run_model(non_normal_scores)
                print(len(bounding_boxes))
                titles=["Begin Time (s)","End Time (s)","Low Freq (Hz)","High Freq (Hz)","Species confidence"]
                write_array_to_file(bounding_boxes,titles,predictingFile.split('_')[0] + "_predictions.txt")
                predictions = pd.read_csv(predictingFile.split('_')[0] + "_predictions.txt", sep = '\t')
                predictions['model'] = path
                everything = pd.concat([everything, predictions])
            except:
                print("Could not do it for model", path)
                continue
        # group by the path and take the mean of the predictions
        numbers = re.search(r'\d+\.\d+', predictingFile).group()
        everything.reset_index(drop=True, inplace=True)
        print("writing to", f"{folder}/{numbers}_predictions.txt")
        everything.to_csv(f"{folder}/{numbers}_predictions.txt", sep = "\t", index=False) # in case the bottom doesn't work

## Test the VAE Ensemble

In [8]:
vae = VAEnsemble("files", epochs=20)
vae.folders = [f for f in os.listdir() if "test_vae_ensemble_mod_pcen_mel_" in f]
vae.trainEnsembles()

Already trained


In [9]:
file = [f for f in os.listdir('files') if f.endswith('_processed.wav')][0]
file

'6805.230204210826_processed.wav'

In [1]:
vae.predict('files/6805.230204210826_processed.wav', folder="predictions_mel")

## Generate predictions for all 39 files

In [None]:
predictionsFiles = [os.path.join('files', f) for f in os.listdir('files') if f.endswith('_processed.wav')]

count = 0
for file in tqdm(predictionsFiles):
    numbers = file.split('/')[-1].split('_')[0]
    path = os.path.join('predictions_mel', f'{numbers}_predictions.txt')
    if os.path.exists(path):
        count += 1
        continue
    print(f'{count} of 39 files done')
    with capture_output() as captured:
        vae.predict(file, folder='predictions_mel')

  0%|          | 0/39 [00:00<?, ?it/s]

28 of 39 files done


## check for any empty predictions file
(debugging)

In [14]:
count = 0
for file in tqdm([f for f in os.listdir('predictions_mel') if f.endswith('.txt')]):
    things = pd.read_csv(os.path.join('predictions_mel', file), sep='\t')
    if len(things) == 0:
        count += 1
count

100%|██████████| 39/39 [00:00<00:00, 441.06it/s]


0

## Get metrics for the predictions



In [44]:
%run postprocessing_original.ipynb
%run ahc.ipynb

In [33]:
def find_floats(text):
    # Matches a floating-point number not surrounded by any non-whitespace characters
    pattern = r'[\d.]+'
    match = re.search(pattern, text)
    return match.group(0) if match else None


def metrics_bc(predictions_file, boxCombo=False, threshold=0.5, outPath='metrics.csv', method="intersection"):
    """
    Gets binary classification metrics for a specified predictions file
    :param predictions_file: path to predictions file
    :param boxCombo: whether to use box combo
    :param threshold: distance threshold for box combination agglomerative clustering
    :param outPath: path of output file
    :param method: method to implement box combinations (if box combination is desired)
    :return: pandas dataframe containing binary classification metrics
    """
    # Extract the prefix from the predictions file
    prefix = find_floats(predictions_file)
    ground = os.path.join('files', f'{prefix}-SS.txt')
    ground = read_boxes(ground, False)
    output = read_boxes(predictions_file, True)

    if boxCombo:
        # Import the box_combo function from ahc.ipynb
        %run ahc.ipynb
        output = box_combo(predictions_file, method, threshold)

    fOut = nms(output, None)
    things = getBoxSuccess(
        predicted = fOut,
        annotated = ground,
        thresh = threshold,
        compare = "ovr"
    )
    df = pd.DataFrame([things])
    df['file'] = prefix
    df.set_index('file', inplace=True)
    df.to_csv(outPath)
    return df

def box_combo(file_path, method, threshold=0.5):
    """
    This function reads in a file containing boxes and clusters them using Agglomerative Clustering.
    :param file_path: predictions file path containing bounding boxes
    :param method: overlap or intersection 
    :param threshold: Agglomerative Clustering distance threshold
    :return: dataframe containing combined boxes
    """
    # SEE AHC.IPYNB FOR MORE INFORMATION
    
    data = pd.read_csv(file_path, delimiter='\t')
    X = data[['Begin Time (s)', 'End Time (s)', 'Low Freq (Hz)', 'High Freq (Hz)']]
    clusters = AgglomerativeClustering(n_clusters=None, distance_threshold=threshold).fit(X)

    cluster_lst = []
    for c in range(max(clusters.labels_) + 1):
        indices = list(np.where(clusters.labels_ == c)[0])
        df = X.iloc[indices]
        #print(f'Cluster {c}:')
        #print(df)
        #print('\n')

        cluster = Cluster(boxes=df.transpose().values.tolist())
        cluster_lst.append(cluster)
    
    combined = pd.DataFrame(columns=['Begin Time (s)', 'End Time (s)', 'Low Freq (Hz)', 'High Freq (Hz)'])
    for c in cluster_lst:
        if method == "union":
            box = [min(c.boxes[0]), max(c.boxes[1]), min(c.boxes[2]), max(c.boxes[3])]
        if method == "intersection":
            box = [max(c.boxes[0]), min(c.boxes[1]), max(c.boxes[2]), min(c.boxes[3])]
        combined.loc[len(combined)] = box
    if not os.path.exists('combos_mel'):
        os.makedirs('combos_mel')
    combined.to_csv(f'combos_mel/{file_path.split("/")[-1].split("_")[0]}_{method}_box_combo_{method}.txt', sep='\t', index=False)
    return combined

In [24]:
files = [os.path.join('predictions_mel', f) for f in os.listdir('predictions_mel') if f.endswith('_predictions.txt')]

In [11]:
for file in tqdm(files):
    path = os.path.join('predictions_mel', file)
    t = pd.read_csv(path, sep='\t')
    t['conf'] = 1.0
    t.to_csv(path, index=False)

100%|██████████| 39/39 [00:00<00:00, 223.44it/s]


In [12]:
for file in tqdm(files):
    path = os.path.join('predictions_mel', file)
    if os.path.exists(path):
        continue
    box_combo(path, 'intersection', 0.50)

100%|██████████| 39/39 [00:00<00:00, 121528.87it/s]


In [34]:
import os
import pandas as pd
import tempfile
import shutil

def fix(file_path):
    # Create a temporary file
    temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8')
    
    # Read the original file, remove outer quotes, and write to the temporary file
    with open(file_path, 'r', encoding='utf-8') as infile, temp_file:
        for line in infile:
            # Strip quotes at the start and end of the line if they exist
            cleaned_line = line.strip().strip('\"')
            temp_file.write(cleaned_line + '\n')
    
    # Close the temporary file to ensure all data is written
    temp_file.close()
    
    # Load the cleaned data from the temporary file, specifying tab as the separator
    df = pd.read_csv(temp_file.name, sep='\t')
    
    # Write the DataFrame back to the original file using tab as the separator and ensuring no index is written
    df.to_csv(file_path, sep='\t', index=False)
    
    # Remove the temporary file
    os.unlink(temp_file.name)

# Example usage, assuming you have a list of files
files = [os.path.join('predictions_mel', f) for f in os.listdir('predictions_mel') if f.endswith('_predictions.txt')]
for file in tqdm(files):
    fix(file)

100%|██████████| 39/39 [00:00<00:00, 129.44it/s]


# Get metrics

In [45]:
everything = pd.DataFrame()
folder = 'combos_mel'
files = [f for f in os.listdir(folder) if f.endswith('box_combo_intersection.txt')]
for file in tqdm(files):
    df = pd.read_csv(f'{folder}/{file}', sep='\t')
    df['Species confidence'] = 1.0
    df.to_csv(f'{folder}/{file}', sep='\t', index=False)
    stuff = metrics_bc(f'{folder}/{file}', boxCombo=False)
    everything = pd.concat([everything, stuff])
everything.to_csv('metrics_MEL.csv')
everything

100%|██████████| 39/39 [00:00<00:00, 107.37it/s]


Unnamed: 0_level_0,numPredicted,numAnnotated,truePositives,falsePositives,falseNegatives,binaryAccuracy,binaryPrecision,BinaryRecall,binaryF1,nonbinaryAccuracy
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6805.230202120825,21,6,0,21,6,0.0,0.0,0.0,0.0,0.0
6805.230203110826,9,10,0,9,10,0.0,0.0,0.0,0.0,0.0
6805.230205210826,46,79,0,46,79,0.0,0.0,0.0,0.0,0.0
6805.230203090826,58,15,0,58,15,0.0,0.0,0.0,0.0,0.0
6805.230202000825,34,1,0,34,1,0.0,0.0,0.0,0.0,0.0
6805.230206163827,426,374,2,424,372,0.004695,0.004695,0.005348,0.005,0.040387
6805.230202150825,29,12,0,29,12,0.0,0.0,0.0,0.0,0.0
6805.230203180826,70,44,0,70,44,0.0,0.0,0.0,0.0,0.0
6805.230206090827,198,168,1,194,167,0.005051,0.005128,0.005952,0.00551,0.266667
6805.230203150826,28,13,0,28,13,0.0,0.0,0.0,0.0,0.0


# Summary of metrics

In [52]:
metrics_mel = pd.read_csv('metrics_MEL.csv')
metrics_mel.describe().round(4).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
file,39.0,6805.2302,0.0,6805.2302,6805.2302,6805.2302,6805.2302,6805.2302
numPredicted,39.0,98.2821,107.5815,7.0,29.0,58.0,111.5,426.0
numAnnotated,39.0,70.4872,93.5186,1.0,12.5,37.0,78.0,374.0
truePositives,39.0,0.3333,1.1773,0.0,0.0,0.0,0.0,7.0
falsePositives,39.0,97.6667,106.0261,7.0,29.0,58.0,111.0,424.0
falseNegatives,39.0,70.1538,92.8024,1.0,12.5,37.0,78.0,372.0
binaryAccuracy,39.0,0.0011,0.0031,0.0,0.0,0.0,0.0,0.0164
binaryPrecision,39.0,0.0011,0.0032,0.0,0.0,0.0,0.0,0.0167
BinaryRecall,39.0,0.0015,0.0049,0.0,0.0,0.0,0.0,0.0262
binaryF1,39.0,0.0012,0.0038,0.0,0.0,0.0,0.0,0.0204
