In [1]:
# Only run once
# !pip install librosa
#!pip install tensorflow_probability
#!pip install tensorflow_addons
#!pip install scikit-maad

In [7]:
import sys
sys.path.append('/Users/sucheen/anaconda3/lib/python3.11/site-packages')
from tensorflow.keras import layers,Input
from tensorflow.keras.layers import Dense,Lambda
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from scipy import stats
from PIL import ImageColor,ImageFont
import pandas as pd
import librosa
import numpy as np
import soundfile as sf
import pdb
import glob
import cv2
from tensorflow_probability import distributions as tfd
import tensorflow_addons as tfa
import boto3
import warnings
warnings.filterwarnings('ignore')

%run preprocessing.ipynb
# Relative path to the CSV file that holds the AWS access key pair.
KEYS = "../../ssundar_accessKeys.csv"
# NOTE(review): awsKeys is not defined in this notebook; presumably it is
# provided by the preprocessing.ipynb run above -- confirm.
aws_access_key_id, aws_secret_access_key = awsKeys(KEYS)

# boto3 session built from the credentials loaded above.
session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name='us-east-1'  # or your preferred region
)
# Resource-style S3 handle derived from the session.
s3 = session.resource('s3')

# S3 Bucket for Professor's Account is 'monitoring-whale-recordings'
# S3 Bucket for our free tier Account is 'monitoring-whale-records'
# NOTE(review): a later cell reassigns bucket_name/bucket to a different
# bucket name, so the values set here are overwritten once that cell runs.
bucket_name = 'monitoring-whale-recordings'
bucket = s3.Bucket(bucket_name)


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



finished preprocessing


In [8]:
class AudioVAE(keras.Model):
    """Convolutional variational autoencoder over 608x192 single-channel inputs.

    The encoder is a stack of five stride-2 convolutions followed by a dense
    layer of width ``latent_dim``; its output is split in half into the
    posterior mean and log-variance, so the decoder consumes
    ``latent_dim // 2`` values.  Five stride-2 'same' convolutions reduce
    608x192 to 19x6, which is the grid the decoder reshapes to before
    upsampling back to 608x192x1.

    Args:
        latent_dim: Width of the encoder output (mean and log-variance
            concatenated).  Must be even.
        sr: Stored on the instance; not used by any method in this class.
        num_heads, key_dim, value_dim: Stored on the instance; not used by
            any method in this class.

    Raises:
        ValueError: If ``latent_dim`` is odd, because the encoder output could
            not be split evenly into mean and log-variance.
    """

    def __init__(self, latent_dim, sr, num_heads=4, key_dim=64, value_dim=64):
        super(AudioVAE, self).__init__()
        # Fail at construction time instead of at the first tf.split in encode().
        if latent_dim % 2 != 0:
            raise ValueError(
                "latent_dim must be even so it can be split into mean and logvar"
            )
        self.latent_dim = latent_dim
        self.sr = sr

        self.num_heads = num_heads
        self.key_dim = key_dim
        self.value_dim = value_dim

        # Encoder: 608x192x1 -> (five stride-2 convs) -> 19x6x512 -> latent_dim
        self.encoder = keras.Sequential([
            layers.InputLayer(input_shape=(608,192, 1)),
            layers.Conv2D(filters=32, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Conv2D(filters=64, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Conv2D(filters=128, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Conv2D(filters=256, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Conv2D(filters=512, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Flatten(),
            layers.Dense(latent_dim)
        ])

        # Decoder: latent sample -> 19x6x512 -> (five stride-2 transposed convs)
        # -> 608x192x32 -> final stride-1 transposed conv with one linear channel.
        self.decoder = keras.Sequential([
            layers.InputLayer(input_shape=(latent_dim // 2,)),
            layers.Dense(units=19*6*512, activation='relu'),
            layers.Reshape(target_shape=(19, 6, 512)),
            layers.Conv2DTranspose(filters=512, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Conv2DTranspose(filters=256, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Conv2DTranspose(filters=128, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Conv2DTranspose(filters=64, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Conv2DTranspose(filters=32, kernel_size=5, strides=2, padding='same'),
            tf.keras.layers.Lambda(lambda x: tfa.activations.gelu(x)),
            layers.Conv2DTranspose(filters=1, kernel_size=5, strides=1, padding='same', activation='linear')
         ])

    @tf.function
    def train_step(self, x):
        """Run one optimisation step and return the three loss terms.

        Args:
            x: One batch of inputs; it is used both as model input and as the
                reconstruction target.

        Returns:
            Dict with keys ``kl_loss``, ``reconstruction_loss`` and
            ``total_loss`` as computed by ``vae_loss_function``.
        """
        # A single gradient call is made, so a non-persistent tape suffices and
        # its resources are released as soon as the gradients are computed.
        with tf.GradientTape() as tape:
            reconstruction,mean,logvar=self(x)
            loss = vae_loss_function(x, reconstruction, mean, logvar)
            reconstruction_loss = loss["reconstruction_loss"]
            kl_loss = loss["kl_loss"]
            total_loss=loss["total_loss"]
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {"kl_loss": kl_loss,"reconstruction_loss":reconstruction_loss,"total_loss":total_loss}

    @tf.function
    def encode(self, x):
        """Encode ``x`` and split the encoder output into (mean, logvar) halves."""
        mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
        return mean, logvar

    @tf.function
    def reparameterize(self, mean, logvar):
        """Draw ``mean + eps * exp(logvar / 2)`` with ``eps`` standard normal."""
        batch_size = tf.shape(mean)[0]
        latent_dim = tf.shape(mean)[1]
        eps = tf.random.normal(shape=(batch_size, latent_dim))
        return eps * tf.exp(logvar * 0.5) + mean

    @tf.function
    def decode(self, z):
        """Map a latent sample ``z`` through the decoder network."""
        recon = self.decoder(z)
        return recon

    @tf.function
    def call(self, x):
        """Encode, sample and decode; returns (reconstruction, mean, logvar)."""
        mean, logvar = self.encode(x)
        z = self.reparameterize(mean, logvar)
        reconstruction = self.decode(z)
        return reconstruction, mean, logvar

    @tf.function
    def reconstructed_probability(self, x, mean, logvar):
        """Score ``x`` under a Normal with the given mean and log-variance.

        The log-density of ``x`` is averaged over the leading axis added here
        and then over the last axis, and the result is exponentiated.

        NOTE(review): ``x`` must broadcast against ``mean``/``logvar``; the
        intended shapes are not visible in this notebook -- confirm with the
        caller.
        """
        # tf.cast accepts arrays and tensors of any float dtype; forcing
        # dtype=float64 in convert_to_tensor rejects float32 tensors.
        x = tf.cast(tf.convert_to_tensor(x), dtype=tf.float32)
        recon_dist = tfd.Normal(loc=mean, scale=tf.math.exp(0.5*logvar))
        x = tf.expand_dims(x, 0)
        # tf.Tensor has no .mean() method, so reduce with tf.reduce_mean.
        log_prob = recon_dist.log_prob(x)
        p = tf.exp(tf.reduce_mean(tf.reduce_mean(log_prob, axis=0), axis=-1))
        return p

In [9]:
def vae_loss_function(x, reconstruction, mean, logvar,prediction=False):
    """Compute the VAE loss terms for one batch.

    Args:
        x: Target batch.
        reconstruction: Model output to compare against ``x``.
        mean: Posterior means.
        logvar: Posterior log-variances.
        prediction: When true, return the KL term without the final batch
            averaging and omit ``total_loss``.

    Returns:
        Dict with ``reconstruction_loss`` (mean absolute error) and
        ``kl_loss``; when ``prediction`` is false it also carries
        ``total_loss``, the sum of the two.
    """
    mae = tf.keras.losses.MeanAbsoluteError()
    reconstruction_loss = tf.reduce_mean(mae(x, reconstruction))

    # KL divergence to a standard normal, summed over the last axis.
    kl_terms = 1 + logvar - tf.square(mean) - tf.exp(logvar)
    kl_summed = -0.5 * tf.reduce_sum(kl_terms, axis=-1)

    if prediction:
        return {"reconstruction_loss": reconstruction_loss, "kl_loss": kl_summed}

    # Collapse the KL term to a scalar before adding it to the reconstruction loss.
    kl_loss = tf.reduce_mean(kl_summed)
    return {
        "reconstruction_loss": reconstruction_loss,
        "kl_loss": kl_loss,
        "total_loss": reconstruction_loss + kl_loss,
    }

In [10]:
def build_model(latent_dim,sr):
    """Create and compile an ``AudioVAE`` and print its layer summaries.

    Args:
        latent_dim: Encoder output width passed to ``AudioVAE``.
        sr: Passed through to ``AudioVAE``.

    Returns:
        The compiled ``AudioVAE`` instance.
    """
    # Initialize VAE
    vae = AudioVAE(latent_dim,sr)

    # Compile VAE.  ``learning_rate`` is the documented argument name; ``lr``
    # is only a deprecated keyword alias.
    optimizer = tfa.optimizers.AdaBelief(learning_rate=0.0006)
    vae.compile(optimizer=optimizer, loss=vae_loss_function)

    vae.encoder.summary()
    vae.decoder.summary()
    return vae

In [11]:
def calculate_confidence(heatmap, contour,max_value):
    """Return the mean heatmap value inside the contour's bounding box, scaled by ``max_value``.

    Args:
        heatmap: 2-D array indexed as [row, column].
        contour: Contour passed to ``cv2.boundingRect``.
        max_value: Divisor used to normalise the mean intensity.

    Returns:
        Mean intensity of the bounding-box region divided by ``max_value``.
    """
    left, top, width, height = cv2.boundingRect(contour)
    # Crop the axis-aligned bounding rectangle of the contour.
    window = heatmap[top:top + height, left:left + width]
    return np.mean(window) / max_value

In [12]:
from maad import sound, rois, features
from maad.util import (power2dB, plot2d, format_features, read_audacity_annot,
                       overlay_rois, overlay_centroid)
def apply_bounding_boxes(spectrogram,running=True,time_reference=0.0):
    """Detect regions of interest in a 2-D score map and return them as boxes.

    The map is smoothed, binarised and segmented with scikit-maad; every
    resulting ROI is converted from pixel indices to seconds / Hz and paired
    with a confidence value (mean intensity inside the box divided by the
    maximum of the whole map).

    Args:
        spectrogram: 2-D array indexed as [frequency bin, time frame].
            NOTE(review): the axis lookups assume 0.05 s per column over a
            9.6 s window and FFT bins for sr=8000 -- confirm against the
            preprocessing step.
        running: When true, ``time_reference`` is added to the begin/end
            times.  When false, times are reported relative to the start of
            this map (the previous implementation of that branch referenced
            undefined names and always raised ``NameError``).
        time_reference: Offset in seconds of this map within the recording.

    Returns:
        List of ``[begin_s, end_s, low_hz, high_hz, confidence]`` rows; empty
        when no ROI is found.
    """
    sr = 8000
    WINDOW_SIZE_SEC = 0.15175
    HOP_LEN_SEC = 0.05

    # Axis lookups used to convert pixel indices to physical units.
    n_fft = int(WINDOW_SIZE_SEC * sr)
    frequencies = librosa.core.fft_frequencies(sr=sr, n_fft=n_fft)
    times = np.arange(0, 9.6, HOP_LEN_SEC)

    # Smooth the map so that small sparse details close to each other merge,
    # which makes the mask creation easier.
    Sxx_db_noNoise_smooth = sound.smooth(spectrogram, std=0.5,
                             display=False, savefig=None)

    # Binarise the smoothed map with the double thresholding technique.
    im_mask = rois.create_mask(im=Sxx_db_noNoise_smooth, mode_bin ='relative',
                               bin_std=16, bin_per=0.5,
                               verbose=False, display=False)

    # Group pixels belonging to the same event; min_roi=30 is the size cut-off
    # passed to select_rois.
    im_rois, df_rois = rois.select_rois(im_mask, min_roi=30, max_roi=None,
                                     display= False)

    # Nothing detected: avoid indexing columns of an empty result.
    if df_rois is None or len(df_rois) == 0:
        return []

    rects=df_rois[["min_x","max_x","min_y","max_y"]]

    # Loop-invariant normaliser for the confidence value.
    max_compar = np.max(spectrogram)
    # Only "running" mode places the box on the global timeline.
    offset = time_reference if running else 0.0

    boxes=[]
    for _, box in rects.iterrows():
        x1,x2,y1,y2=box
        # NOTE(review): the -1 offsets assume select_rois reports 1-based pixel
        # coordinates; with 0-based coordinates an ROI touching the first
        # row/column would wrap to index -1 and be dropped by this check.
        if times[x1-1]<times[x2-1] and frequencies[y1-1]<frequencies[y2-1]:
            roi = spectrogram[y1-1:y2, x1-1:x2]
            # Confidence: mean intensity of the ROI relative to the map maximum.
            confidence = np.mean(roi) / max_compar
            boxes.append([times[x1-1]+offset,times[x2-1]+offset,frequencies[y1-1],frequencies[y2-1],confidence])
    return boxes

In [13]:
def train_model(train_dataset,save=True,latent_dim=2000,epochs=5,model_path="test_vae_mod_pcen"):
    """Build a VAE, fit it on ``train_dataset`` and optionally save it.

    Args:
        train_dataset: Training data passed straight to ``vae.fit``.
        save: When true, the fitted model is written to ``model_path``.
        latent_dim: Encoder output width handed to ``build_model``.
        epochs: Number of training epochs.
        model_path: Destination used by ``vae.save``.
    """
    vae=build_model(latent_dim,None)
    vae.fit(train_dataset, epochs=epochs)
    if save:
        # Persist the whole model so a later cell can reload it from disk.
        vae.save(model_path)

In [14]:
def error_dataset(vae,data,full=True,sr=None):
    """Return the unreduced mean-absolute reconstruction error of ``vae`` on ``data``.

    Args:
        vae: Model whose ``predict`` yields three outputs, the first being
            the reconstruction.
        data: Inputs to reconstruct; also used as the comparison target.
            NOTE(review): ``data`` is passed directly to the loss, so it
            presumably has to be array-like rather than a tf.data pipeline
            -- confirm against process_wav.
        full: Accepted but not used.
        sr: Accepted but not used.

    Returns:
        NumPy array produced by ``MeanAbsoluteError`` with ``Reduction.NONE``.
    """
    reconstruction, _mean, _logvar = vae.predict(data)
    unreduced_mae = tf.keras.losses.MeanAbsoluteError(
        reduction=tf.keras.losses.Reduction.NONE
    )
    return unreduced_mae(data, reconstruction).numpy()

In [15]:
def write_array_to_file(array, headers, filename):
    """Write ``headers`` and then each row of ``array`` as tab-separated lines.

    Args:
        array: Iterable of rows; every cell is converted with ``str``.
        headers: Iterable of column titles (already strings).
        filename: Path of the file to create or overwrite.
    """
    def _tsv_lines():
        # Header first, then one line per data row.
        yield '\t'.join(headers) + '\n'
        for record in array:
            yield '\t'.join(str(cell) for cell in record) + '\n'

    with open(filename, 'w') as out:
        out.writelines(_tsv_lines())

In [16]:
def run_model(non_normal_scores):
    """Collect bounding boxes for every score map on a shared timeline.

    Each element of ``non_normal_scores`` is handed to
    ``apply_bounding_boxes`` in running mode with a time offset that grows by
    9.6 seconds per element.

    Returns:
        NumPy array built from the concatenated list of boxes.
    """
    collected = []
    offset = 0.0
    for score_map in non_normal_scores:
        collected.extend(apply_bounding_boxes(score_map, True, offset))
        offset = offset + 9.6
    return np.array(collected)

# Code to train and get initial bounding boxes

In [17]:
# Recording used to fit the VAE and recording to run detection on.
background_noise = "avila_filtered.wav"
predicting_file = "6805.230201070825_processed.wav"
print(predicting_file)

# Get predicting file

# Low-level S3 client used for the download below.
# NOTE(review): this client uses region 'us-west-2' while the session created
# in the setup cell uses 'us-east-1' -- confirm which one is intended.
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name='us-west-2'
)
# NOTE(review): this overrides the bucket name chosen in the setup cell.
# `bucket` is reassigned here but not used anywhere else in this cell.
bucket_name = 'whale-recordings'
bucket = s3.Bucket(bucket_name)

# NOTE(review): WAV_FILES is not defined in this notebook; presumably it comes
# from preprocessing.ipynb.  Each entry is used as (local filename, S3 key).
for wf in WAV_FILES:
    if predicting_file == wf[0]:
        # Download WAV file from S3 Bucket
        s3_client.download_file(bucket_name, wf[1], wf[0])

np.set_printoptions(suppress=True)
print("training")
# process_wav is not defined in this notebook (presumably from
# preprocessing.ipynb); it is unpacked here as a (dataset, sample rate) pair.
dataset_train,sr = process_wav(background_noise)
# Debugging stop, left disabled.
#sys.exit(0)
print("testing")
# `sr` is overwritten by the value returned for the prediction file.
dataset_test,sr = process_wav(predicting_file, running = True)
print("finshed")

6805.230201070825_processed.wav
training


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Fit the VAE on the background-noise dataset; the second argument is
# save=True, so the fitted model is written to disk by train_model.
print("training for real")
train_model(dataset_train,True)

In [None]:
# Reload the model saved by train_model and score the test data.
print("In the vae cell")
vae=keras.models.load_model("test_vae_mod_pcen", custom_objects={"vae_loss_function": vae_loss_function})
# The third positional argument (full=False) and sr are accepted by
# error_dataset but not used by it.
non_normal_scores=error_dataset(vae,dataset_test,False,sr=sr)

In [None]:
# Turn the reconstruction-error maps into bounding boxes and write them out
# as a tab-separated table.
print("In here")
bounding_boxes=run_model(non_normal_scores)
# Column titles, in the same order as the values of each box row.
titles=["Begin Time (s)","End Time (s)","Low Freq (Hz)","High Freq (Hz)","Species confidence"]
# Output name: the part of predicting_file before the first underscore,
# followed by "_predictions.txt".
write_array_to_file(bounding_boxes,titles,predicting_file.split('_')[0] + "_predictions.txt")
print("done modeling")

In [18]:
# Same file names as assigned in the data-preparation cell above.
background_noise = "avila_filtered.wav"
predicting_file = "6805.230201070825_processed.wav"
# Bare expression: displays the repr of the training dataset as the cell output.
dataset_train

<_BatchDataset element_spec=TensorSpec(shape=(None, 608, 192, 1), dtype=tf.float64, name=None)>