# About Dataset
* Z by HP Unlocked Challenge 3
* Z by HP Unlocked Challenge 3 - Audio Recognition - Special thanks to Hunter Kempf for helping create this challenge!
* Watch the tutorial video here: https://youtu.be/9Txxl0FJZas

# CapuchinBird

# The Task
The Challenge is to build a Machine Learning model and code to count the number of Capuchinbird calls within a given clip. This can be done in a variety of ways and we would recommend that you do some research into various methods of audio recognition.

# What is Unlocked?
Unlocked is an action-packed interactive film made by Z by HP for data scientists. Sharpen your skills and solve the data driven mystery here: https://www.hp.com/us-en/workstations/industries/data-science/unlocked-challenge.html

# The Data
The Data is split into Training and Testing Data. For Training Data we have provided enough clips to get a decent model but you can also find, parse, augment and use additional audio clips to improve your model performance.

# Training Sets
In order to download and properly build our Training sets we have provided details and some example code for how to interact with the files.

# 1. Load Dependencis

In [1]:
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_io as tfio
import plotly.graph_objects as go

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten, Dropout,BatchNormalization



# 2. Build Data Loading Function

#### Define Paths To Files ##

In [2]:
CAPUCHIN_FILE = os.path.join('data', '/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Capuchinbird_Clips','XC114131-0.wav')
NOT_CAPUCHIN_FILE = os.path.join('data', '/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Not_Capuchinbird_Clips', 'afternoon-birds-song-in-forest-0.wav')


#### Build Dataloading Function ##

In [3]:
def load_wav_16k_mono(filename):
    # Load encoded wav file
    file_contents = tf.io.read_file(filename)
    # Decode wav (tensors by channels) 
    wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    # Removes trailing axis
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    # Goes from 44100Hz to 16000hz - amplitude of the audio signal
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

#### Plot Wave ##

In [4]:
# Load the audio files
wave = load_wav_16k_mono(CAPUCHIN_FILE)
nwave = load_wav_16k_mono(NOT_CAPUCHIN_FILE)

# Create the Plotly figure
fig = go.Figure()

# Add the first trace
fig.add_trace(go.Scatter(y=wave, mode='lines', name='Capuchin'))

# Add the second trace
fig.add_trace(go.Scatter(y=nwave, mode='lines', name='Not Capuchin'))

# Set the layout
fig.update_layout(title='Waveform of Capuchin and Not Capuchin Audio Files',
                  xaxis_title='Sample',
                  yaxis_title='Amplitude')

# Show the plot
fig.show()

# 3. Create Tensorflow Dataset

#### Define paths to positive and negative data

In [5]:
POS = os.path.join('data','/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Capuchinbird_Clips')
NEG = os.path.join('data','/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Not_Capuchinbird_Clips')

In [6]:
pos_files = os.listdir(POS)
neg_files = os.listdir(NEG)

print("Positive files:", pos_files)
print("Negative files:", neg_files)


Positive files: ['XC27882-2.wav', 'XC227469-2.wav', 'XC456236-3.wav', 'XC227471-6.wav', 'XC227468-3.wav', 'XC513083-1.wav', 'XC216012-12.wav', 'XC9892-0.wav', 'XC27882-0.wav', 'XC227469-3.wav', 'XC433953-10.wav', 'XC79965-10.wav', 'XC526106-4.wav', 'XC46241-4.wav', 'XC600460-0.wav', 'XC201990-4.wav', 'XC388470-5.wav', 'XC114132-4.wav', 'XC227471-3.wav', 'XC9221-0.wav', 'XC433953-8.wav', 'XC46241-1.wav', 'XC178168-4.wav', 'XC216012-19.wav', 'XC395129-4.wav', 'XC227468-2.wav', 'XC46241-2.wav', 'XC216012-3.wav', 'XC216010-7.wav', 'XC216012-8.wav', 'XC65196-2.wav', 'XC3776-4.wav', 'XC3776-2.wav', 'XC526106-0.wav', 'XC456236-0.wav', 'XC3776-3.wav', 'XC114132-5.wav', 'XC27882-5.wav', 'XC178168-2.wav', 'XC79965-2.wav', 'XC307385-0.wav', 'XC600460-1.wav', 'XC9221-1.wav', 'XC433953-9.wav', 'XC16804-0.wav', 'XC178167-1.wav', 'XC388470-6.wav', 'XC456236-1.wav', 'XC22397-1.wav', 'XC216012-0.wav', 'XC216010-1.wav', 'XC227468-5.wav', 'XC216012-4.wav', 'XC433953-2.wav', 'XC16804-1.wav', 'XC114131-3.w

#### Create Tensorflow Datasets

In [7]:
pos = tf.data.Dataset.list_files(POS + '/*-*.wav')
neg = tf.data.Dataset.list_files(NEG + '/*-*.wav')


#### Add labels and combine positive and negative samples

In [8]:
positives = tf.data.Dataset.zip((pos, tf.data.Dataset.from_tensor_slices(tf.ones(len(pos)))))
negatives = tf.data.Dataset.zip((neg, tf.data.Dataset.from_tensor_slices(tf.zeros(len(neg)))))

In [9]:
data = positives.concatenate(negatives)

# 4. Determine Average Length of a capucin call

#### calculate wave cycle length

In [10]:
lengths = []
for file in os.listdir(os.path.join('data','/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Not_Capuchinbird_Clips')):
    file_path = os.path.join('data','/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Not_Capuchinbird_Clips', file)
#     print("Current file path:", file_path)
    tensor_wave = load_wav_16k_mono(file_path)
    lengths.append(len(tensor_wave))


#### calculate mean, min  and max

In [11]:
tf.math.reduce_mean(lengths)

<tf.Tensor: shape=(), dtype=int32, numpy=49297>

In [12]:
tf.math.reduce_min(lengths)
tf.math.reduce_max(lengths)

<tf.Tensor: shape=(), dtype=int32, numpy=63087>

# 5. Build Preprocessing Function

#### Build Preprocessing Function

In [13]:
def preprocess(file_path, label): 
    wav = load_wav_16k_mono(file_path)
    wav = wav[:48000]
    zero_padding = tf.zeros([48000] - tf.shape(wav), dtype=tf.float32)
    wav = tf.concat([zero_padding, wav],0)
    spectrogram = tf.signal.stft(wav, frame_length=320, frame_step=32)
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.expand_dims(spectrogram, axis=2)
    return spectrogram, label

#### Test out the function and visualization spectrogram

In [14]:
# Load the file and label
filepath, label = positives.shuffle(buffer_size=10000).as_numpy_iterator().next()

# Preprocess the data
spectrogram, label = preprocess(filepath, label)

# Create the Plotly figure
fig = go.Figure(data=[go.Heatmap(z=tf.transpose(spectrogram)[0], colorscale='viridis')])

# Set the layout
fig.update_layout(title=f'Spectrogram of {label}',
                  width=1200, height=800,  # equivalent to figsize=(30,20)
                  xaxis_title='Time',
                  yaxis_title='Frequency')

# Show the plot
fig.show()

# 6. Creating Training and Testing 

#### create a tensorflow data pipeline

In [15]:
data = data.map(preprocess)
data = data.cache()
data = data.shuffle(buffer_size = 10000)
data = data.batch(16)
data = data.prefetch(8)

#### Split into Training and Testing Partitions ##

In [16]:
train = data.take(36)
test = data.skip(36).take(15)

#### Test One Batch

In [17]:
samples, labels = train.as_numpy_iterator().next()

samples.shape

(16, 1491, 257, 1)

# 7. Build Deep Learning Model

#### Build Sequential Model, Compile and View Summary

In [18]:
model = Sequential()
model.add(Conv2D(16,(3,3),activation='relu', input_shape=(1491,257,1)))
model.add(Conv2D(16,(3,3),activation='relu'))
model.add(Flatten())
# model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [19]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [20]:
model.compile('Adam', loss='BinaryCrossentropy', metrics=[tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])


In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 1489, 255, 16)     160       
                                                                 
 conv2d_1 (Conv2D)           (None, 1487, 253, 16)     2320      
                                                                 
 flatten (Flatten)           (None, 6019376)           0         
                                                                 
 dense (Dense)               (None, 1)                 6019377   
                                                                 
Total params: 6021857 (22.97 MB)
Trainable params: 6021857 (22.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Fit Model, View Loss and KPl Plots

In [22]:
history = model.fit(train, epochs=15, validation_data=test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [23]:
model.evaluate(test)



[1.386214307785849e-07, 1.0, 1.0]

In [24]:
model.save('model.h5')


You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.



In [25]:
import plotly.graph_objects as go

# Create the Plotly figure
fig = go.Figure()

# Add the training loss trace
fig.add_trace(go.Scatter(x=list(range(1, len(history.history['loss']) + 1)), 
                         y=history.history['loss'], 
                         mode='lines', 
                         name='Training Loss', 
                         line=dict(color='red')))

# Add the validation loss trace
fig.add_trace(go.Scatter(x=list(range(1, len(history.history['val_loss']) + 1)), 
                         y=history.history['val_loss'], 
                         mode='lines', 
                         name='Validation Loss', 
                         line=dict(color='blue')))

# Set the layout
fig.update_layout(title='Loss',
                  xaxis_title='Epoch',
                  yaxis_title='Loss')

# Show the plot
fig.show()

# 8. Make Prediction on single clip

#### get one batch and make prediction

In [26]:
xtest,ytest = test.as_numpy_iterator().next()

In [27]:
yhat = model.predict(xtest)



## Convert Logits to classes

In [28]:
yhat = [1 if prediction > 0.5 else 0 for prediction in yhat]

# 9. Build Forest Parsing Function

##### load up mp3s

In [29]:
def load_mp3_16k_mono(filename):
    """ Load an MP3 file, convert it to a float tensor, and resample to 16 kHz single-channel audio. """
    res = tfio.audio.AudioIOTensor(filename)
    
    # Convert to tensor and combine channels
    tensor = res.to_tensor()
    tensor = tf.math.reduce_sum(tensor, axis=1) / 2
    
    # Extract sample rate and cast
    sample_rate = res.rate
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    
    # Resample to 16 KHz
    wav = tfio.audio.resample(tensor, rate_in=sample_rate, rate_out=16000)
    
    return wav


In [30]:
mp3 = os.path.join('data','/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Forest Recordings','/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Forest Recordings/recording_00.mp3')

In [31]:
wav = load_mp3_16k_mono(mp3)

In [32]:
audio_slice = tf.keras.utils.timeseries_dataset_from_array(wav, wav, sequence_length=48000, sequence_stride=48000, batch_size=1)

In [33]:
sample, index = audio_slice.as_numpy_iterator().next()

#### Build Function to convert clips into windowed spectrogram

In [34]:
def preprocess_mp3(sample, index):
    sample = sample[0]
    zero_padding = tf.zeros([48000] - tf.shape(sample), dtype=tf.float32)
    wav = tf.concat([zero_padding, sample],0)
    spectrogram = tf.signal.stft(wav, frame_length=320, frame_step=32)
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.expand_dims(spectrogram, axis=2)
    return spectrogram

#### convert longer clips into windows and make predictions

In [35]:
audio_slice = tf.keras.utils.timeseries_dataset_from_array(wav, wav, sequence_length=16000, sequence_stride=16000, batch_size=1)
audio_slice = audio_slice.map(preprocess_mp3)
audio_slice = audio_slice.batch(64)

In [36]:
yhat = model.predict(audio_slice)
yhat  = [ 1 if prediction > 0.5 else 0 for prediction in yhat]



#### Group Consenutive Detectons

In [37]:
from itertools import groupby
yhat = [ key for key, group in groupby(yhat)]
calls = tf.math.reduce_sum(yhat).numpy()

In [38]:
calls

6

# 10 Make Prediction

#### Loop over all recordings and make prediction

In [39]:
results = {}
for file in os.listdir(os.path.join('data','/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Forest Recordings')):
    FILEPATH = os.path.join('data','/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Forest Recordings',file)
    
    wav = load_mp3_16k_mono(FILEPATH)
    audio_slices = tf.keras.utils.timeseries_dataset_from_array(wav, wav, sequence_length=48000, sequence_stride=48000, batch_size=1)
    audio_slices = audio_slices.map(preprocess_mp3)
    audio_slices = audio_slices.batch(64)
    yhat = model.predict(audio_slices)
    results[file] = yhat



## Convert predictions into classes

In [40]:
class_preds = {}
for file, logits in results.items():
    class_preds[file] = [1 if prediction > 0.99 else 0 for prediction in logits]
# class_preds

#### Group Consective Detections

In [41]:
postprocessd = {}
for file, scores in class_preds.items():
    postprocessd[file] = tf.math.reduce_sum([key for key, group in groupby(scores)]).numpy()
postprocessd

{'recording_76.mp3': 0,
 'recording_62.mp3': 2,
 'recording_48.mp3': 4,
 'recording_44.mp3': 1,
 'recording_45.mp3': 3,
 'recording_06.mp3': 5,
 'recording_43.mp3': 5,
 'recording_68.mp3': 1,
 'recording_95.mp3': 5,
 'recording_17.mp3': 10,
 'recording_65.mp3': 5,
 'recording_74.mp3': 0,
 'recording_10.mp3': 5,
 'recording_49.mp3': 0,
 'recording_15.mp3': 2,
 'recording_37.mp3': 3,
 'recording_64.mp3': 3,
 'recording_39.mp3': 10,
 'recording_04.mp3': 4,
 'recording_28.mp3': 14,
 'recording_05.mp3': 0,
 'recording_80.mp3': 1,
 'recording_57.mp3': 3,
 'recording_12.mp3': 0,
 'recording_59.mp3': 14,
 'recording_67.mp3': 0,
 'recording_09.mp3': 0,
 'recording_79.mp3': 0,
 'recording_14.mp3': 0,
 'recording_63.mp3': 11,
 'recording_86.mp3': 12,
 'recording_47.mp3': 14,
 'recording_03.mp3': 0,
 'recording_33.mp3': 0,
 'recording_08.mp3': 24,
 'recording_51.mp3': 3,
 'recording_35.mp3': 0,
 'recording_81.mp3': 5,
 'recording_30.mp3': 3,
 'recording_88.mp3': 0,
 'recording_75.mp3': 1,
 'record

# 11. Export Results

In [42]:
import csv
with open('results.csv','w', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(['recoding', 'capuchin_calls'])
    for key, value in postprocessd.items():
        writer.writerow([key, value])
    