In [None]:
# This notebook creates spectrogram images from a directory of sound files in .wav format.
# The code is optimized to be processed in parallel on multi-core machines.

In [None]:
# Spectrogram generation library
# https://librosa.org/doc/latest/index.html
import sys
!conda install --yes --prefix {sys.prefix} librosa


In [None]:
# imports

# With the exception of the librosa library installed above, all of these modules are 
# either included in the code base or provided by default on Amazon Sagemaker. 

import gc
import glob
import io
import multiprocessing as mp
import os
import os.path
import pickle
import pprint
import random
import time
from multiprocessing import Pool
from pathlib import Path

import librosa
import matplotlib.pyplot as plt
import numpy as np
from db import NABat_DB
from IPython.display import Image
from PIL import Image
from spectrogram import Spectrogram


In [None]:
# Test we have a valid database and enumerate the species represented.
# `species` stays at notebook scope: later cells iterate over it to count the
# .wav files per species code and to translate species codes into numeric ids.
db = NABat_DB()
try:
    species = db.query(' select * from species;')
    pprint.pprint(species)
finally:
    # Release the connection even when the query or the printing fails, so a
    # broken database does not leave an open handle behind in the kernel.
    db.conn.close()


In [None]:
# Point to a directory containing .wav files organized by species code:
# one sub-directory per species code, with that species' recordings inside it.
# Example "../v1.1.0/data/wav/ANPA/p163_g89522_f28390444.wav"
# Later cells glob this directory both per species code and recursively.
directory = '../v1.1.0/data/wav'

In [None]:
# Optional step.
# Draw a bar chart showing how many .wav recordings exist for each species code,
# to get a feel for how balanced the input data is.

plt.close(fig='all')

# Element 1 of every species row is used as the species code, which is also the
# name of the sub-directory holding that species' recordings.
class_names = [row[1] for row in species]
wav_count = [
    len(glob.glob('{}/{}/*.wav'.format(directory, code), recursive=True))
    for code in class_names
]

labels = class_names
count = np.array(wav_count)
print('Median files per class: ', np.median(count)//1)

figure = plt.figure(figsize=(40, 10))
width = 0.35 # the width of the bars: can also be len(x) sequence
plt.bar(labels, count, width, color="#4a2eff", label='.wav Files')

# Axis labels, title and legend so the figure can be read on its own.
plt.xlabel('NABat Species Code')
plt.ylabel('Count Files')
plt.title('Count of .wav Files per Species Code')
plt.legend()
plt.show()


In [None]:

# Get the working directory and define an output directory for spectrograms.
# NOTE(review): dir_path is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
dir_path = os.getcwd()
# Root folder (relative path) for the generated spectrogram images.
# NOTE(review): code that writes images must reference this exact name,
# `create_spectrogram_location`; a misspelling raises NameError at run time.
create_spectrogram_location = 'data/images'

# Given a species code, return a numeric id.
def get_manual_id(species_code):
    """Look up the numeric id that belongs to `species_code`.

    Walks the notebook-level `species` rows in order and returns the `id` of
    the first row whose `species_code` attribute equals the argument.
    Returns None when no row matches.
    """
    matching_ids = (row.id for row in species if row.species_code == species_code)
    return next(matching_ids, None)


# This method is meant to be called in parallel and will take a single file path
# and produce a spectrogram for each bat pulse detected within the recording.
def process_file(file_name):
    """Register one .wav recording in the database and render its pulses.

    The recording is randomly assigned to the train / test / validate split,
    analysed with `Spectrogram.process_file`, stored through
    `NABat_DB.add_file`, and every entry of the returned metadata is stored
    through `NABat_DB.add_pulse`. For each pulse the database accepts, a
    spectrogram image is written to
    `{create_spectrogram_location}/{species code}/{file base name}/t_{offset}.png`.

    Parameters
    ----------
    file_name : str
        Path to a .wav file laid out as `<root>/<species code>/<name>.wav`,
        where `<name>` follows `p{project_id}_g{grts_id}_f{file_id}`.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If the file's base name contains no underscore-separated second field
        to read the GRTS id from.
    """
    # Randomly and proportionally assign files to the train, validate, and test sets.
    # 80% train, 10% test, 10% validate
    r = random.random()
    if r < 0.80:
        draw = 'train'
    elif r < 0.90:
        draw = 'test'
    else:
        draw = 'validate'

    # Get metadata about the recording from the file name. The expected naming convention is:
    # p{nabat_project_id}_g{nabat_grts_id}_f{nabat_file_id}.wav
    # Example: "p163_g89522_f28390444.wav"
    wav_path = Path(file_name)
    species_code = wav_path.parent.name
    file_name_base = wav_path.stem
    manual_id = get_manual_id(species_code)
    # Parse the GRTS id from the base name only, so underscores in directory
    # names cannot shift the fields. [1:] drops the leading 'g'.
    grts_id = file_name_base.split('_')[1][1:]

    # Open a new database connection; each parallel call uses its own.
    db = NABat_DB()
    try:
        # Process file and return pulse metadata.
        spectrogram = Spectrogram()
        d = spectrogram.process_file(file_name)

        # Add the file to the database. add_file hands back a draw value as
        # well — presumably the split actually recorded; verify against NABat_DB.
        file_id, draw = db.add_file(
                        d.name, d.duration, d.sample_rate, manual_id, grts_id, draw=draw)

        # All images of one recording share a single folder.
        out_dir = '{}/{}/{}'.format(create_spectrogram_location, species_code, file_name_base)

        # For each pulse within file...
        for m in d.metadata:
            # ...create a place to put the spectrogram. The folder is created
            # here (not before the loop) so files without pulses leave no
            # empty folder behind.
            path = '{}/t_{}.png'.format(out_dir, m.offset)
            Path(out_dir).mkdir(parents=True, exist_ok=True)

            # Add the pulse to the database. The literal 0 and None arguments
            # are passed through unchanged; their meaning is defined by
            # NABat_DB.add_pulse.
            pulse_id = db.add_pulse(file_id, m.frequency,
                                      m.amplitude, 0, m.offset, m.time, None, path)
            # On success...
            if pulse_id:
                # ...create a spectrogram image surrounding the pulse and save to disk.
                img = spectrogram.make_spectrogram(m.window, d.sample_rate)
                try:
                    img.save(path)
                finally:
                    # Free the image even when writing it to disk fails.
                    img.close()
    finally:
        # Close the database connection, also when processing raised.
        db.conn.close()
      


In [None]:


# Use as many worker processes as we can, leaving one core available to keep the
# notebook responsive. Never go below one worker: on a single-core machine
# cpu_count() - 1 is 0 and Pool(0) raises ValueError.
thread_count = max(1, mp.cpu_count() - 1)
print('using {} threads'.format(thread_count))

# Gather wav files.
files = glob.glob('{}/**/*.wav'.format(directory), recursive=True)

# Work in chunks of roughly 1% of the files so progress can be reported.
# Clamp to at least one file per chunk: with fewer than 100 files the raw value
# is 0, which is not a valid step for range().
progress = max(1, int(len(files) * 0.01))

# Start the creation process in parallel and report progress.
for i in range(0, len(files), progress):
    # A new pool is created for every chunk, so its worker processes are torn
    # down when the `with` block exits — presumably to keep worker memory bounded.
    with Pool(thread_count) as p:
        p.map(process_file, files[i:i+progress])
        gc.collect()
        # Report the share of files completed so far; the last chunk may be
        # shorter than `progress`, hence the min().
        print('{}%'.format(int(100 * min(i + progress, len(files)) / len(files))))


In [None]:
# Done!