# Generate Dataset

By Alejandro Vega & Ian Flores

### Loading the necessary dependencies
Installing these dependencies, together with the Python version used here (3.6), is easiest with Anaconda.

In [None]:
import _pickle as cpl
import bisect
import os
import shutil
import subprocess
import tarfile
import wave

import numpy as np
import openpyxl
import pandas as pd
import pylab
import yaml

### Audio Preprocessing

Since our dataset contains .flac files, we need to convert them to .wav format before transforming them into spectrograms. To do this, we call the 'sox' command-line tool, so you need to have 'sox' installed on your computer. On Ubuntu 17.10 the command is straightforward: 'sudo apt install sox'. After the conversion, we store all the .wav files in a directory called 'wav_recordings'.

In [None]:
# List the raw recordings once and split the names by audio format.
recordings = os.listdir("../dataset/recordings")
flac_files = [name for name in recordings if name.endswith(".flac")]
wav_files = [name for name in recordings if name.endswith(".wav")]

In [None]:
# Convert all the .flac files to .wav files and store them in 'wav_recordings'.
# The source is the directory that was listed to build `flac_files`; the
# destination is the directory the later cells read the .wav files from.
flac_dir = '../dataset/recordings'
wav_dir = '../dataset/wav_recordings'
os.makedirs(wav_dir, exist_ok=True)  # sox does not create the output directory
for flac_name in flac_files:
    wav_name = os.path.splitext(flac_name)[0] + '.wav'
    # Passing an argument list (no shell) keeps file names containing spaces
    # or shell metacharacters intact.
    result = subprocess.run(
        ['sox', os.path.join(flac_dir, flac_name), os.path.join(wav_dir, wav_name)]
    )
    if result.returncode != 0:
        # Keep converting the remaining files, but do not hide the failure.
        print('sox failed to convert ' + flac_name)

In [None]:
# Store the recordings that are already .wav in 'wav_recordings'.
# The files are taken from the directory that was listed to build `wav_files`
# and moved to the directory the later cells read the .wav files from.
recordings_dir = '../dataset/recordings'
wav_dir = '../dataset/wav_recordings'
os.makedirs(wav_dir, exist_ok=True)  # otherwise the first move would rename a file to 'wav_recordings'
for wav_name in wav_files:
    # shutil.move avoids building a shell command, so any file name is safe.
    shutil.move(os.path.join(recordings_dir, wav_name), os.path.join(wav_dir, wav_name))

### Transformations and Spectrogram Informations

In this section, we first get the information from the .wav files, then the information regarding the spectrogram and its Region of Interest (ROI), which is where the animal call is focused. Once all this information is collected, we proceed to plot the spectrogram for the full recording.

In [None]:
# Get the info from the .wav file.
def wavInfo(rec_file):
    """Read the samples and the sampling rate of a 16-bit PCM .wav file.

    Parameters
    ----------
    rec_file : str
        Path of the .wav file to read.

    Returns
    -------
    wave_info : numpy.ndarray
        All frames of the file decoded as signed 16-bit integers, in file
        order (no per-channel splitting is done here).
    framerate : int
        Sampling rate stored in the file header.
    """
    # The context manager closes the file even when reading fails.
    with wave.open(rec_file, 'r') as wav_file:
        frames = wav_file.readframes(-1)
        framerate = wav_file.getframerate()
    # All .wavs in our dataset are 16 bit, and WAV PCM data is little-endian.
    # frombuffer replaces the fromstring/'Int16' spelling that current NumPy
    # rejects; copy() hands back an array that owns writable memory.
    wave_info = np.frombuffer(frames, dtype='<i2').copy()
    return wave_info, framerate

# Get the info from the spectrogram, but don't plot it.
def specInfo(rec_file):
    """Compute the spectrogram of a .wav recording without drawing anything.

    Parameters
    ----------
    rec_file : str
        Path of the .wav file, passed on to wavInfo.

    Returns
    -------
    spectrum : 2-D array
        Spectrogram values as returned by mlab.specgram.
    freqs : 1-D array
        Frequencies as returned by mlab.specgram.
    t : 1-D array
        Segment times as returned by mlab.specgram.
    """
    wave_info, framerate = wavInfo(rec_file)
    # pylab.specgram computes these arrays with mlab.specgram and then draws
    # them on the current axes. Calling mlab.specgram directly gives the same
    # three arrays without creating or touching a figure.
    spectrum, freqs, t = pylab.mlab.specgram(
        wave_info, NFFT=512, noverlap=256, window=pylab.window_hanning, Fs=framerate
    )
    return spectrum, freqs, t

In [None]:
# Search for the index of the leftmost value in an ordered array
# (of times or frequencies in our case) that still meets our criteria.
def leftmostBinSearch(A, lo, hi, target):
    """Return the smallest index i in [lo, hi] such that A[i] >= target.

    Parameters
    ----------
    A : sequence
        Values sorted in ascending order.
    lo, hi : int
        Inclusive bounds of the index range to search.
    target : number
        Lower limit the returned element has to reach.

    Raises
    ------
    ValueError
        If no element of A[lo..hi] is >= target (this includes an empty
        range).
    """
    # bisect_left finds the first position whose value is >= target, so an
    # element equal to target is never skipped in favour of its right-hand
    # neighbour. Its upper bound is exclusive, hence hi + 1.
    index = bisect.bisect_left(A, target, lo, hi + 1)
    if index > hi:
        raise ValueError(
            'no value >= {!r} between indices {} and {}'.format(target, lo, hi)
        )
    return index

# Search for the index of the rightmost value in an ordered array
# (of times or frequencies in our case) that still meets our criteria.
def rightmostBinSearch(A, lo, hi, target):
    """Return the largest index i in [lo, hi] such that A[i] <= target.

    The returned index is inclusive: A[i] itself still satisfies the limit.

    Parameters
    ----------
    A : sequence
        Values sorted in ascending order.
    lo, hi : int
        Inclusive bounds of the index range to search.
    target : number
        Upper limit the returned element must not exceed.

    Raises
    ------
    ValueError
        If no element of A[lo..hi] is <= target (this includes an empty
        range).
    """
    # bisect_right returns the position just past the last value <= target;
    # one step back is that value's own index. Its upper bound is exclusive,
    # hence hi + 1.
    index = bisect.bisect_right(A, target, lo, hi + 1) - 1
    if index < lo:
        raise ValueError(
            'no value <= {!r} between indices {} and {}'.format(target, lo, hi)
        )
    return index
    
# Calls on leftmostBinSearch and rightmostBinSearch.
def getBounds(A, minVal, maxVal):
    """Return the pair of indices in A selected for minVal and maxVal.

    The first index is the result of leftmostBinSearch for minVal and the
    second the result of rightmostBinSearch for maxVal; both searches cover
    the whole of A (indices 0 to len(A) - 1).
    """
    last = len(A) - 1
    return (
        leftmostBinSearch(A, 0, last, minVal),
        rightmostBinSearch(A, 0, last, maxVal),
    )

In [None]:
def specMod(spectrum, freqs, times, f1, f2, t1, t2):
    """Cut a rectangular region out of a spectrogram.

    Row f1 is always included; rows f1+1 up to (but not including) f2 follow.
    From every selected row the columns t1 up to (but not including) t2 are
    kept. `freqs` and `times` are accepted but not used.

    Returns a list with one column slice per selected row.
    """
    first_row = [spectrum[f1][t1:t2]]
    following_rows = [spectrum[row][t1:t2] for row in range(f1 + 1, f2)]
    return first_row + following_rows

# Plots the spectrogram
def plotModSpecSimple(specMod, freqs, times, file):
    """Draw a spectrogram as a borderless image, save it and return the figure.

    `specMod` is plotted as 10 * log10(specMod) against `times` (x) and
    `freqs` (y). The axes fill the whole figure and are hidden, and the
    result is written to `file` with pylab.savefig.
    """
    fig, ax = pylab.subplots(1)
    decibels = 10 * pylab.log10(specMod)
    pylab.pcolormesh(times, freqs, decibels)
    # Let the plot use the entire canvas: no margins, no axis decorations.
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
    for axis_mode in ('tight', 'off'):
        ax.axis(axis_mode)
    pylab.savefig(file)
    return fig

In [None]:
# Creates the speciesData dictionary, which holds all the information we have
# regarding the different species.
def speciesData(workbook):
    """Collect the 'ROIs' sheet of an Excel workbook into a per-species dict.

    Expected sheet layout (columns A to F), one ROI per row below a header
    row: species name, start_time, end_time, min_freq, max_freq,
    recording name.

    Returns a dict keyed by the value of the first column. Each value is a
    dict keyed by the header cell of every further column, holding the list
    of that column's values for the species in row order. '.wav' is appended
    to the value of column index 5, since we are dealing with wav files.
    """
    rows = list(openpyxl.load_workbook(workbook)['ROIs'].iter_rows())
    # The first row carries the column names that become the per-species keys.
    header = rows.pop(0)
    dataset = {}
    for row in rows:
        attributes = dataset.setdefault(row[0].value, {})
        for col in range(1, len(row)):
            value = row[col].value
            if col == 5:
                value += '.wav'
            attributes.setdefault(header[col].value, []).append(value)
    return dataset

# Convert the speciesData dictionary to YAML and save the file.
def dataToYAML(data, name):
    """Write `data` as block-style YAML to '../dataset/<name>'.

    Parameters
    ----------
    data : object
        Object handed to yaml.dump (the species dictionary in this notebook).
    name : str
        File name, appended to '../dataset/'.
    """
    path = '../dataset/' + name
    # Mode 'w' truncates an existing file, so no separate delete is needed;
    # the with-block closes the file even if dumping fails.
    with open(path, 'w') as dataset:
        yaml.dump(data, dataset, default_flow_style=False)

# As its name suggests, it finds the maximum.
def findMax(L):
    """Return the largest element of L, or float('-inf') when L is empty.

    The comparison is done by the built-in max, so the elements have to be
    mutually comparable.
    """
    return max(L, default=float('-inf'))

# As its name suggests, it finds the minimum.
def findMin(L):
    """Return the smallest element of L, or float('inf') when L is empty.

    The comparison is done by the built-in min, so the elements have to be
    mutually comparable.
    """
    return min(L, default=float('inf'))

In [None]:
def simplifiedSpeciesData(data):
    """Reduce the per-ROI lists of every species to one summary per species.

    For each species in `data` the result holds:
      'min_freq'        findMin of its 'min_frequency' list
      'max_freq'        findMax of its 'max_frequency' list
      'delta_time'      findMax of 'end_time' minus findMin of 'start_time'
      'recording name'  its 'recording name' list, passed through unchanged
    """
    simplDat = {}
    for species, rois in data.items():
        min_freqs = rois['min_frequency']
        max_freqs = rois['max_frequency']
        start_times = rois['start_time']
        end_times = rois['end_time']
        lowest = findMin(min_freqs)
        highest = findMax(max_freqs)
        earliest = findMin(start_times)
        latest = findMax(end_times)
        simplDat[species] = {
            'min_freq': lowest,
            'max_freq': highest,
            'delta_time': (latest - earliest),
            'recording name': rois['recording name'],
        }
    return simplDat

### Validating Data

Given that this data was manually labeled, which is very time-intensive, we have fewer validations than recordings, so we have to make sure to only handle the recordings for which we have validations.

In [None]:
# Loads the validation data; the with-block closes the workbook afterwards.
with pd.ExcelFile('../dataset/validationsAndROIs.xlsx') as validation_book:
    df = validation_book.parse('ROIs')

# Gets the name of all the recordings, without anything after the first dot.
# partition() leaves a name that has no dot unchanged instead of raising.
all_wav_files = [
    file_name.partition('.')[0]
    for file_name in os.listdir("../dataset/wav_recordings")
]

# Extracts the 'recording name' column as a list and formats each name the
# same way, removing everything after the first dot.
recording_name = [name.partition('.')[0] for name in df["recording name"].tolist()]
df['recording name'] = recording_name

# If we don't have a recording, then the validation data is not useful by
# itself, so only the rows whose recording is present are kept.
df = df[df['recording name'].isin(set(all_wav_files))]

# Writes out the corrected validation data. Leaving the with-block saves and
# closes the workbook (ExcelWriter.save() no longer exists in pandas 2).
with pd.ExcelWriter('../dataset/corrected_validationsAndROIs.xlsx') as writer:
    df.to_excel(writer, sheet_name='ROIs', index=False)

# Standardizes the names of all the recordings: <name before first dot>.wav
for files in os.listdir("../dataset/wav_recordings"):
    new_name = files.partition('.')[0]
    os.rename("../dataset/wav_recordings/" + files, "../dataset/wav_recordings/" + new_name + '.wav')

### Exporting Data

We need to export the different species recordings in pickle format, because it is easier to manipulate later on. We also want to export the dictionary to a .yaml file for later use, and we want a compressed version of this data in case it becomes necessary.

In [None]:
# Build the species data dictionary from the corrected workbook and keep a
# copy of it as a .yaml file for later use.
workbook = '../dataset/corrected_validationsAndROIs.xlsx'
data = speciesData(workbook=workbook)
dataToYAML(data=data, name='dataset.yaml')

In [None]:
def getRawSpecDataset(dataset, path='../dataset'):
    """Render one spectrogram image per ROI and collect the figures per species.

    The directory '<path>/spectrogram_roi_dataset' is deleted if it exists and
    created again with one sub-directory per species. For every ROI of a
    species the recording is read from '<path>/wav_recordings/', its
    spectrogram is cut down to the ROI, and the result is saved as
    '<species>_spec_<k>.png' (k counts from 1).

    Parameters
    ----------
    dataset : dict
        Maps a species name to a dict holding the parallel lists
        'min_frequency', 'max_frequency', 'start_time', 'end_time' and
        'recording name' (one entry per ROI).
    path : str
        Root directory that contains 'wav_recordings' and receives the output.

    Returns
    -------
    list
        One single-entry dict per species: {species_name: [figure, ...]}.
    """
    # Always start from an empty output directory.
    dataset_path = path + '/spectrogram_roi_dataset'
    if os.path.exists(dataset_path):
        shutil.rmtree(dataset_path)
    os.makedirs(dataset_path)

    # image data to be pickled
    specs = []

    for s, rois in dataset.items():
        s_dir = dataset_path + '/' + s
        os.makedirs(s_dir)  # make a directory per species
        s_spec = []

        # load species ROI data
        min_freqs = rois['min_frequency']
        max_freqs = rois['max_frequency']
        starts = rois['start_time']
        ends = rois['end_time']
        recs = rois['recording name']

        for i, rec_name in enumerate(recs):
            # Recordings live under the same root as the output, so a custom
            # `path` is honoured here too.
            rec = path + '/wav_recordings/' + rec_name
            spectrum, freqs, times = specInfo(rec)  # entire spectrogram of rec

            # find the closest times and freqs that match the ROI limits
            t_start, t_end = getBounds(times, starts[i], ends[i])
            f_start, f_end = getBounds(freqs, min_freqs[i], max_freqs[i])

            # Cut spectrum, freqs and times down to the ROI. All three slices
            # stop just before the right-hand index returned by getBounds.
            spectrumMod = specMod(spectrum, freqs, times, f_start, f_end, t_start, t_end)
            freqMod = freqs[f_start:f_end]
            timeMod = times[t_start:t_end]
            filename = s_dir + '/' + s + '_spec_' + str(i + 1) + '.png'

            # plot the spectrogram of the ROI and save the image
            f = plotModSpecSimple(spectrumMod, freqMod, timeMod, filename)
            s_spec.append(f)  # append figure to the species' list of ROI spectrograms
            # Actually call close (the bare name `pylab.close` did nothing),
            # so pyplot does not keep every figure open. The Figure object in
            # s_spec stays valid, but is no longer registered with pyplot.
            pylab.close(f)

        # add dictionary with key <species_name> and value <list_of_spectrogram_figures>
        specs.append({s: s_spec})

    return specs

In [None]:
# Reload the species dictionary from the .yaml file and build the ROI
# spectrograms from it. safe_load only builds plain Python data and, unlike
# yaml.load(stream), works without an explicit Loader argument.
with open('../dataset/dataset.yaml', 'r') as yamlData:
    dataset = yaml.safe_load(yamlData)
data = getRawSpecDataset(dataset)

In [None]:
# Pickles the data
def serializeDataset(obj, path='../dataset'):
    """Write one pickle file per species into '<path>/pickle_data'.

    The directory is removed first if it already exists, then created empty.
    `obj` is an iterable of dicts; for each dict the first key is taken as the
    species name and its value is dumped to '<species>.pickle'.
    """
    pickle_path = path + '/pickle_data'
    # Start from an empty directory so pickles of an earlier run do not linger.
    if os.path.exists(pickle_path):
        shutil.rmtree(pickle_path)
    os.makedirs(pickle_path)
    for entry in obj:
        species = list(entry)[0]
        picklename = pickle_path + '/' + species + '.pickle'
        with open(picklename, 'wb+') as handle:
            cpl.dump(entry[species], handle)


In [None]:
# Pickle the per-species figure lists returned by getRawSpecDataset, using the
# default location ('../dataset/pickle_data').
serializeDataset(data)

In [None]:
# Compresses the data
def archiveAndCompress(path):
    """Pack the contents of a directory into '<path>.tar.bz2' next to it.

    Every entry directly inside `path` is added under its own name, so the
    archive does not contain the directory itself as a top-level folder.

    Parameters
    ----------
    path : str
        Directory to archive. A trailing slash is ignored.
    """
    # normpath drops a trailing slash and leaves a bare relative name
    # relative, so the archive is always created beside the directory,
    # never inside it and never at the filesystem root.
    source = os.path.normpath(path)
    archive_name = source + '.tar.bz2'
    with tarfile.open(archive_name, 'w:bz2') as archive:
        # Sorted so that the member order does not depend on the file system.
        for entry in sorted(os.listdir(source)):
            archive.add(os.path.join(source, entry), arcname=entry)

In [None]:
# Archive both generated directories: the ROI spectrogram images and the pickles.
for _generated_dir in ('../dataset/spectrogram_roi_dataset', '../dataset/pickle_data'):
    archiveAndCompress(_generated_dir)