In [1]:
# dependencies
import bisect
import os
import shutil
import tarfile
import wave
import _pickle as cpl # import cPickle

import numpy as np
import openpyxl
import pylab
import yaml

In [2]:
def wavInfo(rec_file):
    """Read every sample of a 16-bit PCM WAV file.

    Parameters
    ----------
    rec_file : str or file-like
        Path (or open binary file object) accepted by ``wave.open``.

    Returns
    -------
    wave_info : numpy.ndarray
        One-dimensional ``int16`` array with the raw samples in file order
        (for multi-channel files the channels stay interleaved).
    framerate : int
        Sampling rate stored in the WAV header.

    Raises
    ------
    ValueError
        If the file does not store 2-byte (16-bit) samples.
    """
    # The context manager closes the file even when reading fails part-way.
    with wave.open(rec_file, 'rb') as wav_file:
        sample_width = wav_file.getsampwidth()
        if sample_width != 2:
            # all .wavs in our dataset are 16bit; anything else would be decoded as garbage
            raise ValueError(
                'expected 16-bit samples, got %d-bit: %r' % (8 * sample_width, rec_file))
        framerate = wav_file.getframerate()
        frames = wav_file.readframes(wav_file.getnframes())
    # ``fromstring`` on binary data and the capitalised 'Int16' dtype alias are not
    # accepted by current NumPy. ``frombuffer`` reads the little-endian PCM bytes and
    # ``astype`` returns a writable, native-endian copy like ``fromstring`` used to.
    wave_info = np.frombuffer(frames, dtype='<i2').astype(np.int16)
    return wave_info, framerate

In [3]:
def specInfo(rec_file):
    """Compute the spectrogram of a WAV recording.

    Parameters
    ----------
    rec_file : str
        Path of the recording, forwarded to ``wavInfo``.

    Returns
    -------
    spectrum, freqs, t
        The three arrays returned by ``specgram`` for a 512-sample Hanning window
        with 256 samples of overlap, at the recording's own sampling rate.
    """
    wave_info, framerate = wavInfo(rec_file)
    # ``pylab.specgram`` also draws the spectrogram on the current axes. Only the
    # numbers are needed here, and that drawing would land on whatever figure is
    # still open, so call the underlying computation directly.
    spectrum, freqs, t = pylab.mlab.specgram(
        wave_info, NFFT=512, Fs=framerate,
        window=pylab.window_hanning, noverlap=256)
    return spectrum, freqs, t

In [4]:
# search for the index of the leftmost value in an ordered array
# (of times or frequencies in our case) that still meets our criteria
def leftmostBinSearch(A, lo, hi, target):
    """Return the index of the leftmost element of ``A[lo..hi]`` that is >= ``target``.

    Parameters
    ----------
    A : sequence
        Values sorted in ascending order.
    lo, hi : int
        Inclusive index range to search.
    target : number
        Lower bound the returned element must reach.

    Returns
    -------
    int
        Smallest index ``i`` in ``[lo, hi]`` with ``A[i] >= target``. When every
        element is below ``target`` the result is clamped to ``hi`` so that it is
        always a valid index.

    Raises
    ------
    ValueError
        If the search range is empty (``hi < lo``).
    """
    if hi < lo:
        raise ValueError('empty search range: lo=%r, hi=%r' % (lo, hi))
    # bisect_left works on the half-open range [lo, hi + 1) and returns hi + 1 when
    # nothing qualifies; clamp that case to the last valid index.
    index = bisect.bisect_left(A, target, lo, hi + 1)
    return min(index, hi)

# search for the index of the rightmost value in an ordered array
# (of times or frequencies in our case) that still meets our criteria
def rightmostBinSearch(A, lo, hi, target):
    """Return the index of the rightmost element of ``A[lo..hi]`` that is <= ``target``.

    Parameters
    ----------
    A : sequence
        Values sorted in ascending order.
    lo, hi : int
        Inclusive index range to search.
    target : number
        Upper bound the returned element must not exceed.

    Returns
    -------
    int
        Largest index ``i`` in ``[lo, hi]`` with ``A[i] <= target``. The index is
        inclusive, so slice with ``right + 1`` to keep that element. When every
        element is above ``target`` the result is clamped to ``lo`` so that it is
        always a valid index.

    Raises
    ------
    ValueError
        If the search range is empty (``hi < lo``).
    """
    if hi < lo:
        raise ValueError('empty search range: lo=%r, hi=%r' % (lo, hi))
    # bisect_right returns the insertion point after the last element <= target,
    # so the element itself sits one position to the left.
    index = bisect.bisect_right(A, target, lo, hi + 1) - 1
    return max(index, lo)

In [5]:
def getBounds(A, minVal, maxVal):
    """Locate the index window of the sorted array ``A`` that brackets [minVal, maxVal].

    Returns a ``(left, right)`` tuple: ``left`` comes from ``leftmostBinSearch``
    for ``minVal`` and ``right`` from ``rightmostBinSearch`` for ``maxVal``, both
    searched over the whole array.
    """
    last_index = len(A) - 1
    return (
        leftmostBinSearch(A, 0, last_index, minVal),
        rightmostBinSearch(A, 0, last_index, maxVal),
    )

In [6]:
def specMod(spectrum, freqs, times, f1, f2, t1, t2):
    """Cut a rectangular region out of a spectrogram.

    Rows ``f1`` up to (but not including) ``f2`` are kept, each sliced to columns
    ``t1:t2``. Row ``f1`` is always present in the result, even when ``f2 <= f1``.
    ``freqs`` and ``times`` are accepted but not used.

    Returns a list with one sliced row per kept frequency bin.
    """
    rows = [spectrum[f1][t1:t2]]
    # check when fix right limit
    rows.extend(spectrum[row][t1:t2] for row in range(f1 + 1, f2))
    return rows

In [7]:
def plotModSpecSimple(specMod, freqs, times, file):
    """Render a spectrogram region as a borderless image and save it.

    Parameters
    ----------
    specMod : 2-D array-like
        Spectrogram values; plotted as ``10 * log10(specMod)``.
        Presumably rows follow ``freqs`` and columns follow ``times`` - confirm.
    freqs, times : array-like
        Coordinates passed to ``pcolormesh`` for the y and x axes.
    file : str
        Destination passed to ``pylab.savefig``.

    Returns
    -------
    The figure that was created; it is left open for the caller to close.
    """
    fig, ax = pylab.subplots(1)
    log_scaled = 10 * pylab.log10(specMod)
    # pylab.pcolormesh targets the current axes, i.e. the one created just above.
    pylab.pcolormesh(times, freqs, log_scaled)
    # Stretch the axes over the whole canvas so the image has no margins.
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
    for option in ('tight', 'off'):
        ax.axis(option)
    pylab.savefig(file)
    return fig

In [8]:
def speciesData(workbook):
    """Load the ROI sheet of an Excel workbook into a per-species dictionary.

    Needed sheet format: one species specimen per row, columns A to F holding
    species name, start_time, end_time, min_freq, max_freq, recording name.
    The first row supplies the attribute names.

    Parameters
    ----------
    workbook : str
        Path of the workbook; it must contain a sheet called 'ROIs'.

    Returns
    -------
    dict
        ``{species name: {column header: [value of each row of that species]}}``.
        Recording names (column F) get their extension replaced by '.wav'.
        Rows without a species name are skipped; an empty sheet yields ``{}``.
    """
    roi_ws = openpyxl.load_workbook(workbook)['ROIs'] # should change accordingly to where and how you data is stored
    dataset = {}
    sheetMatrix = list(roi_ws.iter_rows())
    if not sheetMatrix:
        return dataset
    # remove row with column names and create array of keys per species. (e.g. start_time, end_time, ...)
    keys = [header.value for header in sheetMatrix.pop(0)]
    for row in sheetMatrix:
        speciesName = row[0].value
        if speciesName is None:
            # blank rows would otherwise fail on the recording-name column
            continue
        attributes = dataset.setdefault(speciesName, {})
        for col in range(1, len(row)):
            cell = row[col].value
            # change recording extension since we are dealing with wav files;
            # splitext avoids producing 'name.wav.wav' when an extension is already there
            if col == 5:
                cell = os.path.splitext(cell)[0] + '.wav'
            # create the per-species list on first use, then append in place
            attributes.setdefault(keys[col], []).append(cell)
    return dataset



In [9]:
def dataToYAML(data, name): # convert speciesData dictionary to yaml and save file
    """Write ``data`` as block-style YAML to ``'../dataset/' + name``.

    Parameters
    ----------
    data : object
        Anything ``yaml.dump`` can serialise (here the ``speciesData`` dictionary).
    name : str
        File name created inside ``../dataset/``.

    An existing file of the same name is overwritten: mode ``'w'`` truncates it,
    so no explicit delete is needed.
    """
    path = '../dataset/' + name
    # The context manager closes the file even if dumping raises.
    with open(path, 'w') as dataset:
        yaml.dump(data, dataset, default_flow_style=False)

In [10]:
def findMax(L):
    """Return the largest element of ``L``, or ``float('-inf')`` when ``L`` is empty.

    An element only replaces the running maximum when it compares strictly
    greater, so values that never compare greater (e.g. NaN) are ignored.
    """
    largest = float('-inf')
    for candidate in L:
        largest = candidate if candidate > largest else largest
    return largest

def findMin(L):
    """Return the smallest element of ``L``, or ``float('inf')`` when ``L`` is empty.

    An element only replaces the running minimum when it compares strictly
    smaller, so values that never compare smaller (e.g. NaN) are ignored.
    """
    smallest = float('inf')
    for candidate in L:
        smallest = candidate if candidate < smallest else smallest
    return smallest

In [11]:
#need to decide if exceed bounds of spectrograms or restrict
def simplifiedSpeciesData(data):
    """Condense the per-ROI species data into one summary per species.

    Parameters
    ----------
    data : dict
        Mapping as produced by ``speciesData``: for every species a dictionary
        with the lists 'min_frequency', 'max_frequency', 'start_time',
        'end_time' and 'recording name'.

    Returns
    -------
    dict
        ``{species: {'min_freq', 'max_freq', 'delta_time', 'recording name'}}``
        where 'min_freq' / 'max_freq' span all ROIs of the species and
        'delta_time' is the average ROI duration (0.0 when there are no ROIs).
    """
    simplDat = {}
    for species, rois in data.items():
        # Start and end times belong to different recordings, so "latest end minus
        # earliest start" is meaningless; average the per-ROI durations instead.
        durations = [end - start
                     for start, end in zip(rois['start_time'], rois['end_time'])]
        mean_duration = sum(durations) / len(durations) if durations else 0.0
        simplDat[species] = {
            'min_freq': findMin(rois['min_frequency']),
            'max_freq': findMax(rois['max_frequency']),
            'delta_time': mean_duration,
            'recording name': rois['recording name'],
        }
    return simplDat

In [12]:
import pandas as pd
# Open the original validations/ROIs workbook.
# NOTE(review): at this point `df` is a pd.ExcelFile handle, not a DataFrame;
# a later cell rebinds the same name to the parsed 'ROIs' sheet.
df = pd.ExcelFile('../dataset/validationsAndROIs.xlsx')

In [13]:
# Parse the 'ROIs' sheet into a DataFrame, replacing the ExcelFile handle in `df`.
# NOTE(review): not re-runnable on its own - the ExcelFile must be re-created first.
df = df.parse('ROIs')

In [14]:
# Names of everything in the recordings folder, cut at the first '.'.
all_wav_files = [
    entry[:entry.index('.')]
    for entry in os.listdir("../dataset/wav_recordings")
]

In [15]:
# Plain Python list of the 'recording name' column, one entry per ROI row.
recording_name = df["recording name"].tolist()

In [16]:
# Keep only the part of every recording name before its first '.'.
# str.partition leaves names without a '.' untouched, so the cell can be re-run
# safely (index('.') raised ValueError on already-stripped names).
recording_name = [name.partition('.')[0] for name in recording_name]

In [17]:
# Flag every ROI whose recording is missing from the recordings folder.
for position, name in enumerate(recording_name):
    if name not in all_wav_files:
        recording_name[position] = "delete"

In [18]:
# Write the cleaned names back and drop the rows flagged with "delete".
# NOTE(review): not re-runnable - after the filter `df` has fewer rows than
# `recording_name`, so assigning the list a second time fails.
df['recording name'] = recording_name
df = df[df['recording name'] != 'delete']

In [19]:
# Save the filtered ROI table as the 'ROIs' sheet of a new workbook.
# ExcelWriter.save() no longer exists in pandas 2.x; leaving the `with` block
# writes and closes the file, also when to_excel raises.
with pd.ExcelWriter('../dataset/corrected_validationsAndROIs.xlsx') as writer:
    df.to_excel(writer, sheet_name='ROIs', index=False)

In [20]:
from os import rename, listdir

# Strip one copy of the prefix from files in the current directory whose name
# starts with the prefix twice in a row.
# NOTE(review): looks like a leftover snippet unrelated to the dataset - confirm it is still needed.
badprefix = "cheese_"
fnames = listdir('.')

for fname in fnames:
    if not fname.startswith(badprefix*2):
        continue
    rename(fname, fname.replace(badprefix, '', 1))

In [21]:
# Sanity check: show the first recording stem collected from the folder.
all_wav_files[0]

'Mona_DSG_1014-2013-01-26_11-00'

In [22]:
# Give every recording a '.wav' extension (rename only, file contents are untouched).
wav_dir = "../dataset/wav_recordings"
for files in os.listdir(wav_dir):
    new_name, dot, _ = files.partition('.')
    if not new_name or not dot:
        # hidden files ('.DS_Store') and entries without any extension are left alone;
        # index('.') used to turn the former into '.wav' and crash on the latter
        continue
    target = new_name + '.wav'
    if target != files:
        os.rename(os.path.join(wav_dir, files), os.path.join(wav_dir, target))

In [23]:
# save our species data dictionary as a .yaml file for later use
# (reads the corrected workbook written earlier and writes ../dataset/dataset.yaml)
workbook = '../dataset/corrected_validationsAndROIs.xlsx'
#data = speciesData(df)
data = speciesData(workbook)
dataToYAML(data, 'dataset.yaml')
# simplified per-species summary, currently disabled:
#simpleData = simplifiedSpeciesData(data)
#dataToYAML(simpleData, 'simplifiedDataset.yaml')

In [24]:
# Inspect the loaded dictionary.
# NOTE(review): this prints every ROI of every species - consider showing a sample instead.
data.items()

dict_items([('Basileuterus bivittatus', {'start_time': [40.7895215572, 12.8083224967, 15.8257036283, 3.03018027142, 13.6901070321, 18.1263597598, 52.5765678172, 32.1982545694, 49.0334266425, 22.5654536473, 25.0255228059, 5.92787749053, 5.13437057991513, 15.0212164074], 'end_time': [43.1253410951, 14.6215864759, 18.0535774839, 4.97062993721, 15.4846453936, 20.4734139762, 54.7350510452, 34.1939733246, 50.9501070311, 24.3833360777, 26.9817223777, 7.78527910423, 7.29844413012729, 16.8458274399], 'min_frequency': [1805.81896552, 2318.75, 1900, 2568.75, 2250, 1687.5, 1687.5, 2296.54042243, 2521.37654771, 2810.45156591, 2356.79347826, 2633.79461034, 2585.61544064093, 2553.49599417], 'max_frequency': [5341.42241379, 5131.25, 5412.5, 4793.75, 5081.25, 5350, 5068.75, 5123.05171158, 5026.69337218, 5123.05171158, 5372.69021739, 5042.75309541, 5090.93226511289, 5058.81281865], 'recording name': ['Gb005-2014-09-29_05-10.wav', 'Gb005-2014-10-05_06-40.wav', 'Gb005-2014-10-06_05-20.wav', 'Gb005-2014-10

In [159]:
def getRawSpecDataset(dataset, path='../dataset'):
    """Render one spectrogram image per ROI and collect the figures per species.

    Parameters
    ----------
    dataset : dict
        Per-species ROI data with the lists 'min_frequency', 'max_frequency',
        'start_time', 'end_time' and 'recording name' (see ``speciesData``).
    path : str
        Dataset root. Recordings are read from ``<path>/wav_recordings`` and the
        images are written to ``<path>/spectrogram_roi_dataset/<species>/``.

    Returns
    -------
    list of dict
        One ``{species: [figure, ...]}`` entry per species, in dataset order.

    Notes
    -----
    An existing ``spectrogram_roi_dataset`` folder is deleted and rebuilt.
    """
    # make directory to store our spec dataset, always starting from an empty one
    dataset_path = path + '/spectrogram_roi_dataset'
    if os.path.exists(dataset_path):
        shutil.rmtree(dataset_path)
    os.makedirs(dataset_path)

    # image data to be pickled
    specs = []

    for s in dataset:
        s_dir = dataset_path + '/' + s
        s_spec = []
        os.makedirs(s_dir) # make a directory per species

        # load species ROI data
        roi = dataset[s]
        min_freqs = roi['min_frequency']
        max_freqs = roi['max_frequency']
        starts = roi['start_time']
        ends = roi['end_time']
        recs = roi['recording name']

        for i in range(len(recs)):
            # path to ith recording file where s is present; follows the `path`
            # argument instead of a hard-coded '../dataset'
            rec = path + '/wav_recordings/' + recs[i]
            spectrum, freqs, times = specInfo(rec) # get entire spectrogram data from rec

            # find closest times and freqs that match ROI info
            t_start, t_end = getBounds(times, starts[i], ends[i])
            f_start, f_end = getBounds(freqs, min_freqs[i], max_freqs[i])

            # get modified spectrum, freqs, and times
            spectrumMod = specMod(spectrum, freqs, times, f_start, f_end, t_start, t_end)
            freqMod = freqs[f_start:f_end]
            timeMod = times[t_start:t_end]
            filename = s_dir + '/' + s + '_spec_' + str(i+1) + '.png'

            # plot the spectrogram of ROI and save the image
            f = plotModSpecSimple(spectrumMod, freqMod, timeMod, filename)
            s_spec.append(f) # append image to list of ROI spectrograms per species
            # Actually call close (the bare name `pylab.close` did nothing) so pyplot
            # stops tracking the figure; the object itself stays usable in s_spec.
            pylab.close(f)

        # add dictionary with key <species_name> and value <list_of_spectrogram_figures>
        specs.append({s: s_spec})

    return specs

In [161]:
# Reload the species dictionary saved earlier and build the spectrogram dataset.
# yaml.load() without an explicit Loader is rejected by PyYAML 6; safe_load is
# enough here because the file only holds plain dicts, lists, numbers and strings.
with open('../dataset/dataset.yaml', 'r') as yamlData:
    dataset = yaml.safe_load(yamlData)
data = getRawSpecDataset(dataset)

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/wav_recordings/Gb005-2014-09-29_05-10.wav'