In [1]:
import numpy as np
import librosa
import os
from tqdm.notebook import tqdm
import csv
import json

In [2]:
def PCEN(dataset):
    """Apply PCEN to spectrograms in dataset.

    The input array is not modified: the work is done on a copy, so calling
    this function again on the same input never applies PCEN twice.

    # Arguments
        dataset: 2-D numpy array in format (id, spectro, label), one row
            per clip.

    # Returns
        new array in the same format whose spectrograms have had PCEN applied.
    """
    # For an object array this copy is shallow (cells are references), which is
    # enough: only the spectrogram cells of the copy are re-pointed below and
    # the original spectrogram arrays themselves are never written to.
    processed = dataset.copy()
    for row in range(len(processed)):
        # The 2**31 scaling mirrors the usage shown in librosa's PCEN docs.
        # NOTE(review): presumably the spectrograms come from floating-point
        # audio, which is the case that scaling is meant for - confirm.
        processed[row, 1] = librosa.pcen(processed[row, 1] * (2**31))
    return processed

In [3]:
# Root directory of the processed adult-vs-chick data. It must end with a slash
# because file names are appended to it by plain string concatenation.
# data_processed = '/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/adult_vs_chicks/'
data_processed = '/import/c4dm-datasets/manxShearwaters/adult_vs_chick/'

# raw_annotations_path ='/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/data/Annotations/'
# The label key lives next to the processed data; derive its path from
# data_processed so that changing the data root also changes where the key is read.
with open(data_processed + 'labels_key.json', "r", encoding="utf-8") as read_file:
    labels_dict = json.load(read_file)
# labels_dict = {1:'male_in', 2:'female_in', 3:'chick', 4:'male_bout', 5:'female_bout', 6:'flapping', 7: 'male_grunt', 8:'female_grunt', 9:'unk_grunt' }



In [4]:
# dataset = np.load(data_processed+'dataset_adult_bouts_grunts_chicks.npy', allow_pickle=True)
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# files that come from a trusted source.
dataset_file = data_processed + 'dataset_adult_bouts_grunts_chicks_no_negatives.npy'
dataset = np.load(dataset_file, allow_pickle=True)

In [5]:
# Quick look at the size of the loaded array (rows, columns).
dataset.shape

(37935, 3)

## Remove silences:

In [10]:
# Remove silences: keep only the clips whose label (third column) contains a 1.
# A boolean mask is used instead of rebuilding the array from a list of rows so
# that the result keeps the (n, 3) layout even when no clip matches.
has_call = np.array([1 in e[2] for e in dataset], dtype=bool)
dataset_positivesonly = dataset[has_call]
dataset_positivesonly.shape

(3385, 3)

## Create train, test and val sets completely randomly:

In [12]:
# Random split of the positive-only clips into test / val / train.
#
# The index ranges are drawn over, and applied to, the same array
# (`dataset_positivesonly`). Shuffling one array while slicing another would
# give an unshuffled split that still contains the silent clips.
# NOTE(review): confirm that the positives-only array (not the full `dataset`)
# is the one meant to be split here.
# NOTE(review): no random seed is set, so the split differs on every run.
TEST_FRACTION = 0.05
VAL_FRACTION = 0.2

n_clips = len(dataset_positivesonly)
n_test = int(n_clips * TEST_FRACTION)
n_val = int(n_clips * VAL_FRACTION)

# Permute the indices instead of shuffling the array in place, so re-running
# this cell does not depend on (or alter) the order left by a previous run.
indexes = np.random.permutation(n_clips)
test_indexes = indexes[:n_test]
val_indexes = indexes[n_test:n_test + n_val]
train_indexes = indexes[n_test + n_val:]

train = dataset_positivesonly[train_indexes]
test = dataset_positivesonly[test_indexes]
val = dataset_positivesonly[val_indexes]

## Or create train, test and val sets by burrow:

In [26]:
# 1. List the burrows: the burrow id is the part of the clip id (first column)
#    that precedes the first underscore.
# 2. Randomly select burrows for train + val; the remaining burrows are held
#    out for testing (10 burrows: 7 train + val, 3 test).
N_TRAIN_VAL_BURROWS = 7

# sorted() gives the list a stable order. Iteration order of a set of strings
# can change between Python processes, which would make the random draw below
# impossible to reproduce even with a fixed seed.
burrows_list = sorted({e[0].split('_')[0] for e in dataset})
print(burrows_list)
# train_proportion = len(burrows_list)/2
# NOTE(review): `random` is not used in the visible cells; kept in case later
# cells rely on it. Consider moving it to the imports cell.
import random
train_burrows = np.random.choice(burrows_list, size=N_TRAIN_VAL_BURROWS, replace=False)
print(train_burrows)
test_burrows = np.setdiff1d(burrows_list, train_burrows)
print(test_burrows)



['3', '206', '49', 'DB14', '18', '56', '208', '23', '95', '135']
['3' '208' '18' '135' '95' '23' '206']
['49' '56' 'DB14']


In [49]:
# Route every clip to the train+val pool or to the test set according to the
# burrow id encoded at the start of its clip id (text before the first '_').
train_burrow_ids = {str(burrow) for burrow in train_burrows}
train_val_set, test_set = [], []
for clip in dataset:
    burrow_id = clip[0].split('_')[0]
    destination = train_val_set if burrow_id in train_burrow_ids else test_set
    destination.append(clip)

train_val_set = np.asarray(train_val_set)
test = np.asarray(test_set)

In [50]:
# Report the sizes of the train+val pool and of the held-out test set.
print(train_val_set.shape)
print(test.shape)

(25871, 3)
(12064, 3)


In [51]:
# Draw, without replacement, the row positions of 20% of the train+val pool;
# these rows become the validation set.
pool_size = len(train_val_set)
val_size = round(0.2 * pool_size)
val_set_indexes = np.random.choice(range(pool_size), size=val_size, replace=False)
print(val_set_indexes)
print(val_size)
print(pool_size)

[ 1834 12765 22858 ... 11909 20837 12565]
5174
25871


In [52]:

# Validation rows are the sampled positions; training rows are every other
# position of the pool, in ascending order.
val = train_val_set[val_set_indexes]

is_train_row = np.ones(len(train_val_set), dtype=bool)
is_train_row[val_set_indexes] = False
train_indexes = np.arange(len(train_val_set))[is_train_row]
train = train_val_set[train_indexes]

for subset in (val, train):
    print(subset.shape)

(5174, 3)
(20697, 3)


## save train, val, test sets


In [54]:
# Write each raw (pre-PCEN) split to its own .npy file in the processed-data directory.
for file_name, split in (('test.npy', test), ('val.npy', val), ('train.npy', train)):
    np.save(data_processed + file_name, split)

In [55]:
# Sizes of the three splits, in the order test, val, train.
for split in (test, val, train):
    print(split.shape)

(12064, 3)
(5174, 3)
(20697, 3)


In [56]:
# apply PCEN
# Copies are passed so that the raw `test`, `val` and `train` arrays are never
# modified by the call, and re-running this cell cannot apply PCEN twice to
# the same data.
test_PCEN = PCEN(test.copy())
val_PCEN = PCEN(val.copy())
train_PCEN = PCEN(train.copy())

In [57]:
# save PCEN train, val, test sets
# The names carry no extension here; np.save appends '.npy' when it is missing.
for base_name, split_PCEN in (('test_PCEN', test_PCEN), ('val_PCEN', val_PCEN), ('train_PCEN', train_PCEN)):
    np.save(data_processed + base_name, split_PCEN)