## Code structure:   
annotation file structure:

    onset, class-value , duration 
    20.321666667,3,0.059166667
    20.424,3,0.093
    20.600333333,3,0.107


1 open annotation and corresponding wav file

2 split audio file into segments, 

3 feature extraction for each segment - mel spectrograms for now, 

4 read annotations and create label for each segment 


--> class-value key:

        1- Male inhale
        2- Female inhale
        3- Chick
        4- Male bout
        5- Female bout
        6- Flapping
        7- Male grunt/noise
        8- Female grunt/noise
        9- Unknown grunt/noise

5 create dataset


In [None]:


def process_raw_annotations_file(raw_file_name, annotations_file_path, labels_dict):

    processed_annotations = []
#     processed_annotation_filename =raw_file_name[0:-4]+'_processed.npy'
    
    if os.path.isfile(annotations_file_path + raw_file_name):
        
        with open(annotations_file_path + raw_file_name, 'r') as rawfile:
            reader = csv.reader(rawfile)
            
            for row in reader:

                onset_sec = float(row[0])
                try:
                    label_value = int(row[1])
                except ValueError as e:
                    print("annotation error, will make it as closest label!")
                    label_value = int(np.floor(1.5))
                    
                try:
                    label_str = labels_dict[label_value]
                except Exception as e:
                    continue
                duration_sec = float(row[2])
                offset_sec =  onset_sec + duration_sec

                processed_annotations.append([onset_sec, offset_sec, label_str])
            
#     if save_path:
# #         save the processed annotation:
#         data_array = np.asarray(processed_annotations)
#         np.save(processed_annotation_filename, data_array) 
            
            
    return processed_annotations

def get_label_matrix_per_segment(start_sec, end_sec, sr, timesteps, timesteps_per_second, annotations_list, labels_dict ):
    
    ldict = dict([[v,k] for k,v in labels_dict.items()])
    nb_classes = len(ldict.keys())
    label_matrix = np.zeros((nb_classes, timesteps))
    
    for onset_sec, offset_sec, label in annotations_list:

        # go trhough anotations file and check if any onset or offset is inside start-to-end period
        if (start_sec <= onset_sec and end_sec >= onset_sec) or (start_sec <= offset_sec and end_sec >= offset_sec):
            
            onset_timeframe_index =  np.floor((onset_sec - start_sec )*timesteps_per_second)
            offset_timeframe_index = np.ceil((offset_sec - start_sec )*timesteps_per_second)
            
#             print(onset_timeframe_index)
#             print(onset_sec)
#             print(offset_timeframe_index)
#             print(offset_sec)
            print(label +' '+str(int(ldict[label])-1))
           
            try:
                label_matrix[int(ldict[label])-1][int(round(onset_timeframe_index)):int(round(offset_timeframe_index))] = 1
            except Exception as e:
                pdb.set_trace()
        else:
            continue
    
    return label_matrix


In [None]:
audio_path = "/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/data/wavs/"
save_path = "/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/data/data_processed/"
annotations_path = "/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/data/Annotations/"

labels_dict = {1:'male_in', 2:'female_in', 3:'chick', 4:'male_bout', 5:'female_bout', 6:'flapping', 7: 'male_grunt', 8:'female_grunt', 9:'unk_grunt' }


segment_size = 6
slide = 6
mels = 64
sample_rate = 22_050
timesteps = 259 # number of frames in a segment! =((SR* segment_size)/ hop_length)
timesteps_per_second = timesteps / segment_size


In [None]:
import pdb

dataset = [] # [dt_id, spectrogram, labelmatrix]

for audio_file in tqdm(os.listdir(audio_path), desc='load_audio'):
    print('load audio')
    dataset_per_file = []
    y, sr = librosa.load(audio_path+audio_file, sr=sample_rate)
    length = int(len(y) / sr) #seconds
    remainder = length % segment_size
    #     import pdb; pdb.set_trace()

    audio_id = audio_file[:-4]

    raw_annotation_file = audio_id  + '_a.csv'

    processed_annotations = process_raw_annotations_file(raw_annotation_file, annotations_path, labels_dict)    

    for t in tqdm(range(0, length - remainder - segment_size, slide),
                  desc='create_spectros'):
        start = t
    #         print('start', str(start))
        stop = t + segment_size

        print('start in frames', str(sr*start))
        print('stop in frames: ', str((sr*stop)))

        current_y = y[sr*start:(sr*stop)]
        # create spectrogram
        spectro = librosa.feature.melspectrogram(y=current_y, sr=sr, n_mels=mels,
                                                 fmax=sr/2)

        dt_id = audio_id + '_' + str(start) + 's_to' + str(stop) + 's'
        print(dt_id)

        label_matrix = get_label_matrix_per_segment(start, stop, sr, timesteps, timesteps_per_second, processed_annotations, labels_dict )
        dataset_per_file.append([dt_id, spectro, label_matrix])
        dataset.append([dt_id, spectro, label_matrix])
        
    
    #save data processed for this file
    d_array = np.asarray(dataset_per_file)
    np.save(save_path+audio_id, d_array)

data_array = np.asarray(dataset)
np.save(save_path+'data_25_03', data_array)


In [None]:
int(np.floor(1.5))

In [1]:
import pandas as pd
import numpy as np
import os
import librosa
from tqdm.notebook import tqdm
import csv


In [2]:
dataset = []
for file in os.listdir('/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/data/data_processed/'):
#     print(file)
    data = np.load('/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/data/data_processed/' + file, allow_pickle=True)
#     print(data.shape)
    dataset.extend(data)
        

In [3]:
dataset_array = np.asarray(dataset)

In [4]:
dataset_array.shape

(38049, 3)

In [None]:
np.save('/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/data/data_processed/dataset_25_03.npy', dataset_array)