## Code structure:   
I want to adapt the dataset and create sub-datasets for different tasks.
    This comprises: joining labels together,
    filtering out some classes,
    or selecting only some of them.
    
    
1 - load dataset:
    structure is:
        segmentID, spectrogram, label matrix (9 x 259); each row of the label matrix corresponds to one of the following classes:
                    
        1- Male inhale
        2- Female inhale
        3- Chick
        4- Male bout
        5- Female bout
        6- Flapping
        7- Male grunt/noise
        8- Female grunt/noise
        9- Unknown grunt/noise

2 - define manipulation to do
    
    
    


In [1]:
import pandas as pd
import numpy as np
import os
import librosa
from tqdm.notebook import tqdm
import csv
import json

In [2]:
def remove_label(dataset,labels_dict, new_labels_dict  ):
    """Keep only the classes named in ``new_labels_dict`` in every label matrix.

    Every example in ``dataset`` is a ``(segment_id, spectrogram, label_matrix)``
    triple. For each one a new label matrix is built whose row ``i`` is the
    row of the old matrix that carries the label ``new_labels_dict[i]``.
    Segment ids and spectrograms are passed through untouched.

    Args:
        dataset: iterable of 3-item examples.
        labels_dict: ``{row_index: label}`` describing the incoming matrices.
        new_labels_dict: ``{row_index: label}`` describing the outgoing
            matrices; every label must also be present in ``labels_dict``.

    Returns:
        ``(new_dataset, new_labels_dict)`` where ``new_dataset`` is an object
        array of shape ``(n_examples, 3)`` and ``new_labels_dict`` is the
        argument, returned unchanged.

    Raises:
        KeyError: if a label of ``new_labels_dict`` is not in ``labels_dict``.
    """
    new_index_by_label = {label: index for index, label in new_labels_dict.items()}
    old_index_by_label = {label: index for index, label in labels_dict.items()}
    n_new_rows = len(new_labels_dict)

    examples = []
    for segm_id, spectro, label_matrix in dataset:
        label_matrix = np.asarray(label_matrix)
        # Take the number of time frames from the data instead of assuming 259.
        new_label_matrix = np.zeros((n_new_rows, label_matrix.shape[1]))
        for label, new_index in new_index_by_label.items():
            new_label_matrix[new_index, :] = label_matrix[old_index_by_label[label]]
        examples.append((segm_id, spectro, new_label_matrix))

    # Fill an object array cell by cell: np.asarray() on rows that mix scalars
    # and differently shaped arrays raises on recent NumPy versions.
    new_dataset = np.empty((len(examples), 3), dtype=object)
    for row, example in enumerate(examples):
        for column, item in enumerate(example):
            new_dataset[row, column] = item

    return new_dataset, new_labels_dict

def join_labels(labels_2_join, dataset, labels_dict, new_labels_dict, new_class_label):
    """Add a row to every label matrix that is the union of several existing rows.

    Every example in ``dataset`` is a ``(segment_id, spectrogram, label_matrix)``
    triple. The existing rows are copied to the same positions of a new
    matrix with ``len(new_labels_dict)`` rows, and the row belonging to
    ``new_class_label`` is set to 1 wherever the sum of the joined rows is
    non-zero and to 0 elsewhere.

    Args:
        labels_2_join: labels (values of ``labels_dict``) whose rows are merged.
        dataset: iterable of 3-item examples.
        labels_dict: ``{row_index: label}`` describing the incoming matrices.
        new_labels_dict: ``{row_index: label}`` describing the outgoing
            matrices; must contain ``new_class_label``.
        new_class_label: label of the row that receives the merged activity.

    Returns:
        ``(new_dataset, new_labels_dict)`` where ``new_dataset`` is an object
        array of shape ``(n_examples, 3)`` and ``new_labels_dict`` is the
        argument, returned unchanged.

    Raises:
        KeyError: if a label to join is not in ``labels_dict`` or
            ``new_class_label`` is not in ``new_labels_dict``.
    """
    new_index_by_label = {label: index for index, label in new_labels_dict.items()}
    old_index_by_label = {label: index for index, label in labels_dict.items()}

    # Row positions of the labels to merge, expressed in the incoming layout.
    rows_to_join = [old_index_by_label[label] for label in labels_2_join]
    joined_row = int(new_index_by_label[new_class_label])
    n_new_rows = len(new_labels_dict)

    examples = []
    for segm_id, spectro, label_matrix in dataset:
        label_matrix = np.asarray(label_matrix)
        # Take the number of time frames from the data instead of assuming 259.
        new_label_matrix = np.zeros((n_new_rows, label_matrix.shape[1]))
        new_label_matrix[0:label_matrix.shape[0], :] = label_matrix

        # Binarise the summed rows. Comparing with zero also works for integer
        # label matrices, where an in-place true division cannot be stored.
        summed_rows = np.sum(label_matrix[rows_to_join, :], 0)
        new_label_matrix[joined_row, :] = summed_rows != 0

        examples.append((segm_id, spectro, new_label_matrix))

    # Fill an object array cell by cell: np.asarray() on rows that mix scalars
    # and differently shaped arrays raises on recent NumPy versions.
    new_dataset = np.empty((len(examples), 3), dtype=object)
    for row, example in enumerate(examples):
        for column, item in enumerate(example):
            new_dataset[row, column] = item

    # TODO: the rows that were joined are still present; remove them with
    # remove_label() if they are no longer wanted.
    return new_dataset, new_labels_dict


def remove_examples_based_labels(dataset, neg_classes_indexes, labels_dict, new_labels_dict, mode='keepPosOnly' ):
    """Filter whole examples out of ``dataset`` according to their label matrix.

    Planned modes:

    * ``'keepPosOnly'``: keep only examples that have ones in the given rows
      (positive labels); everything else, including all-zero examples, is
      removed. NOT IMPLEMENTED YET.
    * ``'removeNeg'``: remove every example that has a 1 in any of the rows
      listed in ``neg_classes_indexes`` (this leaves positives + silences),
      then reduce the label matrices to ``new_labels_dict`` with
      ``remove_label``.
    * ``'keepSilencesOnly'``: keep examples whose whole matrix is zero (the
      distinction between negatives and silences depends on the dataset
      given). NOT IMPLEMENTED YET.

    Args:
        dataset: array of ``(segment_id, spectrogram, label_matrix)`` examples.
        neg_classes_indexes: row indexes (in the ``labels_dict`` layout) of
            the classes whose presence disqualifies an example.
        labels_dict: ``{row_index: label}`` describing the incoming matrices.
        new_labels_dict: ``{row_index: label}`` describing the outgoing matrices.
        mode: one of the mode names above.

    Returns:
        ``(new_dataset, new_labels_dict)``.

    Raises:
        NotImplementedError: for the documented modes that are not coded yet
            (previously these failed with an UnboundLocalError).
        ValueError: for an unknown mode name.
    """
    if mode == 'removeNeg':
        examples_2_remove = [
            i for i, example in enumerate(dataset)
            if np.any(np.asarray(example[2])[neg_classes_indexes, :] == 1)
        ]
        new_dataset = np.delete(dataset, examples_2_remove, 0)
        new_dataset, new_labels_dict = remove_label(new_dataset, labels_dict, new_labels_dict)
        return np.asarray(new_dataset), new_labels_dict

    if mode in ('keepPosOnly', 'keepSilencesOnly'):
        raise NotImplementedError(
            "mode %r is documented but not implemented yet; use mode='removeNeg'" % (mode,)
        )

    raise ValueError(
        "unknown mode %r; expected 'keepPosOnly', 'removeNeg' or 'keepSilencesOnly'" % (mode,)
    )
    



In [3]:
# Alternative (local) location of the processed data:
# data_processed = '/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/data/data_processed/'
data_processed = '/import/c4dm-datasets/manxShearwaters/'

# Row index of the label matrix -> class name.
labels_dict = {
    0: 'male_in',
    1: 'female_in',
    2: 'chick',
    3: 'male_bout',
    4: 'female_bout',
    5: 'flapping',
    6: 'male_grunt',
    7: 'female_grunt',
    8: 'unk_grunt',
}

In [4]:
# Load the full dataset from the processed-data folder.
# NOTE(review): allow_pickle=True unpickles arbitrary objects on load; only
# use it with files from a trusted source.
dataset = np.load(data_processed+'dataset_25_03.npy', allow_pickle=True)

In [8]:
# Adult_vs_chick_dataset:

# lets create dataset with the following 3 classes: Adult_bouts, chicks and adult_grunts

#0- get silences dataset
#1 - join female_bouts and male bouts into adult_bouts_class
# 2 - join the 3 grunt classes into adult_grunts (fem_grunt, male_grunt, unk_grunt)
# 3 - remove all other classes

# silences_dataset = select_examples_based_labels(adult_chick_dataset_joined)

# Step 1: add an 'adult_bout' row merging the male and female bout rows.
new_labels_dict_1 = dict(enumerate([
    'male_in', 'female_in', 'chick', 'male_bout', 'female_bout',
    'flapping', 'male_grunt', 'female_grunt', 'unk_grunt', 'adult_bout',
]))
new_class_label = 'adult_bout'
dataset, new_labels_dict_1 = join_labels(
    ['male_bout', 'female_bout'],
    dataset,
    labels_dict,
    new_labels_dict_1,
    new_class_label,
)


# Step 2: add an 'adult_grunt' row merging the three grunt rows.
new_labels_dict_2 = dict(enumerate([
    'male_in', 'female_in', 'chick', 'male_bout', 'female_bout',
    'flapping', 'male_grunt', 'female_grunt', 'unk_grunt', 'adult_bout',
    'adult_grunt',
]))
new_class_label = 'adult_grunt'
dataset, new_labels_dict_2 = join_labels(
    ['male_grunt', 'female_grunt', 'unk_grunt'],
    dataset,
    new_labels_dict_1,
    new_labels_dict_2,
    new_class_label,
)


# Step 3: keep only the three target rows, in their new order.
new_labels_dict_3 = dict(enumerate(['adult_grunt', 'adult_bout', 'chick']))
dataset, new_labels_dict_3 = remove_label(dataset, new_labels_dict_2, new_labels_dict_3)


# This data still contains unmarked negatives and silences!

# If we also want examples of silences, we need to remove the negatives before step 3!


# adult_chick_dataset_1, new_labels_dict = join_labels([0,1], dataset, labels_dict)
# # this will have examples of adults chicks, plus all other classes (negatives) and silences

# adult_chick_dataset, new_labels_dict = remove_labels([2,3,4,5,6,7,8], adult_chick_dataset_joined, new_labels_dict)
# #this will have the same examples as before but only adults and cchicks are explicitly labeled . for binary classification!

# adult_chick_dataset_only_positive = select_examples_based_labels(adult_chick_dataset, [0,1])
# #this removes all examples of non chicks or non adults plus silences


# adult_chick_dataset_negs_all_only = select_examples_based_labels[adult_chick_dataset]
# # this removes all the examples of adults and chicks and leaves silences and negative classes examples
# # ??

# adult_chick_dataset_silences_only = select_examples_based_labels(adult_chick_dataset_joined)
# # this will only contain silences! (no negatives nor positives!)


In [None]:
# verify:

# print(new_labels_dict_3)
# print(adult_chick_dataset_3.shape)
# print(adult_chick_dataset_3[0,-1].shape)


# print(new_labels_dict_2)
# print(adult_chick_dataset_2.shape)
# print(adult_chick_dataset_2[0,-1].shape)

# NOTE(review): this cell used to read `adult_chick_dataset_1`, a name that is
# never assigned anywhere in the notebook (NameError). The pipeline stores its
# result in `dataset`, so that is what is inspected here - confirm this is the
# array that was meant.
processed_dataset = dataset

print(new_labels_dict_3)
print(processed_dataset.shape)
print(processed_dataset[0,-1].shape)

print(labels_dict)
print(dataset.shape)
print(dataset[0,-1].shape)

# Per-frame sum over the rows of each label matrix, one line per example.
for n in range(processed_dataset.shape[0]):
    print(sum(processed_dataset[n,-1]))


In [9]:
# Previous (local) save location:
# np.save('/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/adult_vs_chicks/dataset_adult_bouts_grunts_chicks.npy', dataset)
# Save the processed dataset in the processed-data folder.
np.save(data_processed+ 'dataset_adult_bouts_grunts_chicks.npy', dataset)

# Save the row-index -> label mapping alongside it.
# NOTE: JSON object keys are strings, so the integer keys are written as
# "0", "1", ... and come back as strings when the file is loaded.
with open(data_processed +'labels_key.json', 'w') as oputfile:
    json.dump(new_labels_dict_3, oputfile) 

In [5]:
# Dataset without negatives!
# Adults vs chicks, without examples of negative classes. In this case only
# flapping makes sense to remove, since the other classes are contained in
# the classes we want to consider.


# Step 1: add an 'adult_bout' row merging the male and female bout rows.
new_labels_dict_1 = {
    0: 'male_in',
    1: 'female_in',
    2: 'chick',
    3: 'male_bout',
    4: 'female_bout',
    5: 'flapping',
    6: 'male_grunt',
    7: 'female_grunt',
    8: 'unk_grunt',
    9: 'adult_bout',
}
new_class_label = 'adult_bout'
dataset, new_labels_dict_1 = join_labels(
    ['male_bout', 'female_bout'], dataset, labels_dict, new_labels_dict_1, new_class_label
)


# Step 2: add an 'adult_grunt' row merging the three grunt rows.
new_labels_dict_2 = {
    0: 'male_in',
    1: 'female_in',
    2: 'chick',
    3: 'male_bout',
    4: 'female_bout',
    5: 'flapping',
    6: 'male_grunt',
    7: 'female_grunt',
    8: 'unk_grunt',
    9: 'adult_bout',
    10: 'adult_grunt',
}
new_class_label = 'adult_grunt'
dataset, new_labels_dict_2 = join_labels(
    ['male_grunt', 'female_grunt', 'unk_grunt'], dataset, new_labels_dict_1, new_labels_dict_2, new_class_label
)


# Step 2.5: drop the examples that contain flapping (row 5), then reduce the
# label matrices to the three target classes.
new_labels_dict_3 = {
    0: 'adult_grunt',
    1: 'adult_bout',
    2: 'chick',
}
new_dataset, labels = remove_examples_based_labels(
    dataset, [5], new_labels_dict_2, new_labels_dict_3, mode='removeNeg'
)

# Step 3 is not needed here, remove_examples_based_labels already calls remove_label:
# new_labels_dict_3 = {0:'adult_grunt', 1:'adult_bout', 2:'chick', }
# dataset, new_labels_dict_3= remove_label( dataset,new_labels_dict_2, new_labels_dict_3)

In [6]:
# Compare the dataset before and after removing the negative examples, and
# show the label mapping returned with the filtered dataset.
print(dataset.shape)
print(new_dataset.shape)
print(labels)

(38049, 3)
(37935, 3)
{0: 'adult_grunt', 1: 'adult_bout', 2: 'chick'}


In [7]:
# Previous (local) save call, kept for reference:
# np.save('/home/ines/Dropbox/QMUL/PHD/manx_shearwaters/adult_vs_chicks/dataset_adult_bouts_grunts_chicks.npy', dataset)
# Save the filtered dataset (new_dataset) in the processed-data folder.
np.save(data_processed+ 'dataset_adult_bouts_grunts_chicks_no_negatives.npy', new_dataset)

# The label key is not written again here:
# with open(data_processed +'labels_key.json', 'w') as oputfile:
#     json.dump(new_labels_dict_3, oputfile) 

### TESTS:

In [None]:

# Take the first 10 examples for quick experiments.
# A plain slice is a view whose label matrices are the very same objects as
# the ones held by `dataset`, so writing into them would silently change the
# real data. Copy the rows and every label matrix before experimenting.
dataset_test = dataset[0:10, :].copy()
for test_row in range(dataset_test.shape[0]):
    dataset_test[test_row, -1] = np.copy(dataset_test[test_row, -1])

In [None]:
# Shape of the last column (the label matrices) of the test subset.
dataset_test[:,-1].shape

In [None]:
# Write artificial activity into rows 1 and 4 of every label matrix of the
# test subset, so that the join/remove functions have something to work on.
for test_label_matrix in dataset_test[:, -1]:
    test_label_matrix[1, 0:100] = 1
    test_label_matrix[4, 50:200] = 1


In [None]:
# Show the label matrix of the first test example.
dataset_test[0,-1]

In [None]:
# Test functions!
# Work on a copy of the mapping: a plain assignment would make both names
# point at the same dict, and adding key 9 would then also change
# labels_dict for every other cell.
new_labels_dict = dict(labels_dict)
new_labels_dict[9] = 'testing!'
new_class_label =  'testing!'
test_dataset, labels_dict_result = join_labels(['female_in', 'female_bout'], dataset_test, labels_dict, new_labels_dict, new_class_label)


# Keep three rows, in a new order, to exercise remove_label.
new_labels_dict2 = {0:'testing!', 1:'male_bout', 2: 'female_in'}
test_dataset_removed, last_labels_dict = remove_label(test_dataset,new_labels_dict, new_labels_dict2  )

In [None]:
# Inspect the output of join_labels on the test subset.
print(test_dataset.shape)
print(test_dataset[0,-1].shape)
print(dataset_test[0,-1].shape)
print(labels_dict_result)
# print(dataset_test[0,-1][0])
# print(test_dataset[0,-1][0])

# Row 4 of the first joined label matrix.
# print(dataset_test[0,-1][4])
print(test_dataset[0,-1][4])

# Row 1 of the first joined label matrix.
# print(dataset_test[0,-1][1])
print(test_dataset[0,-1][1])

# The last row of the joined matrix.
print('resu;lt joining rows 1 and 4:')
print(test_dataset[0,-1][-1])

In [None]:
# Inspect the output of remove_label on the test subset and compare the
# label-matrix shapes before joining, after joining and after removing.
print(test_dataset_removed.shape)
print(test_dataset_removed[0,-1].shape)
print(dataset_test[0,-1].shape)
print(test_dataset[0,-1].shape)
print(last_labels_dict)
# print(dataset_test[0,-1][0])
# print(test_dataset[0,-1][0])

# Full reduced label matrix of the first example.
# print(dataset_test[0,-1][4])
print(test_dataset_removed[0,-1])

In [10]:

# Step 1: add an 'adult_bout' row merging the male and female bout rows.
new_labels_dict_1 = dict(enumerate([
    'male_in', 'female_in', 'chick', 'male_bout', 'female_bout',
    'flapping', 'male_grunt', 'female_grunt', 'unk_grunt', 'adult_bout',
]))
new_class_label = 'adult_bout'
dataset, new_labels_dict_1 = join_labels(
    ['male_bout', 'female_bout'], dataset, labels_dict, new_labels_dict_1, new_class_label
)


# Step 2: add an 'adult_grunt' row merging the three grunt rows.
new_labels_dict_2 = dict(enumerate([
    'male_in', 'female_in', 'chick', 'male_bout', 'female_bout',
    'flapping', 'male_grunt', 'female_grunt', 'unk_grunt', 'adult_bout',
    'adult_grunt',
]))
new_class_label = 'adult_grunt'
dataset, new_labels_dict_2 = join_labels(
    ['male_grunt', 'female_grunt', 'unk_grunt'], dataset, new_labels_dict_1, new_labels_dict_2, new_class_label
)



In [25]:

new_labels_dict_3 = dict(enumerate(['adult_grunt', 'adult_bout', 'chick']))

# Rows whose presence disqualifies an example.
# These should be selected automatically. The issue is that the classes
# overlap: if we removed examples of female grunt we would be removing
# examples of adult grunt, which are the same!
# I.e. the negative classes are NOT the same as the difference between the
# current labels dict and the new labels dict!
neg_classes_indexes = [0, 1, 5]

# Positions of all examples with a 1 in any of the negative rows.
examples_2_remove = [
    position
    for position, example in enumerate(dataset)
    if 1 in example[2][neg_classes_indexes, :]
]

print(len(examples_2_remove))

# Drop those examples, then reduce the label matrices to the target classes.
new_dataset = np.delete(dataset, examples_2_remove, 0)
print(np.asarray(new_dataset).shape)
new_dataset, test_labels = remove_label(new_dataset, new_labels_dict_2, new_labels_dict_3)
print(new_dataset.shape)


print(dataset.shape)
# testing_data = remove_examples_based_labels(dataset, positive_labels, labels_dict, new_labels_dict, mode='keepPosOnly' )

922
(37127, 3)
(37127, 3)
(38049, 3)


In [13]:
# Shape of new_dataset once converted to an array.
np.asarray(new_dataset).shape


(2,)

In [14]:
# Shape of the current `dataset` array.
print(dataset.shape)

(38049, 3)
