In [17]:
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as data
import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
import abc
import yaml
import h5py
import librosa
import os
import hydra
from hydra import compose, initialize
from glob import glob
from itertools import chain

In [2]:
'''
How to make the framework flexible enough that one can point to which samples in a batch are meant to be
support/query per class? The implementation in DCASE2021 does not handle this.


Currently return the pcen transposed. Where to transpose it back?
Batcher? Most important thing is just to not forget i think.

The code only allows one positive class per segment for now I think.
This might be something we would like to fix?
'''

'\nHow to make the framework flexible enough that one can point to which samples in a batch are meant to be\nsupport/query per class? The implementation in DCASE2021 does not handle this.\n'

## Prototypical net

In [3]:
#DCASE2021

def conv_block(in_channels,out_channels):

    return nn.Sequential(
        nn.Conv2d(in_channels,out_channels,3,padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(),
        nn.MaxPool2d(2)
    )

In [4]:
#DCASE2021

#TODO introduce parametrization of conv blocks?
class Protonet(nn.Module):
    def __init__(self, raw_transformer=None):
        super(Protonet,self).__init__()
        self.raw_transformer = raw_transformer
        self.encoder = nn.Sequential(
            conv_block(1,128),
            conv_block(128,128),
            conv_block(128,128),
            conv_block(128,128)
        )
    def forward(self,x):
        #Is there risk for this to be super slow?
        #A naive approach might transform the same data more than once?
        #Lookup tables?
        if self.raw_transformer is not None:
            x = self.raw_transformer.rtoi_standard(x)
        (num_samples,seq_len,mel_bins) = x.shape
        x = x.view(-1,1,seq_len,mel_bins)
        x = self.encoder(x)
        return x.view(x.size(0),-1)

In [5]:
'''
Will most likely lean heavily on the implementation of the DCASE2021 task 5 baseline implementation.

'''
def prototypical_loss(input, target, n_support, supp_idxs=None):
    
    target_cpu = target.to('cpu')
    input_cpu = input.to('cpu')
    classes = torch.unique(target_cpu)
    n_classes = len(classes)
    n_query = target.eq(classes[0].item()).sum().item() - n_support
    if supp_idxs is None:
        #Rewrite, need to select only n_support. We might have n_query > n_support
        supp_idxs = torch.stack(list(map(lambda c: target_cpu.eq(c).nonzero()[:n_support], classes))).squeeze(1)
        q_idxs = torch.stack(list(map(lambda c: target_cpu.eq(c).nonzero()[n_support:], classes))).view(-1)
    else:
        #Work from supp_idxs
        q_idxs = None
        
    prototypes = torch.stack([input_cpu[idx_list].mean(0) for idx_list in supp_idxs])
    query_samples = input_cpu[q_idxs]
    dists = euclidean_dist(query_samples, prototypes)
    
    #Check
    log_p_y = F.log_softmax(-dists, dim=1).view(n_classes, n_query, -1)
    target_inds = torch.arange(0, n_classes)
    target_inds = target_inds.view(n_classes, 1, 1)
    target_inds = target_inds.expand(n_classes, n_query, 1).long()
    #.mean() -> 1/NcNq
    loss_val = -log_p_y.gather(2, target_inds).squeeze().view(-1).mean()
    _, y_hat = log_p_y.max(2)
    acc_val = y_hat.eq(target_inds.squeeze()).float().mean()
    return loss_val, acc_val
    

## Data processing

In [6]:
'''
    * Design choice: Handle most of pre-processing as part of the model (torchlibrosa)?
      May ultimately lead to simpler augmentation etc down the line. Work with raw audio as far as possible?
      
    * Make use of h5py library for storing training, validation and test sets?
      Still raw audio sets?
    
    * Incorporate pytorch Dataloader, seems prudent and a good design choice.
      read(h5py) file + Episodic sampler -> Dataloader?
      
    * Slight change of mind. Datagen and FeatureExtractor is not really worth spending time on for now.
      Sure they could be interfaces for a framework up the road but can do without for now since the loop
      will most likely be quite task dependent for now.
      
'''

'\n    * Design choice: Handle most of pre-processing as part of the model (torchlibrosa)?\n      May ultimately lead to simpler augmentation etc down the line. Work with raw audio as far as possible?\n      \n    * Make use of h5py library for storing training, validation and test sets?\n      Still raw audio sets?\n    \n    * Incorporate pytorch Dataloader, seems prudent and a good design choice.\n      read(h5py) file + Episodic sampler -> Dataloader?\n      \n    * Slight change of mind. Datagen and FeatureExtractor is not really worth spending time on for now.\n      Sure they could be interfaces for a framework up the road but can do without for now since the loop\n      will most likely be quite task dependent for now.\n      \n'

In [26]:
'''
Possibly take a h5 file as input and return X_train, Y_train, X_val, Y_val
Is this an approach that we like? Is it commonly used for deep learning?
'''

#Heavily DCASE inspired. I think what they did is ok though.
class Datagen():
    
    def __init__(self, config):
        
        self.config = config
        
        if config.features.raw:
            #These obviosly requires more processing down the pipe but that is application dependent.
            hf = h5py.File(os.path.join(config.path.train_w, 'raw_train.h5'))
        else:
            hf = h5py.File(os.path.join(config.path.train_w, 'mel_train.h5'))
            self.x = hf['features'][:]
            self.labels = [s.decode() for s in hf['labels'][:]]
            if config.datagen.ltoi:
                self.y = class_to_int(self.labels)
            else:
                self.y = self.labels
            if config.datagen.balance:
                self.x, self.y = balance_class_distribtuion(self.x, self.y)
            
            array_train = np.arange(len(self.x))
            if config.datagen.stratify:
                _,_,_,_,train_array,valid_array = train_test_split(self.x, self.y, array_train, \
                                                    random_state=config.datagen.random_state, stratify=self.y)
            else:
                _,_,_,_,train_array,valid_array = train_test_split(self.x, self.y, array_train, \
                                                    random_state=config.datagen.random_state)
                
            self.train_index = train_array
            self.valid_index = valid_array
            if config.datagen.normalize:
                self.mean, self.std = norm_params(self.x[train_array])
            else:
                self.mean = None
                self.std = None
                
    def feature_scale(self, x):
        return (x - self.mean)/self.std
    
    def generate_train(self):
        train_array = sorted(self.train_index)
        valid_array = sorted(self.valid_index)
        X_train = self.x[train_array]
        Y_train = self.y[train_array]
        X_val = self.x[valid_array]
        Y_val = self.y[valid_array]
        if self.config.datagen.normalize:
            X_train = self.feature_scale(X_train)
            X_val = self.feature_scale(X_val)
        return X_train, Y_train, X_val, Y_val
        

class TestDatagen(Datagen):
    
    def __init__(self, config):
        super().__init__(config)

In [8]:
'''
This could be an interface / abstract class to build audio 
to some other format instance to plug into feature extractor
'''

class Spectralizer():
    
    def __init__(self, config):
        self.config = config
        
        self.sr = config.features.sr
        self.n_fft = config.features.n_fft
        self.hop = config.features.hop_mel
        self.n_mels = config.features.n_mels
        self.fmax = config.features.fmax
        

    def raw_to_spec(self, audio, config):

        #Supposedly suggested by librosa.
        audio = audio * (2**32)

        mel_spec = librosa.feature.melspectrogram(audio, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop,
                                                 n_mels=self.n_mels, fmax=self.fmax)

        pcen = librosa.core.pcen(mel_spec, sr=self.sr)
        pcen = pcen.astype(np.float32)
        
        #Note that we transform the features here and therefor have time/frame along dim 0.
        #Transform back when loading data? Smaksak
        return pcen.T
    

In [9]:
'''
Possibly work on an raw files and annotations and return/write h5 files.
This might be clunky to include in a framework since this most likely is dataset dependent.
Might however benfit from having an interface which is inherited by classes working on specific datasets.
'''

class FeatureExtractor(abc.ABC):
    
    def __init__(self):
        pass

'''
Takes the data from the DCASE (all files one folder) and returns h5 file with the datasets 'features' and 'labels'
This takes no heed to unlabeled segments and therefor we will have no unlabeled data to work with.
This is an interesting TODO. Most likely need to rework some of the mechanisms here to work with limited RAM.
Extract segment -> write to file etc... Look at DCASE code for example
Unlabeled data could be saved to a new dataset 'unlabeled' for example.


TODO: MemError already present even before processing unlabeled data and only one of the smaller audio files.
Atleast for the non raw data. Need to fix this. Probably not hard for data processed into spectrograms since
we beforehand know the dimensions. Harder for raw audio segments.

Why are we getting MemError though? Could run the DCASE program from home with 16GB RAM.
Does not load all features into memory at once? Wonky h5py thing? Check this out!

It seems the DCASE code loads all the features into memory.

Found a bug, this however does not nessecarily discard the above comments.
Working well with memory is still most likely of importance when extracting from large sets.
'''
class MyF_Ext(FeatureExtractor):
    
    def __init__(self, config, spectralizer=None):
        self.config = config
        self.spectralizer = spectralizer
        
    def extract_features(self):
        
        if self.config.set.train:
            self.extract_train()
        else:
            self.extract_test()
    
    '''
    Assumes all *.csv and *.wav files are in the same folder which path is in config.
    Either creates spectrograms as features or raw audio segments containing events.
    Assumes annotations as those provided in 
    '''
    
    def extract_train(self):
        
        
        csv_files = [file for file in glob(os.path.join(self.config.path.data_train, '*.csv'))]
        
        if self.config.features.raw:
            
            print('Raw extraction')
            
            events = []
            labels = []
            
            for file in csv_files:
            
                print('Processing ' + file.replace('csv', 'wav'))
                audio, sr = librosa.load(file.replace('csv', 'wav'), self.config.features.sr)
                df = pd.read_csv(file, header=0, index_col=False)
                df_pos = df[(df == 'POS').any(axis=1)]
                
                #Add config options for window size around event
                df_pos.loc[:, 'Starttime'] = df_pos['Starttime'] - 0.025
                df_pos.loc[:, 'Endtime'] = df_pos['Endtime'] + 0.025
                start_time = [int(np.floor(start * sr)) for start in df_pos['Starttime']]
                end_time = [int(np.floor(end * sr)) for end in df_pos['Endtime']]
                
                #Better way of doing this?
                for i in range(len(start_time)):
                    events += [audio[start_time[i]:end_time[i]]]
                    
                labels += list(chain.from_iterable(
                    [df_pos.columns[(df_pos == 'POS').loc[index]].values for index, _ in df_pos.iterrows()]))
            
            print('Padding')
            #Pad arrays in events and format for write
            max_len = 0
            for e in events:
                if len(e) > max_len:
                    max_len = len(e)
                    
            for i in range(len(events)):
                if len(events[i]) < max_len:
                    events[i] = np.append(events[i], np.array([self.config.features.raw_pad]*(max_len-len(events[i]))))
            
            events = np.array(events)
            
            print('Writing to file')
            
            hf = h5py.File(os.path.join(self.config.path.train_w, 'raw_train.h5'), 'w')
            hf.create_dataset('features', data=events)
            hf.create_dataset('labels', data=[s.encode() for s in labels], dtype='S20')
            hf.close()
            
            print('Done')
            
        else:
            
            #DCASE more or less
            
            print('Spectrogram extraction')
            
            fps = self.config.features.sr / self.config.features.hop_mel
            seg_len = int(round(self.config.features.seg_len * fps))
            hop_seg = int(round(self.config.features.hop_seg * fps))
            
            print(seg_len)
            
            labels = []
            events = []
            
            for file in csv_files:
                
                print('Processing ' + file.replace('csv', 'wav'))
                audio, sr = librosa.load(file.replace('csv', 'wav'), self.config.features.sr)
                
                print('Spectral transform')
                pcen = self.spectralizer.raw_to_spec(audio, self.config)
                print('Done')
                
                df = pd.read_csv(file, header=0, index_col=False)
                df_pos = df[(df == 'POS').any(axis=1)]
                
                start_time, end_time = time_2_frame(df_pos, fps)
                label_f = list(chain.from_iterable(
                    [df_pos.columns[(df_pos == 'POS').loc[index]].values for index, _ in df_pos.iterrows()]))
                
                print('Slicing spectrogram')
                
                for index in range(len(start_time)):
                    
                    str_ind = start_time[index]
                    end_ind = end_time[index]
                    label = label_f[index]
                    
                    #Event longer than a segment?
                    if end_ind - str_ind > seg_len:
                        shift = 0
                        while end_ind - (str_ind + shift) > seg_len:
                            
                            pcen_patch = pcen[int(str_ind + shift):int(str_ind + shift + seg_len)]
                            events += [pcen_patch]
                            labels.append(label)
                            shift += hop_seg
                        
                        pcen_patch = pcen[end_ind - seg_len:end_ind]
                        events += [pcen_patch]
                        labels.append(label)
                    
                    #Event shorter than a segment!
                    else:
                        
                        #Repeat the patch til segment length.
                        pcen_patch = pcen[str_ind:end_ind]
                        if pcen_patch.shape[0] == 0:
                            continue
                        
                        repeats = int(seg_len/(pcen_patch.shape[0])) + 1
                        pcen_patch_new = np.tile(pcen_patch, (repeats, 1))
                        pcen_patch_new = pcen_patch_new[0:int(seg_len)]
                        events += [pcen_patch_new]
                        labels.append(label)
                        
            print('Writing to file')
            
            print(set([len(e) for e in events]))
            
            events = np.array(events)
            
            hf = h5py.File(os.path.join(self.config.path.train_w, 'mel_train.h5'), 'w')
            hf.create_dataset('features', data=events)
            hf.create_dataset('labels', data=[s.encode() for s in labels], dtype='S20')
            hf.close()
            
            print('Done')
                        
                
                        
                
    
    def extract_test(self, config):
        pass

In [10]:
#Instance with torchlibrosa to be included in model if input is raw.

class RawTransformer:
    
    def __init__(self, config):
        #Mel stuff etc
        self.config = config
    
    #Input is a training batch?
    def rtoi_standard(input):
        pass

## Episodic constructor

In [11]:
#DCASE 2021 ish
#Instance given to DataLoader on argument batch_sampler

class RandomEpisodicSampler(data.Sampler):
    
    #Include the option to choose the number of query samples
    #Y_train -> labels, just a list of the targets (list of ints?)
    def __init__(self, labels, n_episodes, n_way, n_support, n_query):
        
        #Number of episodes per epoch. len(labels)/(n_support * n_query) ?
        self.n_episodes = n_episodes
        self.n_way = n_way
        self.n_support = n_support
        self.n_query = n_query
        
        self.sample_indices = []
        for i in range(max(labels) + 1):
            ix = np.argwhere(labels == i).reshape(-1)
            ix = torch.from_numpy(ix)
            self.sample_indices.append(ix)
            
        if self.n_way > len(self.samples_indices):
            raise ValueError('Error: "n_way" parameter is higher than the unique number of classes')
    
    def __len__(self):
        return self.n_episodes
    
    def __iter__(self):
        for batch in range(self.n_episodes):
            batch = []
            #Is not not possible for the same class to be chosen more than once here or am i stupid?
            #No we only choose on class once!
            classes = torch.randperm(len(self.samples_indices))[:self.n_way]
            for c in classes:
                #l is a list of indexes of elements in target belonging to class c
                l = self.samples_indices[c]
                pos = torch.randperm(len(l))[:self.n_samples]
                batch.append(l[pos])
            batch = torch.stack(batch).t().reshape(-1)
            yield batch
        

In [None]:
class ActiveEpisodicSampler(data.Sampler):
    
    def __init__(self):
        pass

## Config/parse/util

In [12]:
#DCASE

def time_2_frame(df,fps):


    #Margin of 25 ms around the onset and offsets
    #TODO: Should be in config

    df.loc[:,'Starttime'] = df['Starttime'] - 0.025
    df.loc[:,'Endtime'] = df['Endtime'] + 0.025

    #Converting time to frames

    start_time = [int(np.floor(start * fps)) for start in df['Starttime']]

    end_time = [int(np.floor(end * fps)) for end in df['Endtime']]

    return start_time,end_time

In [16]:
#DCASE

def class_to_int(labels):
    
    class_set = set(labels)
    ltoix = {label:index for index, label in enumerate(class_set)}
    return np.array([ltoix[label] for label in labels])

In [None]:
#DCASE

#Check over this
def balance_class_distribution(X,Y):

    '''  Class balancing through Random oversampling
    Args:
    -X: Feature
    -Y: labels

    Out:
    -X_new: Feature after oversampling
    -Y_new: Oversampled label list
    '''

    x_index = [[index] for index in range(len(X))]
    set_y = set(Y)


    ros = RandomOverSampler(random_state=42)
    x_unifm, y_unifm = ros.fit_resample(x_index, Y)
    unifm_index = [index_new[0] for index_new in x_unifm]

    X_new = np.array([X[index] for index in unifm_index])

    sampled_index = [idx[0] for idx in x_unifm]
    Y_new = np.array([Y[idx] for idx in sampled_index])

    return X_new,Y_new

In [None]:
#DCASE

def norm_params(X):

    '''  Normalize features
        Args:
        - X : Features

        Out:
        - mean : Mean of the feature set
        - std: Standard deviation of the feature set
        '''


    mean = np.mean(X)

    std = np.std(X)
    return mean, std

In [None]:
def euclidean_dist(x, y):
    '''
    Compute euclidean distance between two tensors
    '''
    # x: N x D
    # y: M x D
    n = x.size(0)
    m = y.size(0)
    d = x.size(1)
    if d != y.size(1):
        raise Exception

    x = x.unsqueeze(1).expand(n, m, d)

    y = y.unsqueeze(0).expand(n, m, d)

    return torch.pow(x - y, 2).sum(2)


## Loop

In [None]:
#After loss is in place what more do we need to just start a training loop (No testing)?

## Test


In [13]:
initialize(job_name='test')

See https://hydra.cc/docs/next/upgrades/1.0_to_1.1/changes_to_hydra_main_config_path for more information.
  initialize(job_name='test')


hydra.initialize()

In [None]:
cfg = compose(config_name='config')

s = Spectralizer(cfg)
f_ext = MyF_Ext(cfg, s)
f_ext.extract_features()