In [None]:
# default_exp core

In [None]:
import nbdev.showdoc as literacy

In [None]:
#export
from speechsep.imports import *
from speechsep.utils import stft,istft,ResampleSignal
from speechsep.plot import setup_graph

# Core

This contains most of the basic functions and spectrogram class types. To visualize the spectrograms we will also include a special color map since this makes it easier to notice differences in audio intensities.

The most important things to remember are
- How to create an AudioItem from file, both mono and multi-channel.
- Creating a SpecImage and how the parameters influence the final result.
- Basic SpecImage Visualizer (more in-depth explanation here***)

## Loading Data

In [None]:
#export
@delegates(load)
def load_audio(fn, **kwargs):
    """Load the audio file `fn` by calling `load`.

    Any extra keyword arguments are forwarded to `load` unchanged, which is
    what the `@delegates(load)` signature advertises.

    Returns whatever `load` returns; elsewhere in this file the result is
    unpacked as `(sig, sr)`.
    """
    return load(fn, **kwargs)

In [None]:
# Load the sample clip and render an inline player for it.
fn = Path("../data/AudioTest1.wav")
sig, sr = load_audio(fn)
display(Audio(sig, rate=sr))

# The loader is expected to hand back a NumPy signal and an integer sample rate.
test_eq(type(sig), np.ndarray)
test_eq(type(sr), int)

## AudioBase
The current base class for audio which is used for mono and multi-channel audio types.

In [None]:
#export
class AudioBase():
    """Base container for an audio signal.

    `__init__` stores three attributes: `sig` (the signal), `_sr` (the sample
    rate) and `fn` (the source file, optional).
    """
    def __init__(self,sig,_sr,fn=None):
        store_attr(self, 'sig,_sr,fn')
    # `__str__` and `__repr__` read `self.sr`, so the base class has to expose it;
    # subclasses may replace this property with one that does more on assignment.
    @property
    def sr(self):
        "Sample rate stored in `_sr`."
        return self._sr
    @sr.setter
    def sr(self, new_sr):
        "Record `new_sr` without touching the signal."
        self._sr = new_sr
    def __repr__(self):
        "Show an inline audio player as a side effect, then return the `__str__` text."
        display(Audio(self.sig, rate=self.sr)); return f'{self.__str__()}'
    def __str__(self):
        "File name, length (`len(sig)/sr`) and sample rate in one line."
        return f'{self.fn}, {len(self.sig)/self.sr}secs at {self.sr} samples per second'
    @delegates(Line2D)
    def show(self, **kwargs):
        "Plot the raw signal with `plt.plot`, forwarding `kwargs` to it."
        plt.plot(self.sig, **kwargs)

## MonoAudios
Audios with only one channel. For now this is the only audio type; if the file has more channels, they will be averaged into one.

In [None]:
#export
class AudioMono(AudioBase):
    "Audio whose `sr` property resamples the stored signal when a different rate is assigned."
    @classmethod
    def create(cls, fn, sr=None):
        """Load `fn` with `load_audio` and wrap the result in a new instance.

        When `sr` is truthy it is assigned to the instance, which goes through
        the `sr` setter below.
        """
        loaded = load_audio(fn)
        instance = cls(*loaded, fn)
        if sr:
            instance.sr = sr
        return instance
    # Alternate name for the same constructor.
    load_file = create
    @property
    def sr(self):
        "Current sample rate."
        return self._sr
    @sr.setter
    def sr(self, new_sr):
        "Resample `sig` with `ResampleSignal` if the rate differs, then record `new_sr`."
        rate_changed = self._sr != new_sr
        if rate_changed:
            resampler = ResampleSignal(new_sr)
            self.sig = resampler(self.sig, self.sr)
        self._sr = new_sr

In [None]:
aud1 = AudioMono.create(fn) # no `sr` given: keep the sample rate returned by the loader
aud2 = AudioMono.create(fn, sr=2205) # custom sample rate: the signal is resampled, which could cause loss of quality

In [None]:
# Without an explicit `sr` the clip keeps its loaded rate (22050 for this file) and remembers its path.
test_eq(type(aud1), AudioMono)
test_eq(aud1.sr, 22050)
test_eq(aud1.fn, fn)

In [None]:
#hide
# With `sr=2205` the instance reports the requested rate; the signal is still a NumPy array.
test_eq(aud2.sr, 2205)
test_eq(type(aud2.sig), np.ndarray)
test_eq(type(aud2.sr), int)

In [None]:
@patch_property
def duration(x:AudioMono):
    "Length of the clip: number of entries in `x.sig` divided by the sample rate `x.sr`."
    n_samples = len(x.sig)
    return n_samples/x.sr

In [None]:
#hide
# `duration` is a float; for the sample clip it rounds to 4.
test_eq(type(aud1.duration), float)
test_eq(round(aud1.duration), 4)

In [None]:
# Assigning to `sr` runs the property setter, which resamples `sig` to the new rate.
aud1.sr = 48000

In [None]:
#hide
# After resampling the rate is updated while the duration still rounds to 4.
test_eq(aud1.sr, 48000)
test_eq(round(aud1.duration), 4)

### Convert to Tensor and Array

In [None]:
class ArrayAudioBase(ArrayBase):
    "`ArrayBase` subclass for audio that displays itself through `show_audio`."
    # Default keyword arguments merged into every `show` call.
    _show_args = {}
    def show(self, ctx=None, **kwargs):
        """Display this array with `show_audio`.

        `ctx` is accepted as a parameter (it was previously read as an
        undefined name) and passed through; `kwargs` override `_show_args`.
        """
        return show_audio(self, ctx=ctx, **{**self._show_args, **kwargs})

In [None]:
class TensorAudio(TensorBase):
    "`TensorBase` subclass used as the tensor type for audio."
    # Same dict object as `ArrayAudioBase._show_args`, so the defaults are shared.
    _show_args = ArrayAudioBase._show_args
    def show(self, ctx=None, **kwargs):
        "Display the tensor, with `kwargs` taking precedence over `_show_args`."
        # NOTE(review): this goes through `show_image` while `ArrayAudioBase.show`
        # uses `show_audio` -- confirm that is intended for audio data.
        merged = {**self._show_args, **kwargs}
        return show_image(self, ctx=ctx, **merged)

In [None]:
def audio2tensor(aud:AudioBase):
    "Wrap the raw signal `aud.sig` in a `Tensor`."
    signal = aud.sig
    return Tensor(signal)

In [None]:
# Tensor type produced when an `AudioMono` is converted by `ToTensor`.
AudioMono._tensor_cls = TensorAudio
@ToTensor
def encodes(self, o:AudioBase):
    "Convert `o` to a tensor of the class named by `o._tensor_cls`."
    # Dispatch is on `AudioBase`, but only `AudioMono` is given `_tensor_cls`
    # above; fall back to `TensorAudio` so other `AudioBase` objects do not
    # fail with AttributeError.
    tensor_cls = getattr(o, '_tensor_cls', TensorAudio)
    return tensor_cls(audio2tensor(o))

In [None]:
@typedispatch
def show_batch(x:AudioMono, y, samples, ctxs=None, max_n=10, rows=None, cols=None, figsize=None, **kwargs):
    """Show a batch whose inputs are `AudioMono`.

    Builds a grid of contexts with `get_grid` when none is supplied (one per
    sample, capped at `max_n`), then hands off to the `object` implementation
    of `show_batch` and returns what it returns.
    """
    if ctxs is None:
        n_ctxs = min(len(samples), max_n)
        ctxs = get_grid(n_ctxs, rows=rows, cols=cols, figsize=figsize)
    return show_batch[object](x, y, samples, ctxs=ctxs, max_n=max_n, **kwargs)

In [None]:
# A `TensorAudio` can be built directly from a tensor of the signal.
TensorAudio(Tensor(aud1.sig))

# Applying `ToTensor` to an `AudioMono` is expected to give a `TensorAudio`.
test_eq(type(ToTensor()(aud1)), TensorAudio)

In [None]:
#export
def show_audio(aud, ctx=None, **kwargs):
    """Display an inline audio player for `aud`.

    `aud` may be an object exposing the signal as `sig` (and optionally the
    sample rate as `sr`), or raw data that `Audio` accepts directly. Extra
    keyword arguments (for example `rate`) are forwarded to `Audio`.

    `ctx` is accepted so callers that pass a display context keep working; it
    is not used for rendering and is returned unchanged.
    """
    # Use the object's own sample rate unless the caller supplied one.
    if hasattr(aud, 'sr'): kwargs.setdefault('rate', aud.sr)
    data = getattr(aud, 'sig', aud)
    display(Audio(data, **kwargs))
    return ctx

## Spectrograms

### SpecImage
Gives the template for the rest of the Spectrogram classes. There will be transforms to add mel-bin and decibels

In [None]:
#export
class SpecImage():
    """Spectrogram `data` together with its sample rate `sr` and source file `fn`.

    `plt_params` holds keyword arguments for `plt.pcolormesh`; assigning a
    mapping to it also stores a matching pre-bound plotting callable in `_plot`.
    """
    def __init__(self, data, sr, fn=None):
        store_attr(self, 'data, sr, fn')
        self._plt_params = {}
    @property
    def plt_params(self):
        "Keyword arguments most recently assigned (empty dict by default)."
        return self._plt_params
    @plt_params.setter
    def plt_params(self, params):
        "Store `params` (a mapping of `plt.pcolormesh` keyword arguments) and rebuild `_plot`."
        # A property setter is always called with exactly one positional value,
        # so the keyword arguments have to arrive as a mapping, not as **kwargs.
        params = dict(params)
        self._plot = partial(plt.pcolormesh, **params)
        self._plt_params = params

### Spectify
Transform that turns an AudioItem into a spectrogram. It can take the `decibel` and `mel_bin` parameters, which are the main transformations that are used. Standard problems will require decibels because they resemble human hearing. Mel-bins also achieve this, but they require us to lose a large portion of the phase, which reduces the intelligibility of the audio.

In [None]:
#export
class Spectify(Transform):
    """Transform between `AudioMono` and `SpecImage` using `stft` / `istft`.

    `decibel` and `mel_bin` are stored but do not change the result yet (see
    the TODO markers in `encodes` and `decodes`).
    """
    def __init__(self, fftsize=512, win_mult=2, overlap=0.5, decibel=False, mel_bin=False):
        store_attr(self, 'fftsize, win_mult, overlap, decibel, mel_bin')
    def encodes(self, audio:AudioMono):
        "Run `stft` on the signal and wrap the result, keeping the audio's `sr` and `fn`."
        spectrogram = stft(audio.sig, self.fftsize, self.win_mult, self.overlap)
        if self.decibel: pass #TODO Encode
        if self.mel_bin: pass #TODO Encode
        return SpecImage(spectrogram, audio.sr, audio.fn)
    def decodes(self, spec):
        "Run `istft` on `spec.data` and wrap the result, keeping the spectrogram's `sr` and `fn`."
        signal = istft(spec.data, self.fftsize, self.win_mult, self.overlap)
        if self.decibel: pass #TODO Decode
        if self.mel_bin: pass #TODO Decode
        return AudioMono(signal, spec.sr, spec.fn)

### Decibelify
Turns spectrogram amplitude into decibels. It is meant to be applied automatically by `Spectify` when `decibel=True` (that step is still a TODO in `Spectify`). Decibel is the same as amplitude (intensity of each "pixel") in log-scale.

In [None]:
class Decibelify(Transform):
    "Placeholder transform: the method bodies below are still empty."
    def __init__(self):
        "Nothing to configure yet."
    def encodes(self,spec):
        "Not implemented yet; the body is empty."
    def decodes(self,spec):
        "Not implemented yet; the body is empty."

### Mel-binify
Transforms the frequency to mel-bins. Just like decibels, this transform also resembles human hearing better than linear frequencies do. Sadly, making mel-bins also makes it difficult to reconstruct the audio since the phase and data loss is very high. Recommended for classification problems.

In [None]:
class Mel_Binify(Transform):
    "Placeholder transform: the method bodies below are still empty."
    def __init__(self):
        "Nothing to configure yet."
    def encodes(self,spec):
        "Not implemented yet; the body is empty."
    def decodes(self,spec):
        "Not implemented yet; the body is empty."

In [None]:
# Load the clip, build a transform with default parameters and apply it.
audio = AudioMono.load_file(fn)
Audio2Spec = Spectify()
spec = Audio2Spec(audio)

In [None]:
#hide
# The spectrogram keeps the file name and sample rate of the audio it came from.
test_eq(type(spec), SpecImage)
test_eq(type(spec.data), np.ndarray)
test_eq(spec.fn, fn)
test_eq(spec.sr, 22050)

In [None]:
#export
@patch
@delegates(setup_graph)
def show(x:SpecImage, ctx=None, **kwargs):
    """Plot the magnitude of the spectrogram.

    `kwargs` are forwarded to `setup_graph`. Drawing uses the callable stored
    on the instance as `_plot` (written by the `plt_params` setter) when there
    is one, and plain `plt.pcolormesh` otherwise. `ctx` is accepted but unused.
    """
    setup_graph(**kwargs)
    # Only the first half of the rows of `data` is drawn.
    # NOTE(review): presumably the second half mirrors the first -- confirm against `stft`.
    half = x.data.shape[0]//2
    plot = getattr(x, '_plot', plt.pcolormesh)
    plot(abs(x.data[:half]))

In [None]:
# Keyword arguments given to `show` are forwarded to `setup_graph`.
spec.show(title='one two', x_label='time', y_label='frequency', fig_size = [12,8])

In [None]:
# Turn the spectrogram back into audio with the same transform instance.
audio_r = Audio2Spec.decodes(spec)
audio_r

In [None]:
#hide
# The reconstructed audio carries over the sample rate and file name from the spectrogram.
test_eq(type(audio_r), AudioMono)
test_eq(type(audio_r.sig), np.ndarray)
test_eq(audio_r.sr, 22050)
test_eq(audio_r.fn, fn)

### Create Function

In [None]:
@patch_clsmthd
@delegates(to=Spectify)
def create(cls:SpecImage, fn, sr=None, **kwargs):
    """Build a spectrogram from a file path or from an `AudioMono`.

    `fn` may be a `Path`/`str`, which is first opened with
    `AudioMono.create(fn, sr)`, or an existing `AudioMono`, in which case `sr`
    is not used. `kwargs` are passed to `Spectify` in both cases.

    Raises `ValueError` for any other type of `fn`.
    """
    # Open the file first, then fall through to the common path so the
    # `Spectify` options are applied to path inputs as well.
    if isinstance(fn,(Path,str)): fn = AudioMono.create(fn,sr)
    if isinstance(fn,AudioMono): return Spectify(**kwargs)(fn)
    raise ValueError('fn must be AudioMono, Path or str')

In [None]:
# Build the spectrogram straight from the file path, then plot it.
spec = SpecImage.create(fn)
spec.show(fig_size=[12,8])

## Masks

In [None]:
#export
class MaskBase():
    """Base class for masks; stores the mask values in `data`.

    Subclasses provide `generate` (how one mask is computed) and `__mult__`
    (how a mask is applied); `adjust` is an optional hook run before masks
    are generated.
    """
    def __init__(self, data):
        store_attr(self, 'data')
    @property
    def shape(self):
        "Shape of the underlying `data`."
        return self.data.shape
    @classmethod
    def create(cls, audios):
        "Return one mask per item of `audios`, each built from `generate(joined, aud)`."
        # `adjust` and `generate` are instance methods, but no mask exists yet
        # (and a classmethod has no `self`), so call them on a bare instance
        # created without running `__init__`.
        hooks = cls.__new__(cls)
        hooks.adjust(audios)
        joined = join_audios(audios)
        return [cls(hooks.generate(joined, aud)) for aud in audios]
    def adjust(self, audios):
        "Hook called by `create` before masks are generated; does nothing by default."
        pass
    def __mul__(self, spec):
        "Make `mask * spec` work by delegating to the `__mult__` hook."
        return self.__mult__(spec)
    def __mult__(self, spec):
        "Apply the mask to `spec`; must be overridden."
        raise NotImplementedError('This function needs to be implemented before use')
    def generate(self, joined, aud):
        "Compute the mask values for `aud` given the `joined` audio; must be overridden."
        raise NotImplementedError('This function needs to be implemented before use')

In [None]:
#export
class MaskBinary(MaskBase):
    "Binary mask; both hooks below are still empty placeholders."
    def __mult__(self, spec):
        "Not implemented yet; the body is empty."
        # NOTE(review): Python's `*` operator looks up `__mul__`, not `__mult__` -- confirm the intended name.
    def __generate__(self, joined, aud):
        "Not implemented yet; the body is empty."
        # NOTE(review): `MaskBase.create` calls `generate`, which this name does not override -- confirm.