In [None]:
# default_exp core

In [None]:
import nbdev.showdoc as literacy

In [None]:
#export
from speechsep.imports import *
from speechsep.utils import *
from speechsep.plot import *
from speechsep.base import *

# Core

This contains most of the basic functions and spectrogram class types. To visualize the spectrograms we will also include a special color map since this makes it easier to notice differences in audio intensities.

The most important things to remember are
- How to create an AudioItem from file, both mono and multi-channel.
- Creating a SpecImage and how the parameters influence the final result.
- Basic SpecImage Visualizer (more in-depth explanation here***)

# Loading Data

In [None]:
#export
# Exported alias for `load` (not defined in this notebook; presumably supplied by the `speechsep` star imports above).
load_audio = load

In [None]:
# Load the sample clip and show an inline audio player for it.
# `fn`, `sig` and `sr` are reused by later cells in this notebook.
fn = Path("../data/AudioTest1.wav")
sig, sr = load_audio(fn)
display(Audio(sig, rate=sr))

In [None]:
#hide
# `load_audio` must hand back the signal as a numpy array and the sample rate as a plain int.
test_eq(type(sig), np.ndarray)
test_eq(type(sr), int)

### MonoAudios
Audio with only one channel. For now this is the only audio type; if the file has more channels, they will be averaged into one.

In [None]:
#export
class AudioMono(AudioBase):
    "Audio item whose `create` / `load_file` constructor builds it straight from a file."
    _show_args={}
    @classmethod
    def create(cls, fn:Path, sr=None):
        "Load `fn` with `load_audio`; when `sr` is truthy, assign it to the new item's `sr`."
        sig, file_sr = load_audio(fn)
        item = cls(sig, file_sr, fn)
        if sr: item.sr = sr
        return item
    # Second name for the same constructor.
    load_file = create

In [None]:
# Build the same file twice: once keeping the rate returned by the loader, once overriding it.
aud1 = AudioMono.create(fn) #default file sample rate
aud2 = AudioMono.create(fn, sr=2205) #custom sample rate, could cause loss of quality

In [None]:
#hide
# Without `sr`, the item keeps the loader's rate and remembers its source path.
test_eq(type(aud1), AudioMono)
test_eq(aud1.sr, 22050)
test_eq(aud1.fn, fn)

# With `sr`, the requested rate is what the item reports; signal and rate keep their basic types.
test_eq(aud2.sr, 2205)
test_eq(type(aud2.sig), np.ndarray)
test_eq(type(aud2.sr), int)

In [None]:
#export
@patch_property
def duration(x:AudioMono):
    "Number of samples in `x.sig` divided by `x.sr`."
    n_samples = len(x.sig)
    return n_samples/x.sr

In [None]:
#hide
# The sample clip is roughly four seconds long.
test_eq(type(aud1.duration), float)
test_eq(round(aud1.duration), 4)

# NOTE: this mutates `aud1`, which later cells reuse.
aud1.sr = 48000

# Assigning a new rate is expected to leave the duration unchanged.
test_eq(aud1.sr, 48000)
test_eq(round(aud1.duration), 4)

# Spectrograms

## SpecImage
Gives the template for the rest of the Spectrogram classes. There will be transforms to add mel-bins and decibels.

In [None]:
#export
# Spectrogram item used throughout this notebook; it adds nothing of its own on top of `SpecBase`.
class SpecImage(SpecBase): pass

# Transforms

## Tensors

In [None]:
#export
class ArrayAudioBase(ArrayBase):
    "Array type for audio that displays itself through `show_audio`."
    # Default keyword arguments for `show_audio`; call-time `kwargs` take precedence.
    _show_args = {}
    def show(self, ctx=None, **kwargs):
        "Display via `show_audio` on `ctx`, merging `_show_args` with `kwargs`."
        # `ctx` must be a parameter: the call below forwards it, and without it
        # the name is undefined and every call raises NameError.
        return show_audio(self, ctx=ctx, **{**self._show_args, **kwargs})

class ArraySpecBase(ArrayBase):
    "Array type for spectrograms that displays itself through `show_spec`."
    # Default keyword arguments for `show_spec`; call-time `kwargs` take precedence.
    _show_args = {}
    def show(self, ctx=None, **kwargs):
        "Display via `show_spec` on `ctx`, merging `_show_args` with `kwargs`."
        # `ctx` must be a parameter: the call below forwards it, and without it
        # the name is undefined and every call raises NameError.
        return show_spec(self, ctx=ctx, **{**self._show_args, **kwargs})

class ArrayMaskBase(ArrayBase):
    "Array type for masks that displays itself through `show_mask`."
    # Default keyword arguments for `show_mask`; call-time `kwargs` take precedence.
    _show_args = {}
    def show(self, ctx=None, **kwargs):
        "Display via `show_mask` on `ctx`, merging `_show_args` with `kwargs`."
        # `ctx` must be a parameter: the call below forwards it, and without it
        # the name is undefined and every call raises NameError.
        return show_mask(self, ctx=ctx, **{**self._show_args, **kwargs})

class TensorAudio(TensorBase):
    "Tensor type for audio; shares the `_show_args` dict of `ArrayAudioBase`."
    _show_args = ArrayAudioBase._show_args
    def show(self, ctx=None, **kwargs):
        "Display via `show_audio` on `ctx`; `kwargs` override `_show_args`."
        merged = {**self._show_args, **kwargs}
        return show_audio(self, ctx=ctx, **merged)
    
class TensorSpec(TensorBase):
    "Tensor type for spectrograms; shares the `_show_args` dict of `ArraySpecBase`."
    _show_args = ArraySpecBase._show_args
    def show(self, ctx=None, **kwargs):
        "Display via `show_spec` on `ctx`; `kwargs` override `_show_args`."
        merged = {**self._show_args, **kwargs}
        return show_spec(self, ctx=ctx, **merged)
    
class TensorMask(TensorBase):
    "Tensor type for masks; shares the `_show_args` dict of `ArrayMaskBase`."
    _show_args = ArrayMaskBase._show_args
    def show(self, ctx=None, **kwargs):
        "Display via `show_mask` on `ctx`; `kwargs` override `_show_args`."
        merged = {**self._show_args, **kwargs}
        return show_mask(self, ctx=ctx, **merged)

In [None]:
#export
# Record on each item class the tensor type that the `ToTensor` encoders below wrap their result in.
AudioMono._tensor_cls = TensorAudio
SpecImage._tensor_cls = TensorSpec

# Using the `ToTensor` class as a decorator registers each function as an extra `encodes`
# implementation, selected by the annotation on `o` (fastcore type dispatch).
# Each encoder converts the item with the matching helper, then wraps the result in the item's `_tensor_cls`.
@ToTensor
def encodes(self, o:AudioBase): return o._tensor_cls(audio2tensor(o))
@ToTensor
def encodes(self, o:SpecBase): return o._tensor_cls(spec2tensor(o))

def audio2tensor(aud:AudioBase):
    "Wrap the signal stored in `aud.sig` in a `TensorAudio`."
    return TensorAudio(aud.sig)
def spec2tensor(spec:SpecBase):
    "Wrap the values stored in `spec.data` in a `TensorSpec`."
    return TensorSpec(spec.data)

In [None]:
#hide
# An `AudioMono` pushed through `ToTensor` must come out as a `TensorAudio`.
test_eq(type(ToTensor()(aud1)), TensorAudio)

## Spectify
Transform that turns an AudioItem into a Spectrogram. It can take the parameters for decibel and mel_bin, which are the main transformations that are used. Standard problems will require decibels because they resemble human hearing. Mel-bins also achieve this, but they require us to lose a large portion of the phase, which reduces the intelligibility of the audio.

In [None]:
#export
class Spectify(Transform):
    "Convert `AudioMono` items to `SpecImage` and back with the supplied `stft` / `istft` callables."
    def __init__(self, sr=48000, stft=stft, istft=istft):
        store_attr(self, 'sr, stft, istft')
    def encodes(self, audio:AudioMono):
        "Run `stft` on the signal; the sample rate and file name are carried over."
        return SpecImage(self.stft(audio.sig), audio.sr, audio.fn)
    def decodes(self, spec:SpecBase):
        "Run `istft` on the spectrogram data; the sample rate and file name are carried over."
        return AudioMono(self.istft(spec.data), spec.sr, spec.fn)
    def decodes(self, data:ArraySpecBase):
        "Wrap a bare spectrogram array in a `SpecImage`, using this transform's `sr`."
        return SpecImage(data, self.sr)

In [None]:
# Load the sample clip and turn it into a spectrogram with the default `Spectify` settings.
audio = AudioMono.load_file(fn)
Audio2Spec = Spectify()
spec = Audio2Spec(audio)

In [None]:
#hide
# The spectrogram keeps the audio's file name and sample rate and stores its data as a numpy array.
test_eq(type(spec), SpecImage)
test_eq(type(spec.data), np.ndarray)
test_eq(spec.fn, fn)
test_eq(spec.sr, 22050)

In [None]:
# Go back from the spectrogram to audio.
audio_r = Audio2Spec.decodes(spec)

In [None]:
#hide
# Decoding must give back an `AudioMono` with the same sample rate and file name.
test_eq(type(audio_r), AudioMono)
test_eq(type(audio_r.sig), np.ndarray)
test_eq(audio_r.sr, 22050)
test_eq(audio_r.fn, fn)

## Decibelify
Turns the spectrogram amplitude into decibels; it is automatically called in `Spectify`. Decibels are the amplitude (the intensity of each "pixel") on a log scale.

In [None]:
#export
class Decibelify(Transform):
    "Move a `SpecImage` to natural-log scale (`encodes`) and back (`decodes`), overwriting `spec.data`."
    def __init__(self): pass
    def encodes(self,spec:SpecImage):
        "Replace `spec.data` with its element-wise `np.log` and return the same object."
        logged = np.log(spec.data)
        spec.data = logged
        return spec
    def decodes(self,spec:SpecImage):
        "Replace `spec.data` with its element-wise `np.exp` and return the same object."
        restored = np.exp(spec.data)
        spec.data = restored
        return spec

## Mel-binify
Transforms the frequency to mel-bins. Just like decibels, this transform also resembles human hearing better than linear frequencies do. Sadly, making mel-bins also makes it difficult to reconstruct the audio, since the phase and data loss is very high. Recommended for classification problems.

In [None]:
#export
from librosa.feature import melspectrogram
class Mel_Binify_lib(Transform):
    "Turn audio straight into a mel spectrogram with librosa's `melspectrogram`."
    @delegates(melspectrogram)
    def __init__(self, **kwargs):
        # Bind the user's options once; the signal and sample rate are supplied per item.
        self.audio2mel = partial(melspectrogram, **kwargs)
    def encodes(self,audio:AudioBase):
        "Compute the mel spectrogram of `audio.sig` at `audio.sr` and wrap it in a `SpecImage`."
        # Pass `y` and `sr` by keyword: recent librosa releases make them keyword-only,
        # and the keyword form also works on older releases.
        spec = self.audio2mel(y=audio.sig, sr=audio.sr)
        return SpecImage(spec, audio.sr)

In [None]:
# Unfinished placeholder: `encodes` and `decodes` have empty bodies and return None,
# and this cell carries no `#export` flag.
class Mel_Binify(Transform):
    def __init__(self): pass #TODO Parameters f_max f_min | check more on librosas melbin
    #TODO Add librosa melbin straight from audio?
    def encodes(self,spec:SpecBase): pass
    def decodes(self,spec:SpecBase): pass

## MFCCify

In [None]:
#export
from librosa.feature import mfcc
class MFCCify(Transform):
    "Turn audio straight into MFCC features with librosa's `mfcc`."
    @delegates(mfcc)
    def __init__(self, **kwargs):
        # Bind the user's options once; the signal and sample rate are supplied per item.
        self.audio2mfcc = partial(mfcc, **kwargs)
    def encodes(self,audio:AudioBase):
        "Compute the MFCCs of `audio.sig` at `audio.sr` and wrap them in a `SpecImage`."
        # Pass `y` and `sr` by keyword: recent librosa releases make them keyword-only,
        # and the keyword form also works on older releases.
        spec = self.audio2mfcc(y=audio.sig, sr=audio.sr)
        return SpecImage(spec, audio.sr)

### Create Spec

In [None]:
#export
@patch_clsmthd
@delegates(to=Spectify)
def create(cls:SpecImage, fn, sr=None, **kwargs):
    "Create a spectrogram from `fn`, which is a `Path`/`str` to an audio file or an `AudioMono`."
    # `sr` is only used when loading from a path; `kwargs` are handed to `Spectify`.
    # Forward `kwargs` on the path branch too, otherwise the `Spectify` options
    # would be silently dropped whenever a file name is given.
    if isinstance(fn,(Path,str)): return cls.create(AudioMono.create(fn,sr), **kwargs)
    elif isinstance(fn,AudioMono): return Spectify(**kwargs)(fn)
    raise ValueError('fn must be AudioMono, Path or str')

In [None]:
# Build the spectrogram directly from the file path.
spec = SpecImage.create(fn)

In [None]:
#hide
# Creating from a path must give the same kind of result as going through `AudioMono` and `Spectify` by hand.
test_eq(type(spec), SpecImage)
test_eq(type(spec.data), np.ndarray)
test_eq(spec.sr, 22050)
test_eq(spec.fn, fn)

## BasicTransforms

In [None]:
#export
class Resample(Transform):
    "Assign the sample rate `sr` to every audio item passed through."
    def __init__(self, sr): self.sr = sr
    def encodes(self, x:AudioBase):
        "Set `x.sr` to the configured rate and return the same item."
        x.sr = self.sr
        return x

In [None]:
# Halve the rate; `sr` here is the value returned by `load_audio` in the loading cell near the top.
audio_resamp = Resample(sr//2)(AudioMono.create(fn))

# Both the public `sr` and the `_sr` attribute must report the new rate.
test_eq(audio_resamp.sr, 11025)
test_eq(audio_resamp._sr, 11025)
audio_resamp

../data/AudioTest1.wav, 4.115102040816327secs at 11025 samples per second

In [None]:
#export
class Clip(Transform):
    "Force an audio signal to last exactly `time` seconds by zero-padding its end or cutting it short."
    def __init__(self, time): self.time = time
    def encodes(self, x:AudioBase):
        "Overwrite `x.sig` with a version that is `int(time*x.sr)` samples long and return `x`."
        target_len = int(self.time*x.sr)
        current_len = len(x.sig)
        if current_len > target_len:
            # Too long: keep only the leading samples.
            x.sig = x.sig[:target_len]
        else:
            # Too short (or already exact): append the missing number of zeros.
            missing = target_len - current_len
            x.sig = np.pad(x.sig, (0,missing), 'constant', constant_values=(0, 0))
        return x

In [None]:
# The sample clip is about four seconds long, so clipping to five seconds pads it.
audio_ext = Clip(5)(AudioMono.create(fn))

# Presumably a longer recording, so clipping to four seconds truncates it (the file is not inspected here).
fn_long = Path("../data/AudioTest1_full.wav")
audio_clip = Clip(4)(AudioMono.create(fn_long))

In [None]:
#hide
# In both directions the result must last exactly the requested time and hold time*sr samples.
test_eq(audio_ext.duration, 5.0)
test_eq(len(audio_ext.sig), 5.0*audio_ext.sr)
test_eq(audio_clip.duration, 4.0)
test_eq(len(audio_clip.sig), 4.0*audio_clip.sr)

## Phase and Complex Tensor Managing

In [None]:
#export
class PhaseManager(Transform):
    "Switch spectrograms between complex and real form via `complex2real` / `real2complex`, according to `mthd`."
    def __init__(self, mthd="new_dim", cls=SpecImage):
        assert mthd in ['new_dim', 'remove', 'replace'], 'phase method must be either new_dim, remove or replace'
        store_attr(self, 'mthd, cls')

    def encodes(self, spec:SpecImage):
        # NOTE(review): only 'new_dim' is implemented; with 'remove' or 'replace' this
        # falls through and returns None.
        if self.mthd == 'new_dim': return complex2real(spec)

    #BUG ArraySpecBase not Casting to return value
    def decodes(self, spec:TensorSpec)->ArraySpecBase:
        # NOTE(review): as in `encodes`, only 'new_dim' is implemented here.
        if self.mthd == 'new_dim':
            spec = real2complex(spec)
            #HACK not sure how else to get the output to be and ArraySpecBase
            # If this is removed Spectify would have to decode a numpy array and that's not always what we want.
            # If it doesn't find how to decode an ndarray it will try to show and ndarray doesn't have that function
            # Use the builtin `complex`: `np.complex` was only an alias for it and has
            # been removed from recent NumPy releases.
            temp = ArraySpecBase(spec.shape, dtype=complex)
            temp[:,:] = spec
            return temp