In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly_resampler import FigureWidgetResampler
import numpy as np
import librosa
import librosa.display as display
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchaudio.functional as AF
import torch.fft as fft
from IPython.display import Audio
import torchcrepe
from einops.layers.torch import Rearrange, Reduce
from einops import rearrange
from dsp import HarmonicOscillator, FilteredNoise, ConvolutionalReverb

In [None]:
# Notebook-wide configuration.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
TEST_FILE_PATH = '/home/kureta/Music/cello-test.wav'
SAMPLE_RATE = 48000   # sampling rate used for loading and playback
N_FFT = 3 * 1024      # STFT frame size in samples (3072)
HOP_LENGTH = 3 * 256  # STFT hop size in samples (768)

In [None]:
def normalized(x):
    """Min-max rescale ``x`` to the range [0, 1].

    Works on any non-empty array-like that exposes ``.min()`` / ``.max()`` and
    element-wise arithmetic (NumPy arrays and torch tensors are both used in
    this notebook).

    Args:
        x: Non-empty array or tensor.

    Returns:
        ``(x - x.min()) / (x.max() - x.min())``. When every element of ``x``
        is equal, the range is zero; in that case an all-zero result is
        returned instead of dividing by zero (which would give NaN/inf).
    """
    bias = x.min()
    scale = x.max() - bias
    # Constant input: substitute a unit divisor so the (all-zero) shifted
    # signal passes through unchanged instead of becoming NaN.
    if scale == 0:
        scale = 1

    return (x - bias) / scale

In [None]:
# Load the cached audio / f0 / loudness tensors from disk.
# NOTE(review): torch.load unpickles the file — only load files you trust.
audio, f0, loudness = (
    torch.load(path)
    for path in (
        '/home/kureta/Music/audio.pth',
        '/home/kureta/Music/f0.pth',
        '/home/kureta/Music/loudness.pth',
    )
)

In [None]:
# Display the shapes of the three loaded tensors.
tuple(t.shape for t in (audio, f0, loudness))

In [None]:
# Sample-level time axis for the waveform.
n_audio_samples = audio.shape[-1]
time_ = np.arange(n_audio_samples) / SAMPLE_RATE

# Frame-level time axis for the f0 / loudness curves: frames are HOP_LENGTH//3
# samples apart at 16 kHz.
pitch_frames = np.arange(f0.shape[-1])
time_pitch = librosa.frames_to_time(pitch_frames, sr=16000, hop_length=HOP_LENGTH//3)

In [None]:
# Overlay the waveform with the normalized pitch (in MIDI units) and loudness
# curves, then play the first audio item.
fig = FigureWidgetResampler(go.Figure())
overview_traces = [
    ('audio', time_, audio[0, 0]),
    ('f0', time_pitch, normalized(librosa.hz_to_midi(f0[0, 0]))),
    # NOTE(review): loudness is indexed with [3, 0] while audio and f0 use
    # [0, 0] — confirm this is the intended item.
    ('loudness', time_pitch, normalized(loudness[3, 0])),
]
for label, xs, ys in overview_traces:
    fig.add_trace(go.Scattergl(name=label, showlegend=True), hf_x=xs, hf_y=ys)
fig.show()
Audio(audio[0], rate=SAMPLE_RATE)

In [None]:
# Load the test recording at the notebook sample rate, keeping all channels.
audio, sr = librosa.load(TEST_FILE_PATH, sr=SAMPLE_RATE, mono=False)
# A 1-D result means a single channel; add a leading channel axis so later
# cells can always index [channel, time].
if audio.ndim == 1:
    audio = audio[None, :]
# Trim to a whole number of hops. The end index is computed explicitly because
# slicing with `:-(length % HOP_LENGTH)` becomes `:-0` (an EMPTY array) whenever
# the length is already an exact multiple of HOP_LENGTH.
n_samples = audio.shape[-1] - audio.shape[-1] % HOP_LENGTH
audio = audio[:, :n_samples]
time_ = np.arange(audio.shape[-1]) / sr

In [None]:
# Magnitude STFT computed with PyTorch: N_FFT-sample Hann window, HOP_LENGTH hop.
audio_ = torch.from_numpy(audio)
# `.to(audio_)` matches the window's dtype and device to the signal tensor.
analysis_window = torch.hann_window(N_FFT).to(audio_)
S = torch.stft(
    audio_,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    win_length=N_FFT,
    window=analysis_window,
    center=True,
    normalized=False,
    return_complex=True,
).abs()

In [None]:
# A-weighted dB spectrogram of the loaded recording, computed with librosa.
stft = librosa.stft(audio, n_fft=N_FFT, hop_length=HOP_LENGTH)
n_frames = stft.shape[-1]
freqs = librosa.core.fft_frequencies(sr=sr, n_fft=N_FFT)
time_stft = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=HOP_LENGTH)

# One A-weighting offset per frequency bin, broadcast over the leading axis and
# over frames, and added to the dB magnitudes.
weights = librosa.A_weighting(freqs)
magnitude_db = librosa.amplitude_to_db(np.abs(stft))
Xmag = magnitude_db + weights[None, :, None]

# Heatmap of the first channel.
trace = [go.Heatmap(x=time_stft, y=freqs, z=Xmag[0], colorscale='Jet')]
layout = go.Layout(
    title='Spectrogram',
    xaxis={'title': 'Time'},       # x-axis label
    yaxis={'title': 'Frequency'},  # y-axis label
)

go.Figure(data=trace, layout=layout)

In [None]:
# A-weighted dB spectrogram built from the torch.stft magnitudes `S`, for
# comparison with the librosa-based spectrogram (`Xmag`).
stft = S.numpy()
time_stft = librosa.frames_to_time(np.arange(stft.shape[-1]), sr=sr, hop_length=HOP_LENGTH)
freqs = librosa.core.fft_frequencies(sr=sr, n_fft=N_FFT)

weights = librosa.A_weighting(freqs)
# `S` is already a magnitude (see `.abs()` where it is computed); np.abs is a
# no-op here and is kept only to mirror the librosa cell.
Xmag_ = weights[None, :, None] + librosa.amplitude_to_db(np.abs(stft))

# Plot the torch-derived spectrogram. The original cell passed `Xmag[0]` here,
# which re-plotted the librosa result and never showed `Xmag_`.
trace = [go.Heatmap(
    x=time_stft,
    y=freqs,
    z=Xmag_[0],
    colorscale='Jet',
    )]
layout = go.Layout(
    title='Spectrogram',
    yaxis=dict(title='Frequency'),  # y-axis label
    xaxis=dict(title='Time'),  # x-axis label
    )

go.Figure(data=trace, layout=layout)

In [None]:
# Frame-wise loudness: mean of each A-weighted dB spectrogram over the
# frequency axis (second-to-last), for the librosa (`Xmag`) and torch (`Xmag_`)
# variants, followed by min-max normalization of each curve.
loudness, loudness_ = (spec.mean(axis=-2) for spec in (Xmag, Xmag_))
norm_loudness, norm_loudness_ = (normalized(curve) for curve in (loudness, loudness_))

In [None]:
# Pitch tracking with CREPE on a 16 kHz copy of the recording.
# `audio` here is the librosa-loaded array ([channel, time]).
# NOTE(review): the leading axis is presumably treated as the batch axis by
# torchcrepe, i.e. each channel is tracked independently — confirm.
crepe_sr = 16000
# SAMPLE_RATE / 16000 == 3, so a hop of HOP_LENGTH // 3 at 16 kHz spans the same
# duration as HOP_LENGTH at SAMPLE_RATE (pitch frames line up with STFT frames).
crepe_hop = HOP_LENGTH // 3
resampled_audio = librosa.resample(audio, orig_sr=SAMPLE_RATE, target_sr=crepe_sr)

# Fall back to the CPU when no GPU is present instead of crashing on 'cuda'.
crepe_device = 'cuda' if torch.cuda.is_available() else 'cpu'
crepe_out = torchcrepe.predict(
    torch.from_numpy(resampled_audio),
    crepe_sr,
    hop_length=crepe_hop,
    decoder=torchcrepe.decode.weighted_argmax,
    device=crepe_device,
    return_periodicity=True,
)
# Upstream torchcrepe returns (pitch, periodicity) when return_periodicity=True.
# The original cell unpacked a third value (`probs`), which presumably came from
# a locally modified torchcrepe; accept either form.
freq, periodicity = crepe_out[0], crepe_out[1]
probs = crepe_out[2] if len(crepe_out) > 2 else None
freq = freq.cpu().numpy()
periodicity = periodicity.cpu().numpy()
time_pitch = librosa.frames_to_time(np.arange(freq.shape[-1]), sr=crepe_sr, hop_length=crepe_hop)

# Pitch in MIDI note numbers, plus a min-max normalized copy for plotting.
pitch = librosa.hz_to_midi(freq)
norm_pitch = normalized(pitch)

In [None]:
# Compare the two frame-wise loudness curves (first channel, halved for
# display): `loudness` derives from the librosa STFT and `loudness_` from the
# torch STFT. The traces get distinct legend names; both were previously
# labelled 'loudness' and could not be told apart.
fig = FigureWidgetResampler(go.Figure())
# fig.add_trace(go.Scattergl(name='audio', showlegend=True), hf_x=time_, hf_y=audio[0])
fig.add_trace(go.Scattergl(name='loudness (librosa)', showlegend=True), hf_x=time_stft, hf_y=loudness[0]/2)
fig.add_trace(go.Scattergl(name='loudness (torch)', showlegend=True), hf_x=time_stft, hf_y=loudness_[0]/2)
# fig.add_trace(go.Scattergl(name='pitch', showlegend=True), hf_x=time_pitch, hf_y=norm_pitch[0]/2)
# fig.add_trace(go.Scattergl(name='periodicity', showlegend=True), hf_x=time_pitch, hf_y=periodicity[0]/2)
fig.show()
Audio(audio, rate=sr)

In [None]:
# HarmonicOscillator from the local `dsp` module, configured with 32 harmonics
# and 2 channels.
# NOTE(review): consider a descriptive name (e.g. `oscillator`); renaming
# requires updating the cell that calls it.
shit = HarmonicOscillator(n_harmonics=32, n_channels=2)

In [None]:
# Control tensors for the oscillator, each with a leading batch axis of size 1.
val1 = torch.from_numpy(freq)[None, ...]
val2 = torch.from_numpy(norm_loudness)[None, ...]

# Per-harmonic weights 1 / k**2 for k = 1..32, repeated over 2 channels and
# every pitch frame ([batch, channel, harmonic, time]).
harmonic_sq = torch.arange(1, 33)**2
unit_grid = torch.ones(1, 2, 32, freq.shape[-1])
val3 = 1 / torch.einsum('bcot,o->bcot', unit_grid, harmonic_sq)

In [None]:
# Display the shapes of the three oscillator control tensors.
tuple(v.shape for v in (val1, val2, val3))

In [None]:
# Run the oscillator on the three control tensors without tracking gradients.
# NOTE(review): the arguments are presumably (f0, loudness, harmonic
# amplitudes) — confirm against dsp.HarmonicOscillator.
with torch.no_grad():
    pitched_sound = shit(val1, val2, val3)

In [None]:
# Play back the first batch item of the oscillator output.
Audio(pitched_sound[0].numpy(), rate=SAMPLE_RATE)

In [None]:
# A-weighted dB spectrogram of the oscillator output (first batch item).
synth_np = pitched_sound[0].numpy()
stft = librosa.stft(synth_np, n_fft=N_FFT, hop_length=HOP_LENGTH)
n_frames = stft.shape[-1]
freqs = librosa.core.fft_frequencies(sr=sr, n_fft=N_FFT)
time_stft = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=HOP_LENGTH)

# One A-weighting offset per frequency bin, added to the dB magnitudes.
weights = librosa.A_weighting(freqs)
magnitude_db = librosa.amplitude_to_db(np.abs(stft))
Xmag = magnitude_db + weights[None, :, None]

# Heatmap of the first channel.
trace = [go.Heatmap(x=time_stft, y=freqs, z=Xmag[0], colorscale='Jet')]
layout = go.Layout(
    title='Spectrogram',
    xaxis={'title': 'Time'},       # x-axis label
    yaxis={'title': 'Frequency'},  # y-axis label
)

go.Figure(data=trace, layout=layout)

In [None]:
# Number of noise bands: passed to FilteredNoise and used to size the
# band-control tensor.
n_bands = 64

In [None]:
# FilteredNoise from the local `dsp` module, configured with `n_bands` bands.
# NOTE(review): consider a descriptive name (e.g. `noise_synth`); renaming
# requires updating the cell that calls it.
kaka = FilteredNoise(n_bands)

In [None]:
# Band-control tensor laid out as [batch, channel, n_bands, time].
# Each frame switches on exactly one band (value 1.0) in every channel, cycling
# through bands 0 .. n_bands-1 as the frame index increases.
n_frames_noise = freq.shape[-1]
bands = torch.zeros(1, freq.shape[0], n_bands, n_frames_noise)
frame_idx = torch.arange(n_frames_noise)
bands[0, :, frame_idx % n_bands, frame_idx] = 1.0

In [None]:
# Synthesize filtered noise from the band-control tensor. Run under no_grad,
# like the oscillator and reverb cells, so the output carries no autograd graph
# and the later `.numpy()` conversion cannot fail on a tensor requiring grad.
with torch.no_grad():
    noise_sound = kaka(bands)

In [None]:
# Play back the first batch item of the filtered-noise output.
Audio(noise_sound[0].numpy(), rate=SAMPLE_RATE)

In [None]:
# A-weighted dB spectrogram of the filtered-noise output (first batch item).
noise_np = noise_sound[0].numpy()
stft = librosa.stft(noise_np, n_fft=N_FFT, hop_length=HOP_LENGTH)
n_frames = stft.shape[-1]
freqs = librosa.core.fft_frequencies(sr=sr, n_fft=N_FFT)
time_stft = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=HOP_LENGTH)

# One A-weighting offset per frequency bin, added to the dB magnitudes.
weights = librosa.A_weighting(freqs)
magnitude_db = librosa.amplitude_to_db(np.abs(stft))
Xmag = magnitude_db + weights[None, :, None]

# Heatmap of the first channel.
trace = [go.Heatmap(x=time_stft, y=freqs, z=Xmag[0], colorscale='Jet')]
layout = go.Layout(
    title='Spectrogram',
    xaxis={'title': 'Time'},       # x-axis label
    yaxis={'title': 'Frequency'},  # y-axis label
)

go.Figure(data=trace, layout=layout)

In [None]:
# ConvolutionalReverb from the local `dsp` module with 2 input and 2 output
# channels.
# NOTE(review): consider a descriptive name (e.g. `reverb`); renaming requires
# updating the cell that calls it.
dodo = ConvolutionalReverb(in_ch=2, out_ch=2)

In [None]:
# Display the shape of the oscillator output before it is fed to the reverb.
pitched_sound.shape

In [None]:
# Sum the harmonic and noise components and pass the mix through the reverb,
# without tracking gradients.
with torch.no_grad():
    signal = dodo(pitched_sound+noise_sound)

In [None]:
# Display the reverb output shape next to its input's shape.
tuple(t.shape for t in (signal, pitched_sound))

In [None]:
# Plot the first channel of the first batch item of the reverberated mix, then
# play that batch item back.
mix_first_item = signal[0]
fig = FigureWidgetResampler(px.line(mix_first_item[0]))
fig.show()
Audio(mix_first_item, rate=SAMPLE_RATE)

In [None]:
# A-weighted dB spectrogram and frame-wise loudness of an all-zero signal with
# the same shape as `audio`.
# NOTE(review): presumably used to find the floor of the loudness measure — confirm.
silence = np.zeros_like(audio)
stft = librosa.stft(silence, n_fft=N_FFT, hop_length=HOP_LENGTH)
freqs = librosa.core.fft_frequencies(sr=sr, n_fft=N_FFT)
weights = librosa.A_weighting(freqs)
magnitude_db = librosa.amplitude_to_db(np.abs(stft))
Xmag = magnitude_db + weights[None, :, None]

# Mean over the frequency axis (second-to-last).
loudness = Xmag.mean(axis=-2)

In [None]:
# Display the value range of the most recently computed A-weighted spectrogram.
Xmag.min(), Xmag.max()

In [None]:
# Display the value range of `Xmag`.
# NOTE(review): identical to the previous cell — likely a leftover duplicate.
Xmag.min(), Xmag.max()

In [None]:
# Display the value range of `Xmag`.
# NOTE(review): identical to the two preceding cells — likely a leftover duplicate.
Xmag.min(), Xmag.max()

In [None]:
# Display the shape of the frame-wise loudness array.
loudness.shape

In [None]:
# Display the shapes of the reverberated mix, the source recording and the two
# synthesized components side by side.
tuple(t.shape for t in (signal, audio, pitched_sound, noise_sound))