In [None]:
%config InlineBackend.figure_formats = ['svg']
import matplotlib.pyplot as plt

%matplotlib inline
import librosa
import librosa.display
import numpy as np
import torch
from IPython.display import Audio
from torch.nn import functional as F

In [None]:
# Shared analysis settings used by every cell below.
sample_rate = 48_000  # Hz; target rate handed to librosa.load
n_fft = 1_024  # FFT window length in samples
hop_ratio = 2  # one window spans this many hops
frames = n_fft // 2  # frame count used for the "square" STFT duration estimate
hop_length = n_fft // hop_ratio  # samples between successive STFT frames

In [None]:
# Read the violin sample at the configured rate, keeping the file's channel
# layout (mono=False), then render an inline player for it.
y, sr = librosa.load(
    "../data/test/samples/violin-1.wav",
    mono=False,
    sr=sample_rate,
)
Audio(y, rate=sample_rate)

In [None]:
# Confirm the loader honoured the requested rate, then report the clip's shape
# and the time spanned by `frames` STFT frames at the configured hop.
assert sample_rate == sr
print(f"audio sample shape:\t{y.shape}")
square_stft_duration = librosa.frames_to_time(
    frames,
    n_fft=n_fft,
    hop_length=hop_length,
    sr=sample_rate,
)
print(f"square stft duration:\t{square_stft_duration:.2f} sec.")

In [None]:
# STFT analysis of the first audio channel.
# librosa.load(mono=False) returns a 1-D array for a single-channel file, in
# which case y[0] would be one sample rather than a channel; promoting to 2-D
# first makes channel selection work for both mono and multi-channel input.
D = librosa.stft(np.atleast_2d(y)[0], n_fft=n_fft, hop_length=hop_length)
mag, phase = librosa.magphase(D)  # D == mag * phase, phase is complex
angle = np.angle(phase)  # wrapped phase in radians
# dB magnitude relative to the peak (default 80 dB floor) rescaled to [0, 1];
# used as per-bin opacity when plotting.
alpha = librosa.amplitude_to_db(mag, ref=np.max) / 80 + 1
freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
# stft() centers its frames by default, so frame t sits at t * hop_length.
# Passing n_fft here would add an n_fft // 2 offset meant only for
# non-centered STFTs, so it is deliberately omitted.
times = librosa.times_like(D, sr=sample_rate, hop_length=hop_length)
# Frame-to-frame phase increment per frequency bin. With prepend=0 the first
# column is the (unwrapped) starting phase itself, keeping the shape of `angle`.
unwrapped_pahse_diff = np.diff(np.unwrap(angle, axis=1), axis=1, prepend=0)
angle.shape, unwrapped_pahse_diff.shape

In [None]:
# Phase-increment spectrogram: hue encodes the per-frame phase change and
# opacity encodes magnitude, drawn over a black background.
fig, ax = plt.subplots()
img = librosa.display.specshow(
    unwrapped_pahse_diff,
    # specshow defaults to sr=22050; without the real analysis parameters the
    # time and frequency axes would be labelled for the wrong sampling rate.
    sr=sample_rate,
    hop_length=hop_length,
    n_fft=n_fft,
    x_axis="time",
    y_axis="linear",
    cmap="hsv",
    alpha=alpha,
    # Pin the cyclic colormap to one full turn so it matches the colorbar ticks.
    vmin=-np.pi,
    vmax=np.pi,
    ax=ax,
)
ax.set_facecolor("#000")
ax.set(title="Unwrapped phase difference (opacity = magnitude)")
cbar = fig.colorbar(img, ticks=[-np.pi, -np.pi / 2, 0, np.pi / 2, np.pi])
cbar.ax.set(yticklabels=["-π", "-π/2", "0", "π/2", "π"])
plt.show()

## Notes

The real and imaginary parts of the complex phase can be used as two color channels, which we can then multiply by alpha.

We can sample a phase from a uniform distribution and then linearly interpolate between it and the actual phase to generate audio.

In [None]:
# Re-accumulate the per-frame phase increments. Column 0 is skipped, so the
# result is one frame shorter than `mag` and is relative to the first frame.
undiffed_phase = np.cumsum(unwrapped_pahse_diff[:, 1:], axis=1)
# Disabled experiment: blend the recovered phase with uniform noise, weighted
# by alpha, and invert directly with istft instead of Griffin-Lim.
# noise_phase = np.random.uniform(low=-np.pi, high=np.pi, size=undiffed_phase.shape)
# phase_hat = alpha * undiffed_phase + (1. - alpha) * noise_phase
# D_hat = mag[:, 1:] * np.exp(1j*undiffed_phase)

# Magnitude-only resynthesis: the lowest (DC) frequency row is replaced with
# zeros and Griffin-Lim estimates a phase from a random starting point.
y_hat = librosa.griffinlim(
    np.concatenate([np.zeros((1, mag.shape[1])), mag[1:, :]]),
    n_iter=128,
    hop_length=hop_length,
    n_fft=n_fft,
    init="random",
    # Fixed seed so Restart & Run All reproduces the same audio every time.
    random_state=0,
)
# y_hat = librosa.istft(D_hat, hop_length=hop_length, win_length=n_fft)

Audio(data=y_hat, rate=sample_rate)