In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.image as img

import numpy as np
import librosa
import librosa.display
import soundfile as sf

from IPython.display import Audio

## TODO:

Maybe pick peaks, or lowpass filter or something. I guess we are taking all the indices surrounding a peak in this method.

Especially for the FFT case, try to find the most important bands somehow.

In [None]:
# Input recording to analyse.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DEO_PATH = "/home/kureta/Music/deo.wav"
# Sample rate (Hz) requested from librosa.load and used when writing audio files.
SAMPLE_RATE = 48000
# STFT window length in samples.
N_FFT = 1024
# STFT hop length in samples.
HOP_LENGTH = 512

In [None]:
# Load the recording at SAMPLE_RATE; mono=False asks librosa not to down-mix the channels.
audio, sr = librosa.load(DEO_PATH, sr=SAMPLE_RATE, mono=False)

## Canon for 1 Voice in STFT

In [None]:
# Complex short-time Fourier transform of the loaded audio.
stft = librosa.stft(y=audio, hop_length=HOP_LENGTH, n_fft=N_FFT)
# Magnitude spectrogram in decibels, with the dynamic range limited to 120 dB.
sdb = librosa.amplitude_to_db(np.abs(stft), top_db=120)
# Average over the leading axis (presumably the channel axis, since the audio
# was loaded with mono=False — TODO confirm the file is multi-channel).
mono = np.mean(sdb, axis=0)

In [None]:
# Frequency (Hz) of every FFT bin for the analysis settings used by the STFT.
freq = librosa.fft_frequencies(sr=SAMPLE_RATE, n_fft=N_FFT)

In [None]:
# Mean level (dB) of every frequency bin, averaged over time. Computed once:
# it drives both the threshold test and the sort order below.
band_level = mono.mean(axis=1)

# Keep the bins whose mean level exceeds -24 dB, ordered from quietest to
# loudest. A stable argsort matches the ordering of Python's stable `sorted`.
indices = np.where(band_level > -24.)[0]
indices = indices[np.argsort(band_level[indices], kind='stable')]
# Optional band-limiting of the selection (disabled):
# indices = indices[(indices > 4) & (indices < 128)]
print(len(indices))

sounds = []      # one resynthesised signal per selected bin
previous = None  # running sum of the per-bin images
cents = []       # MIDI pitch (fractional) of each selected bin
hzs = []         # frequency in Hz of each selected bin
for i, idx in enumerate(indices):
    # Isolate a single frequency bin: every other bin stays zero.
    modified = np.zeros_like(stft)
    modified[:, idx, :] = stft[:, idx, :]

    # Clamp the frequency so the DC bin (0 Hz) does not hit log(0).
    midi_cents = librosa.hz_to_midi(np.maximum(freq[idx], 1e-5))
    cents.append(midi_cents)
    hzs.append(freq[idx])

    # Mel-scaled picture of the isolated bin: average over the leading axis,
    # take the log (floored at 1e-5), normalise to [0, 1] and invert.
    image = librosa.feature.melspectrogram(S=modified, sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    image = np.log(np.maximum(np.abs(image).mean(0), 1e-5))
    image -= image.min()
    image /= np.maximum(image.max(), 1e-5)
    image = 1-image
    previous = image if previous is None else previous + image

    # Save the single-bin image and the cumulative image so far.
    # NOTE(review): presumably requires the `stft-frames/` directory to exist.
    img.imsave(f'stft-frames/{i}.png', image, origin='lower', cmap='gray')
    img.imsave(f'stft-frames/c-{i}.png', previous, origin='lower', cmap='gray')

    # modified = np.repeat(modified, 3, axis=2)
    s = librosa.istft(modified, hop_length=HOP_LENGTH)
    sounds.append(s)

# canon:   cumulative sums of the bands, laid end to end
# melody:  the bands one after another
# harmony: all bands summed
# loop:    the harmony repeated eight times
canon = np.concatenate(np.cumsum(sounds, axis=0), axis=1)
melody = np.concatenate(sounds, axis=1)
harmony = np.sum(sounds, axis=0)
loop = np.concatenate([harmony] * 8, axis=1)

In [None]:
# Hand-picked subset of the per-bin sounds; the numbers are positions in `sounds`.
# Earlier selection, kept for reference:
# indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 17, 18, 19, 20, 22, 24]  # 27, 29, 31
select_indices = [1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24, 27, 28, 29, 30, 31]
picked_sounds = [sounds[idx] for idx in select_indices]

# Rebuild the four arrangements from the subset only.
# (`melody` used to be computed twice in this cell; once is enough.)
melody = np.concatenate(picked_sounds, axis=1)
canon = np.concatenate(np.cumsum(picked_sounds, axis=0), axis=1)
harmony = np.sum(picked_sounds, axis=0)
loop = np.concatenate([harmony] * 8, axis=1)

In [None]:
# Export every selected band as its own 24-bit PCM WAV file, named after its
# position in `sounds`. The array is transposed before writing.
for idx in select_indices:
    sf.write(
        file=f'../sounds/{idx}.wav',
        data=sounds[idx].T,
        samplerate=SAMPLE_RATE,
        subtype='PCM_24',
    )

In [None]:
# Inline player for the canon arrangement.
Audio(data=canon, rate=SAMPLE_RATE)

In [None]:
# Inline player for the melody arrangement (bands one after another).
Audio(data=melody, rate=SAMPLE_RATE)

In [None]:
# Inline player for the harmony arrangement (all bands summed).
Audio(data=harmony, rate=SAMPLE_RATE)

In [None]:
# Inline player for the harmony repeated eight times.
Audio(data=loop, rate=SAMPLE_RATE)

In [None]:
# Export the melody arrangement as a 24-bit PCM WAV file.
# NOTE(review): absolute, machine-specific output path.
sf.write('/home/kureta/Downloads/deo-solo.wav', melody.T, SAMPLE_RATE, subtype='PCM_24')

In [None]:
# Export every band (not only the selected ones) as a 24-bit PCM WAV file,
# numbered from 01 in the order of `sounds`.
for idx, sound in enumerate(sounds):
    sf.write(
        file=f'/home/kureta/Music/deo-{idx+1:02d}.wav',
        data=sound.T,
        samplerate=SAMPLE_RATE,
        subtype='PCM_24',
    )

## Music 21

In [None]:
from music21 import *

In [None]:
# MIDI pitch of every hand-picked band, in selection order (left unsorted).
cents_ = [cents[idx] for idx in select_indices]
# cents_.sort()

In [None]:
# Build a music21 stream with one note per selected band.
# NOTE(review): `s` is also used for the resynthesised audio in the STFT loop;
# this cell rebinds it to a Stream.
s = stream.Stream()
for i, j in enumerate(cents_):
    # Transpose down by 12 (one octave in MIDI numbers) and place at offset i.
    p = pitch.Pitch(j - 12)
    s.insert(i, note.Note(p))
# Render the score, then a MIDI player, then write the MIDI file.
s.show()
s.show('midi')
# NOTE(review): absolute, machine-specific output path.
s.write('midi', fp='/home/kureta/Downloads/deodorant.midi')

## Make it play

In [None]:
# Constants for the synthesis section.
# NOTE: this cell re-binds N_FFT and HOP_LENGTH, which the STFT section defined
# as 1024 and 512. Re-running the STFT cells after this one uses the new values.
SAMPLE_RATE = 48000
CREPE_SAMPLE_RATE = 16000
# Integer ratio between the working rate and the CREPE rate.
SR_RATIO = SAMPLE_RATE // CREPE_SAMPLE_RATE

CREPE_N_FFT = 1024
# Window length at the working rate: the CREPE window scaled by the rate ratio.
N_FFT = CREPE_N_FFT * SR_RATIO

# TODO: FRAME_RATE should be adjustable but valid values depend on audio example duration
FRAME_RATE = 250
# Samples per frame at the working rate and at the CREPE rate.
HOP_LENGTH = SAMPLE_RATE // FRAME_RATE
CREPE_HOP_LENGTH = HOP_LENGTH // SR_RATIO

In [None]:
import torch
from torch.nn import functional as F
from performer.models.ddsp_module import DDSP
from performer.utils.features import Loudness, get_f0

In [None]:
class Preprocess:
    """Extract the (f0, loudness) pair from an audio tensor."""

    def __init__(self, device):
        """Create the loudness extractor and move it to `device`."""
        self.ld = Loudness().to(device)

    def do(self, y):
        """Return `(f0, loudness)` for the audio tensor `y`.

        `y` is first right-padded with zeros so that its length is a multiple
        of HOP_LENGTH, then padded by N_FFT // 2 on both sides.
        Assumes `y` is 1-D (samples,) — TODO confirm against callers.
        """
        # F.pad is not in-place: the result has to be bound back to `y`,
        # otherwise the hop-alignment padding is silently discarded.
        if (diff := len(y) % HOP_LENGTH) != 0:
            y = F.pad(y, (0, HOP_LENGTH - diff))

        # Add two leading singleton axes and pad half a window on each side.
        audio = F.pad(y[None, None, :], (N_FFT // 2, N_FFT // 2))
        loudness = self.ld.get_amp(audio)
        f0 = get_f0(audio)

        return f0, loudness

In [None]:
# Checkpoint paths, one per instrument, relative to the working directory.
vln_ckpt = '../checkpoints/violin_longrun.ckpt'
vlc_ckpt = '../checkpoints/cello_longrun.ckpt'
flt_ckpt = '../checkpoints/flute_longrun.ckpt'

In [None]:
# Load the cello checkpoint onto the CPU and switch the model to eval mode.
with torch.inference_mode():
    model = DDSP.load_from_checkpoint(vlc_ckpt, map_location='cpu').to('cpu')
    model.eval()

In [None]:
# Feature extraction runs on CUDA, while the model above is loaded on the CPU.
preprocessor = Preprocess('cuda')

In [None]:
def moving_average(x: torch.Tensor, window_size: int) -> torch.Tensor:
    """Smooth `x` along its last axis with a zero-padded box filter.

    Args:
        x: floating-point tensor of shape (batch, 1, length). The single
            channel is required by the one-channel kernel.
        window_size: number of samples averaged per output value (>= 1).

    Returns:
        Tensor on the same device and with the same dtype as `x`. For an odd
        `window_size` the length is unchanged; for an even one the output is
        one sample longer, because `window_size // 2` zeros are added per side.

    Raises:
        ValueError: if `window_size` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    # Build the kernel with x's dtype and device; a default CPU/float32 kernel
    # makes conv1d fail for CUDA or float64 input.
    weights = torch.ones(window_size, dtype=x.dtype, device=x.device) / window_size
    x_avg = torch.nn.functional.conv1d(x, weights.view(1, 1, -1), padding=window_size // 2)

    return x_avg

In [None]:
# Difference in MIDI pitch between consecutive selected bands.
np.diff(cents_)

In [None]:
# Distinct pitch classes (MIDI pitch modulo 12, to one decimal) in ascending order.
# The modulo leaves float noise (e.g. 4.099999999999994), so the values are
# rounded again before de-duplicating; otherwise near-identical entries survive
# the set() and show up as spurious 0.0 steps in the diff below.
sortado = sorted(set(np.round(np.round(cents_, 1) % 12, 1)))
# Print in numeric order. Sorting the formatted strings would be lexicographic
# and put '10.1' before '3.0'.
print([f'{s:.1f}' for s in sortado])
print([f'{n:.1f}' for n in np.diff(sortado)])
# Hand-written scale and the same scale shifted up by 60 (MIDI numbers).
octave_reduced = [0.1, 0.5, 1.3, 3.0, 3.9, 4.8, 5.1, 5.4, 6.2, 7., 7.3, 8.3, 8.7, 9, 9.2, 10.1, 10.7, 11.7]
centos = [o + 60 for o in octave_reduced]

In [None]:
# Show the pitch class of every selected band next to the hand-written scale.
print([np.round(cents[idx] % 12, 1) for idx in select_indices])
print(octave_reduced)
print(len(octave_reduced))
# Step sizes between consecutive entries of the hand-written scale.
np.diff(octave_reduced)

In [None]:
# Pitch class (MIDI pitch modulo 12, rounded to 0.1) of every selected band,
# in selection order.
melisma = [np.round(cents[idx] % 12, 1) for idx in select_indices]
# Position of each pitch class within `octave_reduced`.
# NOTE(review): list.index() needs exact float equality and raises ValueError
# for any value missing from the hand-written list.
print([octave_reduced.index(m) for m in melisma])

In [None]:
# Extract conditioning features for the selected sounds, then replace the
# pitch track with a fixed target pitch.
f0s = []
amps = []
# NOTE(review): `centos` has 18 entries while `select_indices` and
# `picked_sounds` have 25, so zip() stops after 18 iterations — confirm this
# truncation is intended. `idx` is not used in the loop body.
for idx, sound, cc in zip(select_indices, picked_sounds, centos):
    # Average over the leading axis, convert to a float32 CUDA tensor and
    # scale so the peak absolute value is 1.
    prepared = torch.from_numpy(sound.mean(axis=0)).float().cuda()
    prepared /= prepared.abs().max()
    f0, amp = preprocessor.do(prepared)
    
    # Overwrite every f0 value with the frequency of `cc` lowered by 12 MIDI steps.
    f0[..., :] = librosa.midi_to_hz(cc - 12)
    
    f0s.append(f0)
    amps.append(amp)

In [None]:
# Use the loudness of entry 11 for every note: 25 references to the same object.
# NOTE(review): 11 and 25 are hard-coded; 25 matches len(select_indices).
amps = [amps[11]] * 25

In [None]:
# Stack the per-note pitch tracks along the first axis.
f0_batch = torch.cat(f0s)
# averaged = moving_average(f0_batch.cpu(), 5)
# f0_batch *= 2 ** (-5/12)

# Per row: replace every value further than 1/12 octave (one semitone, in log2
# terms) from the row median with that median.
for idx, val in enumerate(f0_batch):
    centre = val.median()
    condition = torch.abs(torch.log2(val) - torch.log2(centre))
    f0_batch[idx][condition > 1/12] = centre

In [None]:
# Run the model once per (f0, amp) pair on the CPU and collect the squeezed
# outputs as NumPy arrays.
zaks = []
with torch.inference_mode():
    for f0, amp in zip(f0_batch, amps):
        y = model(f0.unsqueeze(0).cpu(), amp.cpu())
        zaks.append(y.squeeze().numpy())

In [None]:
# Export every synthesised note as a 24-bit PCM WAV file, numbered from 0.
for idx, y in enumerate(zaks):
    sf.write(
        file=f'../sounds/ordered-{idx}.wav',
        data=y.T,
        samplerate=SAMPLE_RATE,
        subtype='PCM_24',
    )

In [None]:
# Same four arrangements as in the STFT section, built from the synthesised notes.
# NOTE(review): axis=1 needs every entry of `zaks` to be at least 2-D. `zaks`
# holds squeezed model outputs, so a single-channel output would be 1-D and
# these concatenations would fail — confirm the model's output shape.
zak_canon = np.concatenate(np.cumsum(zaks, axis=0), axis=1)
zak_melody = np.concatenate(zaks, axis=1)
zak_harmony = np.sum(zaks, axis=0)
zak_loop = np.concatenate([zak_harmony] * 8, axis=1)

In [None]:
# Inline player for the synthesised melody, at the rate used by the other players.
Audio(data=zak_melody, rate=SAMPLE_RATE)