In [None]:
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
#%matplotlib notebook

from sax_synth_inference import run_on_validation_set, g_mel, load_generator_model, synthetic2octaves, generate_from_audio

# Init the generator and Mel spec class

In [None]:
import torch

# Checkpoint file of the trained generator that the rest of the notebook uses.
gen_path = '../results/2023_05_28_hifigan_ssynth44khz_synthesized_input_16k_spl0.5_nonorm/hifigan_gen_checkpoint_3000.pt'

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# The loader hands back the generator, a denoiser and the training configuration.
gen, denoiser, train_cfg = load_generator_model(gen_path, device=DEVICE)
sampling_rate = train_cfg['sampling_rate']

# Initialise the shared mel-spectrogram helper from the same training configuration.
MEL_IMPL = 'hifigan'
g_mel.init(train_cfg, MEL_IMPL)
print('done loading generator')

# Run inference on validation set, optionally sample one file to play

In [None]:
# Run the generator (and denoiser) over the first `num_files` validation files.
# `yret` is later indexed with the keys 'y', 'y_hat' and 'y_hat_den' for playback.
# NOTE(review): return_file_index presumably selects which file's audio ends up in `yret` -- confirm.
flist_path = '../data_ssynth/filelists' #--- where to look for list of validation files
num_files = 2 #--- take first N files of the whole validation set (optional)
mel_loss, mel_len, yret, times_lens = run_on_validation_set(gen, denoiser, flist_path, num_files, return_file_index = 0)

In [None]:
# Let the audio widget rescale each clip for playback; set to False to play the samples as they are.
play_normalize = True

# Play the reference clip, the generator output and its denoised version, in that order.
for caption, key in (('Original audio:', 'y'),
                     ('Generated audio:', 'y_hat'),
                     ('Generated audio (denoised):', 'y_hat_den')):
    print(caption)
    ipd.display(ipd.Audio(yret[key], rate=sampling_rate, normalize=play_normalize))

# Generate from synthetic input (2 octaves major scale)
This uses naive linear envelopes, so the result is not expected to sound natural.

In [None]:
# Generate audio from the synthetic two-octave input; returns the raw and the denoised generator outputs.
# Both results are converted with .cpu().detach().numpy() in the playback cell, so they are torch tensors.
x_hat, x_hat_den = synthetic2octaves(gen, denoiser, sampling_rate)

In [None]:
# Let the audio widget rescale each clip for playback; set to False to play the samples as they are.
play_normalize = True

print('Generated audio:')
# Take the first entry, move it to host memory and drop autograd tracking before converting to numpy.
x_hat_first = x_hat[0].cpu().detach()
x_hat_ = x_hat_first.numpy()[0]
ipd.display(ipd.Audio(x_hat_, rate=sampling_rate, normalize=play_normalize))

print('Generated audio (denoised):')
x_hat_den_first = x_hat_den[0].cpu().detach()
x_hat_den_ = x_hat_den_first.numpy()[0]
ipd.display(ipd.Audio(x_hat_den_, rate=sampling_rate, normalize=play_normalize))

# Generate from parallel audio+midi, and compare
## 1. Generate using the real GT envelopes (pitch and amplitude) of the input audio.
This step uses the midi data of notes on/off for better tracking.
### Read midi file and phrase data, choose a phrase (by index), and generate

In [None]:
import ssynth.set_python_path

from ssynth.utils.synthesis import wav_midi_to_synth
from ssynth.utils import midi
from ssynth.utils.midi import midi_phrase_from_dataframe

import librosa
import os
import pandas as pd

#--- read phrase info for 1 file
# Switch between the temporary copy of the dataset and the regular one.
use_tmp_dataset = False
if use_tmp_dataset:
    file_id = 'Funky_Nadley'
    midi_fnm = f'../data_ssynth_TMP/midi/{file_id}.mid'
    data_dir = '../data_ssynth_TMP/wavs'
    phrase_df_fnm = '../data_ssynth_TMP/phrase_df.csv'
else:
    file_id = '01_Free_Improv_dynamic_mic'
    midi_fnm = f'../data_ssynth/auto_midi/{file_id}.mid'
    data_dir = '../data_ssynth/wavs'
    phrase_df_fnm = '../data_ssynth/phrase_df.csv'

# Parse the midi file into its four tables, then run the project's verification on the note table.
print(f'reading midi file {os.path.basename(midi_fnm)}')
midi_tables = midi.read_midi_to_df(midi_fnm)
midi_df, midi_pitch, midi_aftertouch, midi_cc = midi_tables
midi.verify_midi(midi_df)

# Read the phrase table with a fresh positional index and keep only the rows
# whose file name contains the chosen file id.
all_phrases = pd.read_csv(phrase_df_fnm, index_col=0).reset_index(drop=True)
in_chosen_file = all_phrases.file_nm.str.contains(file_id)
phrase_df = all_phrases[in_chosen_file]

In [None]:
#--- pitch detection config
# (passed as-is to wav_midi_to_synth; sizes are presumably in samples -- TODO confirm)
pd_cfg = dict(win = 1024,
              ac_win = 512, # autocorrelation window
              hop = 256)

#--- choose a phrase (locally I copied a small number of files:)
#---     Phrases from "Funky_Nadley_dynamic_mic": 5,42,14,12
#---     Phrases from "01_Free_Improv_dynamic_mic": 14,17,26,30
phrase_ind = 26
# positional lookup of the phrase row in the (already file-filtered) phrase table
p = phrase_df.iloc[phrase_ind]

# phrase start time in seconds, computed from its first sample index
t0 = p.sample_start / sampling_rate
wav_fnm = f'{data_dir}/{p.phrase_id}.wav'
# load the phrase audio at the generator's sampling rate
seg, sr = librosa.load(wav_fnm, sr = sampling_rate)

# cut the note table and the control-change table down to this phrase
midi_p = midi_phrase_from_dataframe(p, midi_df, sampling_rate)
midi_p_cc = midi_phrase_from_dataframe(p, midi_cc, sampling_rate)

#--- filter 'errors'
min_velocity = 0 # some "real" notes have velocity 1, so we cannot filter errors based on velocity (so set threshold to 0)
# boolean mask over midi_p rows: note-on events at or below the velocity threshold
err_notes = (midi_p.type == 'note_on') & (midi_p.velocity <= min_velocity)
# NOTE(review): this marks the row whose index label is (label + 1), so it assumes an integer index
# in which every note-on is immediately followed by its own note-off -- confirm for overlapping notes
err_notes.loc[err_notes[err_notes].index + 1] = True #--- add the corresponding note-off
midi_p = midi_p[~err_notes]

### Generate and play

In [None]:
# Build the synthetic input signal from the phrase audio and its midi notes.
# Besides the signal `x`, the call returns the amplitude envelope `env` and the pitch track `freq`
# that later cells reuse; the fourth return value is discarded here.
#x, env, freq = phrase_to_synth(seg, sr, midi_p, t0, num_harmonics = 30, spline_smoothing = 2, verbose = False)
x, env, freq, _ = wav_midi_to_synth(seg, sr, midi_p, t0, pd_cfg, max_freq_hz = 16000, spline_smoothing = .5, verbose = False)

#--- apply hifi-gan
# optional gain applied to the synthetic signal before it enters the generator (1 = unchanged)
pre_gain = 1 #0.6
x_hat = generate_from_audio(pre_gain * x, gen)

In [None]:
# Play the clips without rescaling; a rate factor below 1 slows playback down.
play_normalize = False
play_rate_factor = 1 # .5

# First the original phrase, then the generator output for it.
for clip in (seg, x_hat):
    ipd.display(ipd.Audio(clip, rate=play_rate_factor * sr, normalize=play_normalize))

In [None]:
from ssynth.utils.synthesis import additive_synth_sawtooth

# Sawtooth synthesis from the tracked pitch and amplitude envelope.
x1_s = additive_synth_sawtooth(freq, env, sampling_rate, max_freq_hz = 16000)

# NOTE(review): this zeroes sub-1 Hz values of `freq` IN PLACE, after x1_s was computed.
# Every later use of `freq` (and any re-run of this cell) therefore sees the modified array.
freq[freq < 1] = 0
# Entries that are exactly 0 go through log2(0) inside hz_to_midi, so numpy may emit a divide-by-zero warning here.
freq_q = librosa.midi_to_hz(np.round(librosa.hz_to_midi(freq))) #--- quantize frequency to notes
x1_s_q = additive_synth_sawtooth(freq_q, env, sampling_rate, max_freq_hz = 16000)

# Run both synthetic signals through the generator, asking for numpy arrays for playback.
x1 = generate_from_audio(x1_s, gen, return_numpy_arr = True)
x1_q = generate_from_audio(x1_s_q, gen, return_numpy_arr = True)

In [None]:
# Play the clips without rescaling.
play_normalize = False

# Generator output for the tracked pitch first, then for the note-quantized pitch.
for clip in (x1, x1_q):
    ipd.display(ipd.Audio(clip, rate=play_rate_factor * sr, normalize=play_normalize))

In [None]:
%matplotlib widget
fig, ax = plt.subplots(figsize = (12,4))
ax.plot(np.arange(len(env)) / sampling_rate, env,'o')
try:
    c = env.max() / midi_p_cc.value.max() # normalize view
    ax.plot(midi_p_cc.ts_sec - t0, c * midi_p_cc.value,'.-')
except:
    pass
ax.legend(['audio amplitude envelope', 'midi cc value'])
ax.grid()

## 2. Generate from midi, using synthetic envelopes (pitch and amplitude)
### Side task: fit amplitude envelope for synthesis from midi
Choose a note and manually fit an ADSR envelope to it using cubic Bézier curves.

In [None]:
# Reload the envelopes module before importing from it, so that edits made to the module
# while the kernel is running are picked up by the import below.
import importlib
import ssynth.utils.envelopes
importlib.reload(ssynth.utils.envelopes)

from ssynth.utils.envelopes import ADSRBezier
from scipy.interpolate import CubicSpline

In [None]:
#--- compare 2 spline smoothing params (0.5 was used for training)
%matplotlib widget
x1, env1, freq1 = wav_midi_to_synth(seg, sr, midi_p, t0, pd_cfg, max_freq_hz=16000, spline_smoothing = 2, verbose = False)
x2, env2, freq2 = wav_midi_to_synth(seg, sr, midi_p, t0, pd_cfg, max_freq_hz=16000, spline_smoothing = .5, verbose = False)
k1, k2 = 52283, 67500
env0 = env1[k1:k2] # use this to manually fit an ADSR env using Bezier etc.

fig, ax = plt.subplots(figsize = (14,6))
ax.plot(env1,'.')
ax.plot(env2,':')
ax.grid()

In [None]:
# Reference ADSR parameter set, kept separately from the working configuration.
adsr_cfg_default = {
    'a_t_msec': 43,
    'd_lvl': 0.85,
    'd_t_msec': 39,
    's_lvl': 0.73,
    's_t_msec': 100,
    'r_t_msec': 160,
}

# Working ADSR parameters that the envelope generator below is built from.
adsr_cfg = {
    'a_t_msec': 40,
    'd_lvl': 0.8,
    'd_t_msec': 35,
    's_lvl': 0.7,
    's_t_msec': 100,
    'r_t_msec': 40,
}
adsr_bez = ADSRBezier(adsr_cfg, sr)
#env3 = adsr_bez.get_envelope(sustain_msec = 100)

### Plot ADSR env with the Bezier control points

In [None]:
%matplotlib widget

plt.close('all')
fig, ax = plt.subplots(figsize = (12,4))
sustain_msec = 80
env4, ctrl_p = adsr_bez.get_envelope(sustain_msec)
a,d,s,r = ctrl_p

#ax.plot(env1, 'o')
#ax.plot(env2, 'r.')
#ax.plot(env3, 'g.')
ax.plot(env4, 'c.', markersize=1)
for p in [a,d,s,r]:
    p = np.array(p)
    ax.plot(p[0:2,0], p[0:2,1], marker = 'x')
    ax.plot(p[2:4,0], p[2:4,1], marker = 'o')

ax.grid()

In [None]:
# Set up the state for building an amplitude envelope from the midi notes of the phrase.
# number of rows (midi events) in the phrase's note table
num_notes = midi_p.shape[0]
# overall scale applied to every note's gain
global_gain = .2
# end sample of the most recently placed note envelope; -1 means nothing has been placed yet
last_k = -1
# attack + decay duration in seconds; subtracted from a note's length to get its sustain time
onset_sec = (adsr_cfg['a_t_msec'] +  adsr_cfg['d_t_msec']) / 1000
# attack duration in samples; used as the maximum cross-fade ramp length
attack_samples = int(adsr_cfg['a_t_msec'] / 1000 * sr)

# NOTE(review): the duration is measured from the first midi event, while note positions are later
# computed relative to t0 -- confirm the buffer is long enough when the first note starts after t0
phrase_dur_sec = midi_p.ts_sec.iloc[-1] - midi_p.ts_sec.iloc[0] + 1 #--- add a 1 sec tail for "release" of last note
phrase_dur_samples = int(phrase_dur_sec * sr)
# output buffer: the per-note envelopes are accumulated into this array
env_midi = np.zeros(phrase_dur_samples)

In [None]:
# Build an amplitude envelope for the whole phrase by placing one ADSR envelope per midi note,
# cross-fading where consecutive notes overlap, then plot it against the envelope taken from the audio.
plt.close('all')
fig, ax = plt.subplots(figsize = (12,4))
#--- if true, use midi note 'velocity' for gain. if false, extract from audio (which can be done only if we have the original phrase...)
get_gain_from_midi = False
#--- notes whose sustain part would be shorter than this are skipped
min_sustain_msec = 15
#--- TODO
#--- - finish the x-fade impl
#--- - fix k_on, k_off below

#--- start from a clean buffer: the loop accumulates with "+=", so re-running this cell on the
#--- previous buffer would add every note envelope a second time
env_midi = np.zeros(phrase_dur_samples)
last_k = -1

#--- rows are consumed as consecutive (note-on, note-off) pairs; stopping at num_notes - 1 keeps
#--- a trailing unpaired row from raising an IndexError on iloc[k + 1]
for k in range(0, num_notes - 1, 2):
    row_on = midi_p.iloc[k]
    row_off = midi_p.iloc[k + 1]

    t_on, t_off = row_on.ts_sec, row_off.ts_sec
    #--- note boundaries as sample offsets from the phrase start
    k_on, k_off = int((t_on - t0) * sr), int((t_off - t0) * sr)

    if get_gain_from_midi:
        gain = row_on.velocity / 127
    else:
        #--- the "RMS" but use 0.95 quantile instead of mean
        gain = 4 * np.sqrt(np.quantile(seg[k_on:k_off] ** 2, 0.95))

    gain *= global_gain

    #--- what remains of the note after attack + decay
    sustain_msec = (t_off - t_on - onset_sec) * 1000 #(k_off - k_on) * 1000 / sr
    if sustain_msec < min_sustain_msec:
        print(f'k={k}, t={t_on - t0:.2f}, sustain={sustain_msec:.1f} < {min_sustain_msec} msec, skipping note, TODO')
        continue

    env_k, _ = adsr_bez.get_envelope(sustain_msec, gain)
    #--- cross-fade with last env
    if k_on < last_k:
        xfade_len = last_k - k_on
        #--- ramp up over at most the attack time ...
        fade_in = np.linspace(0, 1, min(attack_samples, xfade_len))
        n_fade_in = fade_in.shape[0]
        #--- check if we need to add a constant env
        if n_fade_in < xfade_len:
            fade_in = np.r_[fade_in, np.ones(xfade_len - n_fade_in)]
        fade_out = 1 - fade_in
        #--- NOTE(review): both slices below assume the overlap fits inside env_k and env_midi
        env_k[:xfade_len] *= fade_in
        env_midi[k_on : last_k] *= fade_out

    env_midi[k_on : k_on + len(env_k)] += env_k
    #ax.plot(np.arange(k_on, k_on + len(env_k)) / sr, env_k, '.')
    last_k = k_on + len(env_k)
    #last_env = env

#--- black: envelope taken from the audio, red: envelope built from midi
ax.plot(np.arange(len(env)) / sr, env, 'k')
ax.plot(np.arange(len(env_midi)) / sr, env_midi,'r')
ax.grid()

In [None]:
from ssynth.utils.synthesis import additive_synth_sawtooth
# Synthesize from the midi-built envelope (cut to the length of the audio envelope) and the tracked
# pitch, then run the result through the generator.
env_midi = env_midi[:len(env)]
x_midi = additive_synth_sawtooth(freq, env_midi, sampling_rate, max_freq_hz = 16000)
x_midi_hat = generate_from_audio(x_midi, gen, return_numpy_arr = True)

# denoised version
denoising_strength = 2*0.05
# keep the generator output in its non-numpy form so it can be passed to the denoiser
x_midi_hat_TMP = generate_from_audio(x_midi, gen, return_numpy_arr = False)
# Tensor.numpy() only works on CPU tensors that do not require grad, and the model may be running
# on CUDA; detach and move to host first, as the other tensor-to-numpy conversions in this notebook do.
x_midi_hat_den = denoiser(x_midi_hat_TMP.squeeze(1), denoising_strength).detach().cpu().numpy()[0]

In [None]:
# Play the clips without rescaling.
play_normalize = False

# In order: original phrase, generator output from the audio-derived envelopes,
# generator output from the midi-built envelope, and the denoised version of the latter.
for clip in (seg, x_hat, x_midi_hat, x_midi_hat_den):
    ipd.display(ipd.Audio(clip, rate=sr, normalize=play_normalize))