In [None]:
import IPython.display as ipd
#%matplotlib notebook

from sax_synth_inference import run_on_validation_set, g_mel, load_generator_model, synthetic2octaves, generate_from_audio

# Init the generator and Mel spec class

In [None]:
# Generator checkpoint to load, and the mel-spectrogram implementation name handed to g_mel.
gen_path = '../results/2023_05_28_hifigan_ssynth44khz_synthesized_input_16k_spl0.5_nonorm/hifigan_gen_checkpoint_3000.pt'
MEL_IMPL = 'hifigan'

# load_generator_model returns the generator, a denoiser and the training config;
# the sampling rate used throughout the notebook is read from that config.
gen, denoiser, train_cfg = load_generator_model(gen_path)
sampling_rate = train_cfg['sampling_rate']

# Initialise the shared mel-spectrogram object from the same training config.
g_mel.init(train_cfg, MEL_IMPL)
print('done loading generator')

# Run inference on validation set, optionally sample one file to play

In [None]:
# Where to look for the list of validation files.
flist_path = '../data_ssynth/filelists'
# Take only the first N files of the whole validation set (optional).
num_files = 50

# yret is indexed below with the keys 'y', 'y_hat' and 'y_hat_den' for playback.
# NOTE(review): return_file_index presumably selects which file's audio is returned - confirm.
mel_loss, mel_len, yret = run_on_validation_set(
    gen,
    denoiser,
    flist_path,
    num_files,
    return_file_index=0,
)

In [None]:
play_normalize = True  # False

# (caption, key into yret) for each signal, in playback order.
# A 'Synthesized input:' entry (variable y_) is currently disabled.
_validation_clips = (
    ('Original audio:', 'y'),
    ('Generated audio:', 'y_hat'),
    ('Generated audio (denoised):', 'y_hat_den'),
)

for _caption, _key in _validation_clips:
    print(_caption)
    ipd.display(ipd.Audio(yret[_key], rate=sampling_rate, normalize=play_normalize))

# Generate from synthetic input (2 octaves major scale)
This uses naive linear envelopes, so it is not expected to sound natural.

In [None]:
# Generate audio from the synthetic 2-octave input described in the heading above.
# The two results are played in the next cell as 'Generated audio' and
# 'Generated audio (denoised)' respectively.
x_hat, x_hat_den = synthetic2octaves(gen, denoiser, sampling_rate)

In [None]:
play_normalize = True  # False


def _first_waveform(batch):
    """Take item 0 of `batch`, move it to CPU, detach it, convert to numpy and return row 0."""
    return batch[0].cpu().detach().numpy()[0]


print('Generated audio:')
x_hat_ = _first_waveform(x_hat)
ipd.display(ipd.Audio(x_hat_, rate=sampling_rate, normalize=play_normalize))

print('Generated audio (denoised):')
x_hat_den_ = _first_waveform(x_hat_den)
ipd.display(ipd.Audio(x_hat_den_, rate=sampling_rate, normalize=play_normalize))

# Generate from parallel audio+midi, and compare
## 1. Generate using the real GT envelopes (pitch and amplitude) of the input audio.
This step uses the MIDI note on/off data for better tracking.
### Read midi file and phrase data, choose a phrase (by index), and generate

In [None]:
import ssynth.set_python_path

from ssynth.utils.synthesis import wav_midi_to_synth
from ssynth.utils import midi
from ssynth.utils.midi import midi_phrase_from_dataframe

import librosa
import os
import pandas as pd

#--- read phrase info for 1 file
file_id = 'Funky_Nadley'
midi_fnm = f'../data_ssynth_TMP/midi/{file_id}.mid'
print(f'reading midi file {os.path.basename(midi_fnm)}')
# read_midi_to_df returns four objects; midi_df and midi_cc are used further down
# to cut out the events belonging to one phrase.
midi_df, midi_pitch, midi_aftertouch, midi_cc = midi.read_midi_to_df(midi_fnm)
midi.verify_midi(midi_df)

data_dir = '../data_ssynth_TMP/wavs'
phrase_df_fnm = '../data_ssynth_TMP/phrase_df.csv'
phrase_df = pd.read_csv(phrase_df_fnm, index_col = 0).reset_index(drop = True)

#--- keep only the phrases of the chosen file.
# regex = False: file_id is a literal name, not a pattern (ids containing '.', '(' etc. must not
# be interpreted as regular expressions); na = False: rows without a file name are dropped
# instead of producing a non-boolean mask.
phrase_df = phrase_df[phrase_df.file_nm.str.contains(file_id, regex = False, na = False)]
assert not phrase_df.empty, f'no phrases found for file id "{file_id}" in {phrase_df_fnm}'

In [None]:
#--- pitch detection config
pd_cfg = dict(win = 1024,
              ac_win = 512, # autocorrelation window
              hop = 256)

#--- choose a phrase (positional index into the phrases of the selected file)
phrase_ind = 42 #14 #12 #5
p = phrase_df.iloc[phrase_ind]

# phrase start: start sample divided by the sampling rate
t0 = p.sample_start / sampling_rate
wav_fnm = f'{data_dir}/{p.phrase_id}.wav'
seg, sr = librosa.load(wav_fnm, sr = sampling_rate)

#--- midi events (notes and control changes) belonging to this phrase
midi_p = midi_phrase_from_dataframe(p, midi_df, sampling_rate)
midi_p_cc = midi_phrase_from_dataframe(p, midi_cc, sampling_rate)

#--- filter 'errors': note-on events with a very low velocity
min_velocity = 3
err_notes = (midi_p.type == 'note_on') & (midi_p.velocity <= min_velocity)
#--- add the corresponding note-off (assumed to be the row right after the note-on).
# Only labels that actually exist in the phrase are marked, so a flagged note-on in the
# last row (or a gap in the index) no longer raises a KeyError.
note_off_idx = (err_notes[err_notes].index + 1).intersection(err_notes.index)
err_notes.loc[note_off_idx] = True
midi_p = midi_p[~err_notes]

### Generate and play

In [None]:
# Earlier variant, kept for reference:
#x, env, freq = phrase_to_synth(seg, sr, midi_p, t0, num_harmonics = 30, spline_smoothing = 2, verbose = False)
x, env, freq = wav_midi_to_synth(
    seg, sr, midi_p, t0, pd_cfg,
    max_freq_hz=16000,
    spline_smoothing=.5,
    verbose=False,
)

#--- apply hifi-gan to the (optionally scaled) synthesized signal
pre_gain = 1  # 0.6
x_hat = generate_from_audio(pre_gain * x, gen)

#--- play the input segment first, then the generated audio
play_normalize = True
for _clip in (seg, x_hat):
    ipd.display(ipd.Audio(_clip, rate=sr, normalize=play_normalize))

## 2. Generate from midi, using synthetic envelopes (pitch and amplitude)
### Side task: fit amplitude envelope for synthesis from midi
Choose a note and (manually) fit an ADSR envelope to it using cubic Bezier curves.

In [210]:
import importlib
import ssynth.utils.envelopes
importlib.reload(ssynth.utils.envelopes)
from ssynth.utils.envelopes import ADSRBezier
from scipy.interpolate import CubicSpline
import numpy as np

In [None]:
#--- compare 2 spline smoothing params (0.5 was used for training)
_shared_synth_kwargs = dict(max_freq_hz=16000, verbose=False)
x1, env1, freq1 = wav_midi_to_synth(seg, sr, midi_p, t0, pd_cfg, spline_smoothing=2, **_shared_synth_kwargs)
x2, env2, freq2 = wav_midi_to_synth(seg, sr, midi_p, t0, pd_cfg, spline_smoothing=.1, **_shared_synth_kwargs)

#--- sample range [k1, k2) of env1 that is cut out below
k1 = 52283
k2 = 67500
env0 = env1[k1:k2]  # use this to manually fit an ADSR env using Bezier etc.

In [None]:
#%matplotlib widget
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
# `env` is bound to the envelope returned by wav_midi_to_synth in an earlier cell, so it
# cannot be used as a module alias here; import the curve helper explicitly instead.
# NOTE(review): assumes bezier_curve is exported by ssynth.utils.envelopes (the module that
# provides ADSRBezier) - confirm.
from ssynth.utils.envelopes import bezier_curve
assert (phrase_ind == 42 and file_id == 'Funky_Nadley'), 'the following params were manually chosen based on file "Funky_Nadley" and phrase 42, make sure to use the same file and phrase'

fig, ax = plt.subplots(figsize = (14,6))

#--- manually chosen breakpoints (indices into env0) delimiting attack / decay / sustain / release
adsr = np.array([0, 1900, 3600, 8100, len(env0) - 1])
n0, n1, n2, n3, n4 = adsr
e0, e1, e2, e3, e4 = [env0[k] for k in adsr]
ax.plot(env0, '.-', label = 'original envelope')

#--- shade the 4 segments up to the level reached at the end of the attack
a = e1
cols = ['b', 'r', 'g', 'c']
for k in range(4):
    poly = Polygon([[adsr[k], 0], [adsr[k+1], 0], [adsr[k+1], a], [adsr[k],a]], facecolor=cols[k], alpha = 0.15, edgecolor='0.2', closed=True)
    ax.add_patch(poly)

#--- segment lengths
nA, nD, nS, nR = np.diff(adsr)

# Each segment is one cubic Bezier curve given by 4 control points [x, y]:
# the first and last are the segment end points, the 2 middle ones were tuned by hand.

#=== Attack
Ax, Ay = bezier_curve([[n0, e0],  [.8 * n1, e0],      [.5 * n1, e1],          [n1, e1]], nA)

#=== Decay
de1 = (e1 - e2)
Dx, Dy = bezier_curve([[n1, e1], [n1 + .5 * nD, e1], [n2 - .5 * nD, e2 + .4 * de1], [n2, e2]], nD)

#--- Sustain (the first inner control point is placed relative to the decay segment: nD, de1)
de2 = (e2 - e3)
Sx, Sy = bezier_curve([[n2, e2], [n2 + .5 * nD, e2 - .4 * de1], [n3 - .4 * nS, e3 + .1 * de2], [n3, e3]], nS)

#--- Release (the first inner control point is placed relative to the sustain segment: nS, de2)
de3 = (e3 - e4)
Rx, Ry = bezier_curve([[n3, e3], [n3 + .4 * nS, e3 - .1 * de2], [n4 - 1.2 * nR, e4 + 0 * de3], [n4, e4]], nR)

ax.plot(Ax, Ay, 'r', label = 'piece-wise cubic Bezier')
ax.plot(Dx, Dy, 'r')
ax.plot(Sx, Sy, 'r')
ax.plot(Rx, Ry, 'r')

#--- compare with cubic spline through the same breakpoints.
# The knot values are the envelope samples AT the breakpoints (e0..e4); indexing with k-1
# would wrap around to env0[-1] for the first breakpoint (k = 0) and shift all other knots.
spl = CubicSpline(adsr, [e0, e1, e2, e3, e4], bc_type='clamped')
env_spl = spl(np.arange(n4))
ax.plot(env_spl, ':', label = 'cubic spline')

# Labels are attached per line so the legend cannot get out of sync with the plot order.
ax.legend()
ax.grid()
fig.savefig('bezier_adsr.png')
plt.close(fig)

In [211]:
# ADSR parameters passed to ADSRBezier.
# An earlier variant used r_t_msec = 160; it was immediately overwritten by the
# configuration below, so only the effective one is kept.
adsr_cfg = dict(a_t_msec = 43, d_lvl = 0.85, d_t_msec = 39, s_lvl = 0.73, s_t_msec = 100, r_t_msec = 80)
adsr_bez = ADSRBezier(adsr_cfg, sr)

# Two envelopes that differ only in the requested sustain duration.
env3 = adsr_bez.get_envelope(sustain_msec = 100)
env4 = adsr_bez.get_envelope(sustain_msec = 150)

In [None]:
# Join the attack / decay / sustain / release Bezier segments into one curve.
x = np.concatenate([Ax, Dx, Sx, Rx])
y = np.concatenate([Ay, Dy, Sy, Ry])

# Order the points by their x coordinate.
ind = np.argsort(x)
x1, y1 = x[ind], y[ind]

# Keep a single point per distinct x value (np.unique reports the first occurrence).
x2, ind = np.unique(x1, return_index = True)
y2 = y1[ind]

In [None]:
# Display the manually chosen ADSR breakpoints (indices into env0); this is the same
# expression that defines `adsr` in the Bezier fitting cell.
np.array([0, 1900, 3600, 8100, len(env0) - 1])

In [212]:
# Recompute the sustain_msec = 150 envelope with verbose=True; the recorded output below
# lists the control points of the attack, decay, sustain and release segments.
env4 = adsr_bez.get_envelope(sustain_msec = 150, verbose=True)

attack:  [[0, 0.0], [1516.8000000000002, 0.0], [948.0, 1.0], [1896, 1.0]]
decay:   [[1896, 1.0], [2756.0, 1.0], [2756.0, 0.91], [3616, 0.85]]
sustain: [[3616, 0.85], [4476.0, 0.7899999999999999], [7585.0, 0.742], [10231, 0.73]]
release: [[10231, 0.73], [12877.0, 0.718], [9525.400000000001, 0.0], [13759, 0.0]]


In [None]:
%matplotlib widget
fig, ax = plt.subplots(figsize = (12,4))

# Other envelopes that can be overlaid for comparison:
#ax.plot(env1, 'o')
#ax.plot(env2, 'r.')
#ax.plot(env3, 'g.')
ax.plot(env4, 'c.', label = 'ADSRBezier envelope (sustain_msec = 150)')

# Label the figure so it can be read on its own.
ax.set_xlabel('sample index')
ax.set_ylabel('envelope value')
ax.legend()
ax.grid()