In [None]:
import sys
import pickle
import torch
import numpy as np
import glob
import librosa

#--- import HiFiGAN modules
sys.path.append('../')
import models
import common.layers as layers 
from common.utils import load_wav
from hifigan.data_function import mel_spectrogram
from hifigan.models import Denoiser

import IPython.display as ipd
#--- inject CSS that sets the `.container` element to 85% width
#    (presumably to widen the classic-notebook page layout -- confirm for other front-ends)
ipd.display(ipd.HTML("<style>.container { width:85% !important; }</style>"))

# Load config and generator model from checkpoint
Also create a denoiser instance.

In [None]:
#--- get config from checkpoint, so no need to load args from disk
#    (disk-based alternative, kept for reference)
#args = pickle.load(open('../TMP_args.p', 'rb'))
#gen_config = models.get_model_config('HiFi-GAN', args)

DEVICE = 'cuda' # 'cpu' or 'cuda'
assert DEVICE == 'cuda', 'ERROR: cpu not supported yet (mel code assumes torch tensors)'

m_path = '../results/2023_01_20_hifigan_ssynth44khz_synthesized_input/hifigan_gen_checkpoint_10000.pt'

#--- map_location makes the load independent of the device (index) the checkpoint
#    was saved from; all stored tensors are placed on DEVICE
checkpoint = torch.load(m_path, map_location = DEVICE)

#--- the checkpoint carries both the training setup and the generator config
train_config = checkpoint['train_setup']
sampling_rate = train_config['sampling_rate']
gen_config = checkpoint['config']
#--- copy the mel-band count from the training setup into the generator config
#    (presumably the key name the model factory expects -- verify against models.get_model)
gen_config['num_mel_filters'] = train_config['num_mels']

#--- build the generator on DEVICE and restore its trained weights
gen = models.get_model('HiFi-GAN', gen_config, DEVICE, forward_is_infer = True)
gen.load_state_dict(checkpoint['generator'])
gen.remove_weight_norm()

#--- denoiser is built from the generator; denoising_strength is passed when it is called
denoising_strength = 0.05
denoiser = Denoiser(gen, win_length = train_config['win_length'], num_mel_filters = train_config['num_mels']).to(DEVICE)

# Mel spectrum class
Make it identical to the code used in training, so that we get exactly the same features. <br/>
NOTE: this is the code used to compute the mel spectrum of the target audio; for the source there is another implementation. <br/>
TODO: verify and fix if needed

In [None]:
class MelSpec:
    ''' Thin wrapper around layers.TacotronSTFT, configured from a training-setup dict.

        The STFT object is built once in the constructor and reused by get_mel().
    '''

    #--- config keys, listed in the positional order TacotronSTFT is called with
    _STFT_CFG_KEYS = ('filter_length', 'hop_length', 'win_length', 'num_mels',
                      'sampling_rate', 'mel_fmin', 'mel_fmax')

    def __init__(self, cfg):
        ''' cfg: mapping that provides every key in _STFT_CFG_KEYS
            (a missing key raises KeyError)
        '''
        stft_args = [cfg[key] for key in self._STFT_CFG_KEYS]
        self.stft = layers.TacotronSTFT(*stft_args)

    def get_mel(self, audio):
        ''' Return the mel spectrogram of `audio` as computed by the wrapped STFT object.

            The audio is passed through unchanged (no amplitude normalization and
            no reshaping is applied here).
        '''
        return self.stft.mel_spectrogram(audio)

#--- mel extractor configured from the training setup stored in the checkpoint
mel_spec = MelSpec(train_config)

## load wav (from validation set) and get mel spectrum
Note: synthesis method should fit the one used to train the model (i.e., "10 harmonics" or "16 khz" etc.)

In [None]:
#--- read the validation file list (one wav path per line); `with` closes the file handle
with open('../data_ssynth/filelists/ssynth_audio_val.txt', 'r') as f_list:
    flist_validation = [fnm.rstrip() for fnm in f_list]

#wav_fnm = '../data_ssynth/wavs_synth_10h/01_Free_Improv_dynamic_mic_phrase000.wav'
file_index = 1
#--- point the listed path at the 'wavs_synth_10h/' folder instead of 'wavs/'
wav_fnm = flist_validation[file_index].replace('wavs/', 'wavs_synth_10h/')
y, sample_rate, sample_type = load_wav(f'../data_ssynth/{wav_fnm}')

#--- full-scale value used to normalize the integer samples
if sample_type == 'PCM_24':
    max_wav_value = 2**31 # data type in this case is int32
elif sample_type == 'PCM_16':
    max_wav_value = 2**15
else:
    #--- fail here with a clear message instead of a NameError on max_wav_value below
    raise ValueError(f'unsupported sample type: {sample_type}')

#--- convert to float in [-1., 1.]
y = y.astype(np.float32) / np.float32(max_wav_value)

if DEVICE == 'cuda':
    #--- y is already float32; wrap it as a tensor and add a leading axis
    #    (the tensor itself stays on the CPU at this point)
    y = torch.from_numpy(y).unsqueeze(0)
else:
    y = y[np.newaxis, :]

In [None]:
#--- inference only: disable autograd so no graph is built for the forward passes
with torch.no_grad():
    mel = mel_spec.get_mel(y)
    #--- move the features to the device the generator lives on (DEVICE, not a hard-coded .cuda())
    y_hat = gen(mel.to(DEVICE))
    y_hat_den = denoiser(y_hat.squeeze(1), denoising_strength)

#--- take the first item, bring it to the CPU and index once more to get the waveform as numpy
y_hat = y_hat[0].cpu().detach().numpy()[0]
y_hat_den = y_hat_den[0].cpu().detach().numpy()[0]

## play result

In [None]:
#--- first row of the input tensor as a numpy waveform
y_ = y.numpy()[0]

#--- (caption, waveform) pairs, rendered in this order
playback_items = [
    ('Original synthesized input:', y_),
    ('Generated audio:', y_hat),
    ('Generated audio (denoised):', y_hat_den),
]
for caption, waveform in playback_items:
    print(caption)
    #--- normalize = False: play the samples as they are, without rescaling
    ipd.display(ipd.Audio(waveform, rate = sampling_rate, normalize = False))

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
import librosa.display

## Now try with synthetic input
### I define naive ADSR envelopes with straight lines, which is probably not the best option

In [None]:
def additive_synth_sawtooth(freq, env, sampling_rate, additive_synth_k = 10):
    ''' Synthesize a band-limited sawtooth wave by additive synthesis.

        freq:             instantaneous frequency, one value per output sample
                          (cycles per second when sampling_rate is in samples per second)
        env:              amplitude envelope multiplied onto the summed harmonics
        sampling_rate:    sampling rate of freq / env
        additive_synth_k: highest harmonic number included (the fundamental is always present)

        Returns the synthesized waveform as a numpy array.
    '''
    sample_period = 1 / sampling_rate
    #--- the running phase is the integral (cumulative sum) of the instantaneous frequency
    phase = np.cumsum(2 * np.pi * freq * sample_period)

    #--- start from the fundamental, then add harmonics 2..k weighted by 1/k with alternating sign
    wave = np.sin(phase)
    sign = -1
    for harmonic in range(2, additive_synth_k + 1):
        wave += sign * np.sin(harmonic * phase) / harmonic
        sign = -sign

    #--- shape the amplitude
    wave *= env

    return wave

## 2 octaves major scale in the range of the alto sax

In [None]:
#range_notes = ['C3', 'A#5'] # alto sax range is ['Db3', 'A5'], take half-step below/above
#alto_sax_range = librosa.note_to_hz(range_notes)

#--- envelope parameters
note_len_samples = 20000 #20000
onset_samples = 3000
amp = 0.04
amp_sustain = 0.5 # decay envelope to this relative level at the end of the note
freq_glide_level = 0.8 #--- during onset, glide into target frequency starting at this pitch (relative)

#--- single note envelope: linear rise to `amp` over the onset, then linear decay to amp * amp_sustain
attack = np.linspace(0, amp, onset_samples)
decay = np.linspace(amp, amp * amp_sustain, note_len_samples - onset_samples)
env_single = np.concatenate([attack, decay])

#--- relative pitch contour of one note: glide from freq_glide_level up to 1 during the onset,
#    then stay at 1 (it does not depend on the note, so it is built once)
freq_env = np.ones(note_len_samples)
freq_env[:onset_samples] *= np.linspace(freq_glide_level, 1, onset_samples)

#--- major scale in the alto sax range
scale_notes = ['D3', 'E3', 'F#3', 'G3', 'A3', 'B3', 'C#4', 'D4', 'E4', 'F#4', 'G4', 'A4', 'B4', 'C#5', 'D5', 'E5', 'F#5', 'G5', 'A5']

#--- one note-length of zeros goes before the first note and after the last one
silence = np.zeros(note_len_samples)
freq_segments = [silence]
env_segments = [silence]
for note in scale_notes:
    f0 = librosa.note_to_hz(note)
    freq_segments.append(f0 * freq_env)
    env_segments.append(env_single)
freq_segments.append(silence)
env_segments.append(silence)

#--- join all segments into the per-sample frequency and amplitude tracks
freq = np.concatenate(freq_segments)
env = np.concatenate(env_segments)

In [None]:
#--- synthesize the test signal and wrap it as a float32 tensor with a leading axis
x = additive_synth_sawtooth(freq, env, sampling_rate)
x = torch.from_numpy(x.astype(np.float32)).unsqueeze(0)

#--- inference only: disable autograd so no graph is built for the forward passes
with torch.no_grad():
    mel = mel_spec.get_mel(x)
    #--- move the features to the device the generator lives on (DEVICE, not a hard-coded .cuda())
    x_hat = gen(mel.to(DEVICE))
    x_hat_den = denoiser(x_hat.squeeze(1), denoising_strength)

#--- keep the input as a plain numpy waveform for playback
x = x.numpy()[0]

In [None]:
print('Original synthesized input:')
ipd.display(ipd.Audio(x, rate = sampling_rate, normalize = False))

print('Generated audio:')
#--- convert only while still a tensor, so re-running this cell does not fail
if torch.is_tensor(x_hat):
    x_hat = x_hat[0].cpu().detach().numpy()[0]
ipd.display(ipd.Audio(x_hat, rate = sampling_rate, normalize = False))

print('Generated audio (denoised):')
if torch.is_tensor(x_hat_den):
    x_hat_den = x_hat_den[0].cpu().detach().numpy()[0]
#--- play the denoised signal (x_hat was played here before, duplicating the clip above)
ipd.display(ipd.Audio(x_hat_den, rate = sampling_rate, normalize = False))

## Compare MEL spectra of the 2 implementations that are used in the HiFiGAN code
(they are not the same :-( )

In [None]:
#--- this is the implementation used to calculate mel spec of input during training (and for inference)
from functools import partial
mel_fmax = train_config['mel_fmax'] #--- in train.py, there's option to use different fmax for computing the loss.

#--- keyword arguments for mel_spectrogram, all read from the training setup
mel2_kwargs = {
    'n_fft': train_config['filter_length'],
    'num_mels': train_config['num_mels'],
    'sampling_rate': train_config['sampling_rate'],
    'hop_size': train_config['hop_length'],
    'win_size': train_config['win_length'],
    'fmin': train_config['mel_fmin'],
    'fmax': mel_fmax,
}
mel_spec2 = partial(mel_spectrogram, **mel2_kwargs)

#--- same input through both implementations, for comparison
mel1 = mel_spec.get_mel(y)
mel2 = mel_spec2(y)

In [None]:
fig, ax = plt.subplots(figsize = (8,4))
k = 25
#--- overlay slice [0, :, k] of both mel outputs (presumably one time frame across mel bands -- confirm axis order)
for mel_variant, marker in ((mel1, 'bo'), (mel2, 'r.')):
    ax.plot(mel_variant[0, :, k], marker)
ax.legend(['mel-1', 'mel-2'])

## measure timing of mel + inference