In [1]:
%matplotlib inline

from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.visual import plot_spectrogram
from TTS.config import load_config

import IPython.display as ipd
import glob

In [2]:
# Locations of the model config and of the dataset to tune against.
config_path = "/home/arya/Documents/coquitts/TTS/tests/inputs/test_glow_tts.json"
data_path = "/home/arya/Documents/coquitts/TTS/datasets/telugu_openslr/"

# Substring identifying the utterance used for tuning; change it to pick another file.
SAMPLE_FILE_KEY = "tef_01033_000071"

file_paths = glob.glob(data_path + "/**/*.wav", recursive=True)
CONFIG = load_config(config_path)

# Alternative: select by position instead of by name.
# print("File list, by index:")
# dict(enumerate(file_paths))
# SAMPLE_FILE_PATH = file_paths[0]

# Collect every path containing the key. Failing loudly here prevents a
# NameError further down on a fresh kernel, and prevents silently reusing a
# stale SAMPLE_FILE_PATH left over from an earlier run of this cell.
matching_paths = [file_path for file_path in file_paths if SAMPLE_FILE_KEY in file_path]
if not matching_paths:
    raise FileNotFoundError(
        "No .wav file containing {!r} found under {!r} ({} files scanned).".format(
            SAMPLE_FILE_KEY, data_path, len(file_paths)
        )
    )

# When several files match, the last one in glob order is used.
SAMPLE_FILE_PATH = matching_paths[-1]

print(SAMPLE_FILE_PATH)

/home/arya/Documents/coquitts/TTS/datasets/telugu_openslr/te_in_female_old/tef_01033_00007107192.wav


### Setup Audio Processor
Play with the AP parameters until you find a good fit with the synthesized speech below.

The default values are loaded from your config.json file, so you only need to
uncomment and modify values below that you'd like to tune.

In [3]:
# Audio parameters to experiment with. Every entry is commented out, so the
# dict is empty as written; uncomment an entry to override the loaded config.
tune_params={
#  'audio_processor': 'audio',
#  'num_mels': 80,          # In general, you don't need to change this.
#  'fft_size': 1024,        # In general, you don't need to change this.
#  'sample_rate': 22050,    # This must match the sample rate of the dataset.
#  'hop_length': 256,       # In general, you don't need to change this.
#  'win_length': 1024,      # In general, you don't need to change this.
#  'preemphasis': 0.98,     # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.
#  'min_level_db': -100,
#  'ref_level_db': 0,       # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.
#  'power': 1.5,            # Change this value and listen to the synthesized voice. 1.2 - 1.5 are reasonable values.
#  'griffin_lim_iters': 60, # Quality does not improve for values > 60
#  'mel_fmin': 0.0,         # Adjust this and check mel-spectrogram-based voice synthesis below.
#  'mel_fmax': 8000.0,      # Adjust this and check mel-spectrogram-based voice synthesis below.
#  'do_trim_silence': True  # If your dataset has some silence at the beginning or end, this trims it. Check AP.load_wav() below to see whether it makes any difference for the loaded audio file.
}

# These options have to be forced in order to avoid errors about the
# pre-calculated stats (presumably the file behind 'stats_path') not matching
# the options being tuned.
reset={
 'signal_norm': True,  # Enable this if you want to test normalization parameters.
 'stats_path': None,
 'symmetric_norm': False,
 'max_norm': 1,
 'clip_norm': True,
 'sample_rate' : 48000,  # NOTE(review): should match the dataset's sample rate -- confirm for this corpus.
 'resample' :False
}

# Override select parts of loaded config with parameters above.
# `reset` is applied first and `tune_params` second, so (assuming dict-style
# update semantics) an entry uncommented in `tune_params` wins over `reset`.
tuned_config = CONFIG.audio.copy()
tuned_config.update(reset)
tuned_config.update(tune_params)

# Build the processor used by every cell below; the trailing ';' suppresses the cell's repr output.
AP = AudioProcessor(**tuned_config);

 > Setting up Audio Processor...
 | > sample_rate:48000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.1
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:False
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


### Check audio loading 

In [4]:
# Load the selected sample through the AudioProcessor and play it back at AP's sample rate.
wav = AP.load_wav(SAMPLE_FILE_PATH)
print(SAMPLE_FILE_PATH)
ipd.Audio(wav, rate=AP.sample_rate)

/home/arya/Documents/coquitts/TTS/datasets/telugu_openslr/te_in_female_old/tef_01033_00007107192.wav


### Generate Mel-Spectrogram and Re-synthesis with GL

In [5]:
# Override `power` on the existing AP for the re-synthesis cells below
# (the setup log above reports power:1.1). This mutates AP in place, so every
# later cell sees the new value.
AP.power = 1.5

In [7]:
# Mel-spectrogram of the loaded sample, plus a quick look at its value range.
mel = AP.melspectrogram(wav)
for stat_name, stat_value in (("Max:", mel.max()), ("Min:", mel.min()), ("Mean:", mel.mean())):
    print(stat_name, stat_value)
plot_spectrogram(mel.T, AP, output_fig=True)

# Invert the mel-spectrogram back to a waveform and play it for comparison.
wav_gen = AP.inv_melspectrogram(mel)
ipd.Audio(data=wav_gen, rate=AP.sample_rate)

Max: 0.9985666
Min: 0.95
Mean: 0.9684934


### Generate Linear-Spectrogram and Re-synthesis with GL

In [10]:
# Linear spectrogram of the loaded sample, plus a quick look at its value range.
spec = AP.spectrogram(wav)
for stat_name, stat_value in (("Max:", spec.max()), ("Min:", spec.min()), ("Mean:", spec.mean())):
    print(stat_name, stat_value)
plot_spectrogram(spec.T, AP, output_fig=True)

# Invert the linear spectrogram back to a waveform and play it for comparison.
wav_gen = AP.inv_spectrogram(spec)
ipd.Audio(data=wav_gen, rate=AP.sample_rate)

Max: 1.0
Min: 0.95
Mean: 0.974845


### Compare values for a certain parameter

Optimize your parameters by comparing different values for one parameter at a time.

In [11]:
from librosa import display
from matplotlib import pylab as plt
import IPython
plt.rcParams['figure.figsize'] = (20.0, 16.0)

def compare_values(attribute, values):
    """
    Compare re-synthesis results for several values of one AP attribute.

    For every value, the attribute is set on the global ``AP``, the sample file
    (``SAMPLE_FILE_PATH``) is reloaded, and its spectrogram is computed and
    inverted back to audio. Each value gets one figure row: the denormalized
    spectrogram on the left, the loaded and re-synthesized waveforms overlaid
    on the right. Afterwards the ground-truth audio and one audio player per
    value are displayed.

    The attribute's original value is restored on exit (also on error), so the
    comparison does not leak state into later cells.

    Args:
        attribute (str): the name of the attribute you like to test.
        values (list): list of values to compare.
    """
    file = SAMPLE_FILE_PATH
    values = list(values)

    # Remember the current value so it can be put back afterwards. A sentinel
    # distinguishes "attribute did not exist" from "attribute was None".
    missing = object()
    original_value = getattr(AP, attribute, missing)

    # `waveplot` is not available in newer librosa releases; fall back to its
    # replacement `waveshow` when it is missing.
    plot_wave = getattr(display, "waveplot", None) or display.waveshow

    wavs = []
    try:
        for idx, val in enumerate(values):
            # setattr instead of exec on a formatted string: works for any
            # value type (including strings) and never executes arbitrary text.
            setattr(AP, attribute, val)
            wav = AP.load_wav(file)
            spec = AP.spectrogram(wav)
            spec_norm = AP.denormalize(spec.T)
            plt.subplot(len(values), 2, 2 * idx + 1)
            plt.imshow(spec_norm.T, aspect="auto", origin="lower")
            wav_gen = AP.inv_spectrogram(spec)
            wavs.append(wav_gen)
            plt.subplot(len(values), 2, 2 * idx + 2)
            # Pass the real sample rate so the time axis is not scaled by
            # librosa's default rate.
            plot_wave(wav, sr=AP.sample_rate, alpha=0.5)
            plot_wave(wav_gen, sr=AP.sample_rate, alpha=0.25)
            plt.title("{}={}".format(attribute, val))

        # One layout pass once all subplots exist.
        if wavs:
            plt.tight_layout()

        wav = AP.load_wav(file)
        print(" > Ground-truth")
        IPython.display.display(IPython.display.Audio(wav, rate=AP.sample_rate))

        for val, wav_gen in zip(values, wavs):
            print(" > {} = {}".format(attribute, val))
            IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))
    finally:
        # Undo the mutation of the shared AP object.
        if original_value is missing:
            if hasattr(AP, attribute):
                delattr(AP, attribute)
        else:
            setattr(AP, attribute, original_value)

In [12]:
# Sweep `preemphasis` from 0 up to 0.99 and compare spectrograms, waveforms and audio.
compare_values("preemphasis", [0, 0.5, 0.97, 0.98, 0.99])

 > Ground-truth


 > preemphasis = 0


 > preemphasis = 0.5


 > preemphasis = 0.97


 > preemphasis = 0.98


 > preemphasis = 0.99


In [13]:
# Sweep `ref_level_db` and compare spectrograms, waveforms and audio.
# NOTE(review): the recorded run prints "Waveform is not finite everywhere.
# Skipping the GL." -- presumably triggered by the extreme 1000 entry; confirm
# whether that value is intentional.
compare_values("ref_level_db", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])

  plt.tight_layout()
  plt.tight_layout()
  return np.power(10, x)
  axes.set_xlim([locs.min(), locs.max()])


 [!] Waveform is not finite everywhere. Skipping the GL.
 > Ground-truth


 > ref_level_db = 2


 > ref_level_db = 5


 > ref_level_db = 10


 > ref_level_db = 15


 > ref_level_db = 20


 > ref_level_db = 25


 > ref_level_db = 30


 > ref_level_db = 35


 > ref_level_db = 40


 > ref_level_db = 1000


  scaled = data / normalization_factor * 32767
