From 7c00023e54497eafbb2aaae92826de0abe49787d Mon Sep 17 00:00:00 2001
From: begeekmyfriend
Date: Wed, 28 Mar 2018 10:10:25 +0800
Subject: [PATCH] Adjust fft sampling points and frame length according to audio SR

Signed-off-by: begeekmyfriend
---
 datasets/thchs30.py | 52 ++++++++++++++++++++++++++++++++++++++++++++-
 hparams.py          | 10 ++++-----
 2 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/datasets/thchs30.py b/datasets/thchs30.py
index 4fbeff3..3e806a4 100644
--- a/datasets/thchs30.py
+++ b/datasets/thchs30.py
@@ -25,7 +25,57 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
   futures = []
   index = 1
 
-  trn_files = glob.glob(os.path.join(in_dir, 'data', '*.trn'))
+  # male voice (do not use) A5 A8 A9 A33 A35 B6 B8 B21 B34 C6 C8 D8
+  # too silent (do not use) A36 B33 C14 D32
+
+  trn_files = []
+
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A2_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A4_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A11_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A12_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A13_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A14_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A19_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A22_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A23_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A32_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'A34_*.trn'))
+
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'B2_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'B4_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'B7_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'B11_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'B12_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'B15_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'B22_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'B31_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'B32_*.trn'))
+
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C2_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C4_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C7_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C12_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C13_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C17_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C18_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C19_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C20_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C21_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C22_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C23_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C31_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'C32_*.trn'))
+
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'D4_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'D6_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'D7_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'D11_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'D12_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'D13_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'D21_*.trn'))
+  trn_files += glob.glob(os.path.join(in_dir, 'data', 'D31_*.trn'))
+
   for trn in trn_files:
     with open(trn) as f:
       f.readline()
diff --git a/hparams.py b/hparams.py
index 3f588ee..3231dc7 100644
--- a/hparams.py
+++ b/hparams.py
@@ -8,11 +8,11 @@
   cleaners='basic_cleaners',
 
   # Audio:
-  num_mels=80,
-  num_freq=1025,
+  num_mels=64,
+  num_freq=513,  # n_fft = 1024, n_fft / 2 + 1
   sample_rate=16000,
-  frame_length_ms=50,
-  frame_shift_ms=12.5,
+  frame_length_ms=64,
+  frame_shift_ms=16,
   preemphasis=0.97,
   min_level_db=-100,
   ref_level_db=20,
@@ -30,7 +30,7 @@
   use_cmudict=False,  # Use CMUDict during training to learn pronunciation of ARPAbet phonemes
 
   # Eval:
-  max_iters=300,
+  max_iters=200,
   griffin_lim_iters=60,
   power=1.5,  # Power to raise magnitudes to prior to Griffin-Lim
 )