-
Notifications
You must be signed in to change notification settings - Fork 270
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add Common Voice for multidataset * Add prepare_multidataset.sh * Add dataset mixing * Update prepare_multidataset.sh * Update prepare_giga_speech.sh * update comments * Add split and shuffle mechanism * Add multi-dataset train * Fix for deleting * Fix for modifying * Add comments * Change type for perturb_speed * Fix for style check * Small fix * Add filter * Remove warning
- Loading branch information
Showing
7 changed files
with
624 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
#!/usr/bin/env bash

# Strict mode: abort on any failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (-o pipefail).
set -euo pipefail

# Number of parallel jobs/workers handed to the lhotse and fbank commands.
nj=16
# A stage N runs only when stage <= N <= stop_stage.
stage=-1
stop_stage=100

# Split the train cuts under data/${lang} into this number of pieces.
# This is to avoid OOM during feature extraction.
num_splits=1000
|
||
# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
#  - $dl_dir/$release/$lang
#      This directory contains the following files downloaded from
#      https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/${release}/${release}-${lang}.tar.gz
#
#      - clips
#      - dev.tsv
#      - invalidated.tsv
#      - other.tsv
#      - reported.tsv
#      - test.tsv
#      - train.tsv
#      - validated.tsv

# Defaults: download into ./download, CommonVoice release 13.0, English.
dl_dir=$PWD/download
release=cv-corpus-13.0-2023-03-09
lang=en
|
||
# NOTE(review): shared/parse_options.sh is not visible here; it is presumably
# the usual option parser that lets "--name value" arguments override the
# variables defined above -- confirm. Abort if it cannot be sourced.
. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data/${lang}".
# You can safely remove "data/${lang}" and rerun this script to regenerate it.
mkdir -p "data/${lang}"
|
||
# Print a timestamped log line to stdout.
#
# Output format:
#   YYYY-MM-DD HH:MM:SS (<caller file>:<caller line>:<caller function>) <message>
#
# Arguments: $* - the message; backslash escapes in it are interpreted
#                 because the line is emitted with `echo -e`.
log() {
  # This function is from espnet
  # Basename of the file that called log().
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
|
||
log "dl_dir: $dl_dir"

# Stage 0: download the CommonVoice release for ${lang} unless the clips
# directory is already present under $dl_dir.
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download data"

  # If you have pre-downloaded it to /path/to/$release,
  # you can create a symlink
  #
  #   ln -sfv /path/to/$release $dl_dir/$release
  #
  # Paths are quoted because dl_dir defaults to $PWD/download and the
  # working directory may contain whitespace.
  if [ ! -d "$dl_dir/$release/$lang/clips" ]; then
    lhotse download commonvoice --languages "$lang" --release "$release" "$dl_dir"
  fi
fi
|
||
# Stage 1: build the lhotse manifests for ${lang}; skipped once the
# .cv-${lang}.done marker exists.
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare CommonVoice manifest"
  # We assume that you have downloaded the CommonVoice corpus
  # to $dl_dir/$release
  mkdir -p "data/${lang}/manifests"
  if [ ! -e "data/${lang}/manifests/.cv-${lang}.done" ]; then
    lhotse prepare commonvoice --language "$lang" -j "$nj" \
      "$dl_dir/$release" "data/${lang}/manifests"
    # Written only after a successful run (set -e aborts on failure above).
    touch "data/${lang}/manifests/.cv-${lang}.done"
  fi
fi
|
||
# Stage 2: run the manifest preprocessing script once; completion is
# recorded by the .preprocess_complete marker.
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Preprocess CommonVoice manifest"
  # The marker lives in the fbank directory, which this stage does not
  # otherwise create; make sure it exists so `touch` cannot fail.
  mkdir -p "data/${lang}/fbank"
  if [ ! -e "data/${lang}/fbank/.preprocess_complete" ]; then
    ./local/preprocess_commonvoice.py --language "$lang"
    touch "data/${lang}/fbank/.preprocess_complete"
  fi
fi
|
||
# Stage 3: compute fbank features for the dev and test subsets; skipped
# once the .cv-${lang}_dev_test.done marker exists.
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Compute fbank for dev and test subsets of CommonVoice"
  mkdir -p "data/${lang}/fbank"
  if [ ! -e "data/${lang}/fbank/.cv-${lang}_dev_test.done" ]; then
    ./local/compute_fbank_commonvoice_dev_test.py --language "$lang"
    touch "data/${lang}/fbank/.cv-${lang}_dev_test.done"
  fi
fi
|
||
# Stage 4: split the raw train cuts into ${num_splits} pieces; skipped
# once the .cv-${lang}_train_split.done marker exists.
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Split train subset into ${num_splits} pieces"
  split_dir=data/${lang}/fbank/cv-${lang}_train_split_${num_splits}
  if [ ! -e "$split_dir/.cv-${lang}_train_split.done" ]; then
    lhotse split "$num_splits" \
      "./data/${lang}/fbank/cv-${lang}_cuts_train_raw.jsonl.gz" "$split_dir"
    # NOTE(review): assumes `lhotse split` creates $split_dir -- confirm.
    touch "$split_dir/.cv-${lang}_train_split.done"
  fi
fi
|
||
# Stage 5: compute features for the train splits; skipped once the
# .cv-${lang}_train.done marker exists.
if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "Stage 5: Compute features for train subset of CommonVoice"
  if [ ! -e "data/${lang}/fbank/.cv-${lang}_train.done" ]; then
    ./local/compute_fbank_commonvoice_splits.py \
      --num-workers "$nj" \
      --batch-duration 600 \
      --start 0 \
      --num-splits "$num_splits" \
      --language "$lang"
    touch "data/${lang}/fbank/.cv-${lang}_train.done"
  fi
fi
|
||
# Stage 6: merge the per-split train cut files into a single manifest.
# Skipped when the combined file already exists.
if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Combine features for train"
  combined=data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz
  if [ ! -f "$combined" ]; then
    split_dir=data/${lang}/fbank/cv-${lang}_train_split_${num_splits}

    # Collect the pieces NUL-delimited into an array so that paths are
    # never word-split or glob-expanded.
    pieces=()
    while IFS= read -r -d '' piece; do
      pieces+=("$piece")
    done < <(find "$split_dir" -name "cv-${lang}_cuts_train.*.jsonl.gz" -print0)

    # Fail loudly instead of calling `lhotse combine` with no inputs
    # (this also covers a missing $split_dir).
    if [ "${#pieces[@]}" -eq 0 ]; then
      log "No cv-${lang}_cuts_train.*.jsonl.gz files found in $split_dir"
      exit 1
    fi

    lhotse combine "${pieces[@]}" "$combined"
  fi
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.