Add multidataset (#1010)
* Add Common Voice for multidataset

* Add prepare_multidataset.sh

* Add dataset mixing

* Update prepare_multidataset.sh

* Update prepare_giga_speech.sh

* update comments

* Add split and shuffle mechanism

* Add multi-dataset train

* Fix for deleting

* Fix for modifying

* Add comments

* Change type for perturb_speed

* Fix for style check

* Small fix

* Add filter

* Remove warning
yfyeung committed Apr 21, 2023
1 parent 57d6482 commit d67a49a
Showing 7 changed files with 624 additions and 38 deletions.
26 changes: 21 additions & 5 deletions egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -35,7 +35,7 @@
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached

-from icefall.utils import get_executor
+from icefall.utils import get_executor, str2bool

 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -61,12 +61,20 @@ def get_args():
         help="""Dataset parts to compute fbank. If None, we will use all""",
     )

+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=True,
+        help="""Perturb speed with factor 0.9 and 1.1 on train subset.""",
+    )
+
     return parser.parse_args()


 def compute_fbank_librispeech(
     bpe_model: Optional[str] = None,
     dataset: Optional[str] = None,
+    perturb_speed: Optional[bool] = True,
 ):
     src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
@@ -125,9 +133,13 @@ def compute_fbank_librispeech(
if "train" in partition:
if bpe_model:
cut_set = filter_cuts(cut_set, sp)
cut_set = (
cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
)
if perturb_speed:
logging.info(f"Doing speed perturb")
cut_set = (
cut_set
+ cut_set.perturb_speed(0.9)
+ cut_set.perturb_speed(1.1)
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
@@ -145,4 +157,8 @@
     logging.basicConfig(format=formatter, level=logging.INFO)
     args = get_args()
     logging.info(vars(args))
-    compute_fbank_librispeech(bpe_model=args.bpe_model, dataset=args.dataset)
+    compute_fbank_librispeech(
+        bpe_model=args.bpe_model,
+        dataset=args.dataset,
+        perturb_speed=args.perturb_speed,
+    )
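
With this change, speed perturbation can be toggled from the command line; str2bool accepts the usual boolean spellings (true/false, yes/no, 1/0). A minimal usage sketch, with an illustrative subset name:

  # Compute fbank features without the 0.9x/1.1x speed perturbation.
  # "train-clean-100" is only an example value for --dataset.
  ./local/compute_fbank_librispeech.py \
    --dataset "train-clean-100" \
    --perturb-speed false

  # The default, --perturb-speed true, keeps the previous behavior.
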
117 changes: 117 additions & 0 deletions egs/librispeech/ASR/prepare_common_voice.sh
@@ -0,0 +1,117 @@
#!/usr/bin/env bash

set -eou pipefail

nj=16
stage=-1
stop_stage=100

# Split data/${lang}set to this number of pieces
# This is to avoid OOM during feature extraction.
num_splits=1000

# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
#  - $dl_dir/$release/$lang
#      This directory contains the following files downloaded from
#      https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/${release}/${release}-${lang}.tar.gz
#
#    - clips
#    - dev.tsv
#    - invalidated.tsv
#    - other.tsv
#    - reported.tsv
#    - test.tsv
#    - train.tsv
#    - validated.tsv

dl_dir=$PWD/download
release=cv-corpus-13.0-2023-03-09
lang=en

. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data/${lang}".
# You can safely remove "data/${lang}" and rerun this script to regenerate it.
mkdir -p data/${lang}

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"

  # If you have pre-downloaded it to /path/to/$release,
  # you can create a symlink
  #
  #   ln -sfv /path/to/$release $dl_dir/$release
  #
  if [ ! -d $dl_dir/$release/$lang/clips ]; then
    lhotse download commonvoice --languages $lang --release $release $dl_dir
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare CommonVoice manifest"
  # We assume that you have downloaded the CommonVoice corpus
  # to $dl_dir/$release
  mkdir -p data/${lang}/manifests
  if [ ! -e data/${lang}/manifests/.cv-${lang}.done ]; then
    lhotse prepare commonvoice --language $lang -j $nj $dl_dir/$release data/${lang}/manifests
    touch data/${lang}/manifests/.cv-${lang}.done
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Preprocess CommonVoice manifest"
  if [ ! -e data/${lang}/fbank/.preprocess_complete ]; then
    ./local/preprocess_commonvoice.py --language $lang
    touch data/${lang}/fbank/.preprocess_complete
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compute fbank for dev and test subsets of CommonVoice"
  mkdir -p data/${lang}/fbank
  if [ ! -e data/${lang}/fbank/.cv-${lang}_dev_test.done ]; then
    ./local/compute_fbank_commonvoice_dev_test.py --language $lang
    touch data/${lang}/fbank/.cv-${lang}_dev_test.done
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Split train subset into ${num_splits} pieces"
  split_dir=data/${lang}/fbank/cv-${lang}_train_split_${num_splits}
  if [ ! -e $split_dir/.cv-${lang}_train_split.done ]; then
    lhotse split $num_splits ./data/${lang}/fbank/cv-${lang}_cuts_train_raw.jsonl.gz $split_dir
    touch $split_dir/.cv-${lang}_train_split.done
  fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute features for train subset of CommonVoice"
  if [ ! -e data/${lang}/fbank/.cv-${lang}_train.done ]; then
    ./local/compute_fbank_commonvoice_splits.py \
      --num-workers $nj \
      --batch-duration 600 \
      --start 0 \
      --num-splits $num_splits \
      --language $lang
    touch data/${lang}/fbank/.cv-${lang}_train.done
  fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Combine features for train"
  if [ ! -f data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz ]; then
    pieces=$(find data/${lang}/fbank/cv-${lang}_train_split_${num_splits} -name "cv-${lang}_cuts_train.*.jsonl.gz")
    lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz
  fi
fi
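
Like the other icefall prepare scripts, this one sources shared/parse_options.sh, which exposes the variables defined at the top (nj, stage, stop_stage, num_splits, release, lang) as command-line flags, so individual stages can be re-run in isolation. A sketch with illustrative values:

  # Re-run only the feature-extraction stages (3-6) for English.
  ./prepare_common_voice.sh --stage 3 --stop-stage 6 --lang en

  # Prepare another language, e.g. German, with more parallel jobs.
  ./prepare_common_voice.sh --lang de --nj 32
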
61 changes: 39 additions & 22 deletions egs/librispeech/ASR/prepare_giga_speech.sh
@@ -95,48 +95,65 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare GigaSpeech manifest (may take 30 minutes)"
# We assume that you have downloaded the GigaSpeech corpus
# to $dl_dir/GigaSpeech
mkdir -p data/manifests
lhotse prepare gigaspeech \
--subset XL \
--subset L \
--subset M \
--subset S \
--subset XS \
--subset DEV \
--subset TEST \
-j $nj \
$dl_dir/GigaSpeech data/manifests
if [ ! -f data/manifests/.gigaspeech.done ]; then
mkdir -p data/manifests
lhotse prepare gigaspeech \
--subset XL \
--subset L \
--subset M \
--subset S \
--subset XS \
--subset DEV \
--subset TEST \
-j $nj \
$dl_dir/GigaSpeech data/manifests
touch data/manifests/.gigaspeech.done
fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Preprocess GigaSpeech manifest"
if [ ! -f data/fbank/.preprocess_complete ]; then
log "It may take 2 hours for this stage"
python3 ./local/preprocess_gigaspeech.py
touch data/fbank/.preprocess_complete
if [ ! -f data/fbank/.gigaspeech_preprocess.done ]; then
log "It may take 2 hours for this stage"
./local/preprocess_gigaspeech.py
touch data/fbank/.gigaspeech_preprocess.done
fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute features for DEV and TEST subsets of GigaSpeech (may take 2 minutes)"
python3 ./local/compute_fbank_gigaspeech_dev_test.py
if [ ! -f data/fbank/.gigaspeech_dev_test.done ]; then
./local/compute_fbank_gigaspeech_dev_test.py
touch data/fbank/.gigaspeech_dev_test.done
fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Split XL subset into ${num_splits} pieces"
split_dir=data/fbank/gigaspeech_XL_split_${num_splits}
if [ ! -f $split_dir/.split_completed ]; then
if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then
lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size
touch $split_dir/.split_completed
touch $split_dir/.gigaspeech_XL_split.done
fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Compute features for XL"
# Note: The script supports --start and --stop options.
# You can use several machines to compute the features in parallel.
python3 ./local/compute_fbank_gigaspeech_splits.py \
--num-workers $nj \
--batch-duration 600 \
--num-splits $num_splits
if [ ! -f data/fbank/.gigaspeech_XL.done ]; then
./local/compute_fbank_gigaspeech_splits.py \
--num-workers $nj \
--batch-duration 600 \
--num-splits $num_splits
touch data/fbank/.gigaspeech_XL.done
fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Combine features for XL (may take 15 hours)"
if [ ! -f data/fbank/gigaspeech_cuts_XL.jsonl.gz ]; then
pieces=$(find data/fbank/gigaspeech_XL_split_${num_splits} -name "gigaspeech_cuts_XL.*.jsonl.gz")
lhotse combine $pieces data/fbank/gigaspeech_cuts_XL.jsonl.gz
fi
fi
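
As the comment in Stage 5 notes, compute_fbank_gigaspeech_splits.py also supports --start and --stop, so the split pieces can be sharded across several machines. A sketch, assuming --start/--stop select a range of split indices (check the script for the exact inclusive/exclusive semantics; the boundary values below are illustrative):

  # Machine 1: process splits [0, 1000).
  ./local/compute_fbank_gigaspeech_splits.py \
    --num-workers $nj \
    --batch-duration 600 \
    --num-splits $num_splits \
    --start 0 \
    --stop 1000

  # Machine 2: process the remaining splits.
  ./local/compute_fbank_gigaspeech_splits.py \
    --num-workers $nj \
    --batch-duration 600 \
    --num-splits $num_splits \
    --start 1000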
