
Commit

update

yfy62 committed Sep 19, 2023
1 parent 0d1b63b commit be71c5a
Showing 8 changed files with 1,122 additions and 71 deletions.
13 changes: 10 additions & 3 deletions egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
@@ -22,7 +22,12 @@
 from pathlib import Path
 
 import torch
-from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
+from lhotse import (
+    CutSet,
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    set_audio_duration_mismatch_tolerance,
+)
 
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -80,7 +85,7 @@ def compute_fbank_gigaspeech_splits(args):
     output_dir = Path(output_dir)
     assert output_dir.exists(), f"{output_dir} does not exist!"
 
-    num_digits = 8  # num_digits is fixed by lhotse split-lazy
+    num_digits = 4
 
     start = args.start
     stop = args.stop
@@ -95,8 +100,10 @@
     extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
     logging.info(f"device: {device}")
 
+    set_audio_duration_mismatch_tolerance(0.1)
+
     for i in range(start, stop):
-        idx = f"{i + 1}".zfill(num_digits)
+        idx = f"{i}".zfill(num_digits)
         logging.info(f"Processing {idx}/{num_splits}")
 
         cuts_path = output_dir / f"gigaspeech_cuts_XL.{idx}.jsonl.gz"
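Note: set_audio_duration_mismatch_tolerance is lhotse's global switch for accepting small disagreements between a recording's actual duration and its manifest entry, and the 0-based, 4-digit idx matches the new lhotse split output. A minimal standalone sketch of how the two changes fit together (the split filename and CPU device are illustrative assumptions, not part of this commit):

from lhotse import (
    CutSet,
    KaldifeatFbank,
    KaldifeatFbankConfig,
    set_audio_duration_mismatch_tolerance,
)

# Tolerate up to 0.1 s of audio/manifest duration drift instead of
# failing mid-extraction (the 0.1 value is taken from the diff above).
set_audio_duration_mismatch_tolerance(0.1)

extractor = KaldifeatFbank(KaldifeatFbankConfig(device="cpu"))  # assumed device

# num_digits = 4 with a 0-based index yields suffixes 0000..0999:
idx = f"{7}".zfill(4)  # -> "0007"
cuts = CutSet.from_file(f"data/fbank/XL_split/gigaspeech_cuts_XL_raw.{idx}.jsonl.gz")  # assumed name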
10 changes: 1 addition & 9 deletions egs/gigaspeech/ASR/local/preprocess_gigaspeech.py
@@ -71,7 +71,7 @@ def preprocess_giga_speech():
 
     for partition, m in manifests.items():
         logging.info(f"Processing {partition}")
-        raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
+        raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz"
         if raw_cuts_path.is_file():
             logging.info(f"{partition} already exists - skipping")
             continue
@@ -91,14 +91,6 @@ def preprocess_giga_speech():
             recordings=m["recordings"],
             supervisions=m["supervisions"],
         )
-        # Run data augmentation that needs to be done in the
-        # time domain.
-        if partition not in ["DEV", "TEST"]:
-            logging.info(
-                f"Speed perturb for {partition} with factors 0.9 and 1.1 "
-                "(Perturbing may take 8 minutes and saving may take 20 minutes)"
-            )
-            cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
         logging.info(f"Saving to {raw_cuts_path}")
         cut_set.to_file(raw_cuts_path)
 
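For context, the deleted block was lhotse's standard time-domain augmentation: perturb_speed returns a transformed copy of the cuts, and concatenating the 0.9x and 1.1x copies with the originals triples the training data. A minimal sketch of the pattern that was removed (the manifest path is an illustrative assumption):

from lhotse import CutSet

cut_set = CutSet.from_file("data/fbank/gigaspeech_cuts_M_raw.jsonl.gz")  # assumed path

# Each perturb_speed(factor) lazily resamples the audio and rescales the
# supervision times; '+' concatenates the three CutSets into one.
cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)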
10 changes: 5 additions & 5 deletions egs/gigaspeech/ASR/prepare.sh
@@ -9,9 +9,9 @@ nj=16
 stage=0
 stop_stage=100
 
-# Split XL subset to a number of pieces (about 2000)
+# Split XL subset to a number of pieces
 # This is to avoid OOM during feature extraction.
-num_per_split=50
+num_splits=1000
 
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
@@ -122,7 +122,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
     $dl_dir/GigaSpeech data/manifests
 fi
 
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+if 0 && [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Prepare musan manifest"
   # We assume that you have downloaded the musan corpus
   # to $dl_dir/musan
@@ -147,7 +147,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Split XL subset into pieces (may take 30 minutes)"
   split_dir=data/fbank/XL_split
   if [ ! -f $split_dir/.split_completed ]; then
-    lhotse split-lazy ./data/fbank/gigapspeech_cuts_XL_raw.jsonl.gz $split_dir $num_per_split
+    lhotse split -s $num_splits ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir
    touch $split_dir/.split_completed
  fi
fi
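This stage now asks lhotse for a fixed number of pieces (num_splits=1000) instead of fixed-size chunks via split-lazy, which is why compute_fbank_gigaspeech_splits.py above moves to 4-digit, 0-based indices. A rough Python sketch of the equivalent operation, assuming the output naming implied by those index changes:

from lhotse import CutSet

num_splits = 1000  # from prepare.sh
cuts = CutSet.from_file("data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz")

# CutSet.split() partitions the manifest into num_splits pieces of
# near-equal size (it must materialize the cuts to count them).
for i, piece in enumerate(cuts.split(num_splits)):
    idx = str(i).zfill(4)  # 0-based, 4-digit suffixes: 0000 .. 0999
    piece.to_file(f"data/fbank/XL_split/gigaspeech_cuts_XL_raw.{idx}.jsonl.gz")  # assumed pattern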
@@ -161,7 +161,7 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
     --num-splits $num_splits
 fi
 
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+if 0 && [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
   log "Stage 7: Compute fbank for musan"
   mkdir -p data/fbank
   ./local/compute_fbank_musan.py
27 changes: 12 additions & 15 deletions egs/gigaspeech/ASR/zipformer/asr_datamodule.py
@@ -194,6 +194,13 @@ def add_arguments(cls, parser: argparse.ArgumentParser):
             "with training dataset. ",
         )
 
+        group.add_argument(
+            "--input-strategy",
+            type=str,
+            default="PrecomputedFeatures",
+            help="AudioSamples or PrecomputedFeatures",
+        )
+
         # GigaSpeech specific arguments
         group.add_argument(
             "--subset",
@@ -209,9 +216,7 @@
         )
 
     def train_dataloaders(
-        self,
-        cuts_train: CutSet,
-        sampler_state_dict: Optional[Dict[str, Any]] = None,
+        self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None,
     ) -> DataLoader:
         """
         Args:
@@ -353,13 +358,10 @@ def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
             )
         else:
             validate = K2SpeechRecognitionDataset(
-                cut_transforms=transforms,
-                return_cuts=self.args.return_cuts,
+                cut_transforms=transforms, return_cuts=self.args.return_cuts,
             )
         valid_sampler = DynamicBucketingSampler(
-            cuts_valid,
-            max_duration=self.args.max_duration,
-            shuffle=False,
+            cuts_valid, max_duration=self.args.max_duration, shuffle=False,
         )
         logging.info("About to create dev dataloader")
         valid_dl = DataLoader(
@@ -381,16 +383,11 @@ def test_dataloaders(self, cuts: CutSet) -> DataLoader:
             return_cuts=self.args.return_cuts,
         )
         sampler = DynamicBucketingSampler(
-            cuts,
-            max_duration=self.args.max_duration,
-            shuffle=False,
+            cuts, max_duration=self.args.max_duration, shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
-            test,
-            batch_size=None,
-            sampler=sampler,
-            num_workers=self.args.num_workers,
+            test, batch_size=None, sampler=sampler, num_workers=self.args.num_workers,
         )
         return test_dl
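Throughout these dataloaders, batch_size=None is deliberate: a lhotse DynamicBucketingSampler already emits whole mini-batches of cuts, bucketed by duration so that each batch totals at most max_duration seconds, so PyTorch must not batch again. A standalone sketch of the pattern (path and values are illustrative assumptions):

from torch.utils.data import DataLoader

from lhotse import CutSet
from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset

cuts = CutSet.from_file("data/fbank/gigaspeech_cuts_DEV.jsonl.gz")  # assumed path

# The sampler yields complete batches of cuts directly, so DataLoader-level
# batching is disabled with batch_size=None.
sampler = DynamicBucketingSampler(cuts, max_duration=600.0, shuffle=False)
dataset = K2SpeechRecognitionDataset(return_cuts=True)
test_dl = DataLoader(dataset, batch_size=None, sampler=sampler, num_workers=2)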

1 change: 0 additions & 1 deletion egs/gigaspeech/ASR/zipformer/decode.py

This file was deleted.
