In [None]:
import os

from pathlib import Path

import matplotlib.pyplot as plt
import torch

from lhotse.recipes.ami import download, prepare_ami
from lhotse.features import Fbank, FeatureSetBuilder
from lhotse.cut import CutSet
from lhotse.dataset.speech_recognition import SpeechRecognitionDataset
from lhotse.dataset.vad import VadDataset

# Settings for paths

In [None]:
# Every path used by this notebook hangs off `root_dir`; `output_dir` is the
# sub-directory handed to the preparation and feature-extraction cells below.
root_dir = Path('data')
output_dir = root_dir.joinpath('ami_nb')

# Download and untar

In [None]:
# Run the AMI recipe's download step with `root_dir` as its only argument
# (the "download and untar" stage of this notebook).
# NOTE(review): presumably network-bound and slow; confirm whether a re-run
# skips files that are already present before relying on Restart & Run All.
download(root_dir)

# Prepare audio and supervision manifests

In [None]:
# Build the manifests with the AMI recipe from `root_dir`; `output_dir` is
# passed as the second argument (presumably where manifests are written; confirm).
# Later cells index the result with the 'audio' and 'supervisions' keys.
ami_manifests = prepare_ami(root_dir, output_dir)

# Extract features

In [None]:
# `example` is an alias of `ami_manifests` (same object, not a copy), so the
# 'feats' entry stored at the end of this cell is visible through both names.
example = ami_manifests

# Filterbank extractor with its default configuration, and the location the
# builder is told to use for its output.
fbank_extractor = Fbank()
feats_dir = f'{output_dir}/feats_example'

feature_set_builder = FeatureSetBuilder(feature_extractor=fbank_extractor, output_dir=feats_dir)

# Process the recordings from the 'audio' manifest, requesting one job per
# CPU reported by the OS.
feature_set = feature_set_builder.process_and_store_recordings(
    recordings=example['audio'],
    num_jobs=os.cpu_count(),
)

example['feats'] = feature_set

In [None]:
# Combine the supervision manifest with the features computed above into a
# CutSet and keep it next to the other manifests under the 'cuts' key.
example['cuts'] = CutSet.from_manifests(
    supervision_set=example['supervisions'],
    feature_set=feature_set,
)

# Make pytorch Dataset for ASR task

In [None]:
# Wrap the cuts in the ASR dataset; the next cell indexes it with an integer
# and reads the 'text' and 'features' entries of the returned sample.
asr_dataset = SpeechRecognitionDataset(example['cuts'])

# Illustration of an ASR example

In [None]:
# Take the first item of the ASR dataset and show its transcript.
sample = asr_dataset[0]
print(sample['text'])

# Swap the two axes of the feature matrix and reverse the new first axis
# before drawing it as an image.
# NOTE(review): presumably features are (frames, bins), so this puts time on
# the x-axis with low frequencies at the bottom; confirm.
asr_feature_image = sample['features'].transpose(0, 1).flip(0)
plt.matshow(asr_feature_image)

# Make pytorch Dataset for VAD task

In [None]:
# Wrap the same cuts in the VAD dataset. `duration=10.0` is forwarded to
# VadDataset (presumably the length in seconds of each example; confirm
# against the lhotse documentation).
vad_dataset = VadDataset(example['cuts'], duration=10.0)

# Illustration of a VAD example

In [None]:
# Take one item of the VAD dataset; it provides the 'is_voice' label tensor
# and the matching 'features' matrix.
sample = vad_dataset[3]

# Repeat the label `label_height` times as identical rows so that it is tall
# enough to be readable when drawn as an image. The column count is inferred
# (-1) instead of being hard-coded, so the cell keeps working if the number
# of frames per example changes.
label_height = 10
vad_label = torch.stack([sample['is_voice']] * label_height).reshape(label_height, -1)
plt.matshow(vad_label)

# Same view of the features as in the ASR illustration: axes swapped, then
# the first axis reversed.
plt.matshow(sample['features'].transpose(0, 1).flip(0))