In [None]:
import os

from pathlib import Path

import matplotlib.pyplot as plt

from lhotse.recipes.mini_librispeech import download_and_untar, prepare_mini_librispeech
from lhotse.features import FeatureSetBuilder, FeatureExtractor
from lhotse.cut import make_cuts_from_supervisions
from lhotse.dataset.speech_recognition import SpeechRecognitionDataset

In [None]:
# Directory layout used by the rest of the notebook (all paths are relative):
#   root_dir   - passed to the download step
#   corpus_dir - corpus location handed to the manifest preparation step
#   output_dir - where manifests, features and cut files are written
root_dir = Path('env')
corpus_dir = root_dir.joinpath('LibriSpeech')
output_dir = root_dir.joinpath('mini_librispeech_nb')

# Download and untar

In [None]:
# Download and unpack the corpus, passing root_dir as the target; the manifest
# preparation step below reads the data from corpus_dir (root_dir / 'LibriSpeech').
download_and_untar(root_dir)

# Prepare audio and supervision manifests

In [None]:
# Build the manifests from the corpus in corpus_dir, passing output_dir as the
# destination. Later cells index the result as
# mini_librispeech_manifests[partition]['audio' | 'supervisions'], with
# partitions such as 'dev-clean-2' and 'train-clean-5'.
mini_librispeech_manifests = prepare_mini_librispeech(corpus_dir, output_dir)

# Extract features

In [None]:
# Degree of parallelism handed to the feature builder: the CPU count reported by the OS.
num_workers = os.cpu_count()

for partition, manifests in mini_librispeech_manifests.items():
    # A fresh MFCC extractor and builder per partition, pointed at a
    # partition-specific output directory.
    mfcc_extractor = FeatureExtractor(type='mfcc')
    builder = FeatureSetBuilder(
        feature_extractor=mfcc_extractor,
        output_dir=f'{output_dir}/feats_{partition}',
    )

    # Extract features for every recording of this partition and keep the
    # resulting feature set next to the audio/supervision manifests.
    # `manifests` is the entry stored under mini_librispeech_manifests[partition],
    # so writing to it updates the shared structure in place.
    feature_set = builder.process_and_store_recordings(
        recordings=manifests['audio'],
        num_jobs=num_workers,
    )
    manifests['feats'] = feature_set

    # Build cuts from the supervisions and the extracted features, register
    # them under 'cuts' and save them as YAML in output_dir.
    cut_set = make_cuts_from_supervisions(manifests['supervisions'], feature_set)
    manifests['cuts'] = cut_set
    cut_set.to_yaml(output_dir / f'cuts_{partition}.yml')

# Make PyTorch Dataset

In [None]:
# Wrap the cut sets produced by the feature extraction step in datasets.
# Note: despite the `cuts_` prefix, cuts_dev / cuts_train are
# SpeechRecognitionDataset objects, not the raw cut sets.
dev_cut_set = mini_librispeech_manifests['dev-clean-2']['cuts']
cuts_dev = SpeechRecognitionDataset(dev_cut_set)

train_cut_set = mini_librispeech_manifests['train-clean-5']['cuts']
cuts_train = SpeechRecognitionDataset(train_cut_set)

# Illustration of an example

In [None]:
# Take the first item of the dev dataset and show its transcript.
sample = cuts_dev[0]
print(sample['text'])

# Swap the first two axes of the feature matrix, then reverse the order along
# the new first axis before drawing it.
# NOTE(review): presumably this puts time on the horizontal axis with the
# lowest coefficient at the bottom - confirm against the feature layout.
feature_image = sample['feature'].transpose(0, 1).flip(0)
plt.matshow(feature_image)