## Install required packages

In [None]:
# we first clone the DeepSpeech repository
!git clone https://github.com/mozilla/DeepSpeech.git

In [None]:
!pip install -e ./DeepSpeech

## Download files

In [None]:
# Download the pretrained model and scorer (commented out; uncomment to fetch them).
# TODO: download to a nicer location
#!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
#!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer

In [None]:
from pathlib import Path

In [None]:
# Make sure both text directories exist under the current working directory.
# parents=True / exist_ok=True keep the cell safe to re-run.
(Path.cwd() / 'raw_text').mkdir(parents=True, exist_ok=True)
(Path.cwd() / 'normal_text').mkdir(parents=True, exist_ok=True)

In [None]:
# Directories used by this notebook, all anchored at the current working directory.
audio_directory = Path.cwd().joinpath('audio', 'wav')
raw_text_directory = Path.cwd().joinpath('raw_text')
normal_text_directory = Path.cwd().joinpath('normal_text')

## Speech recognition using DeepSpeech

Using CPU

In [None]:
from __future__ import absolute_import, division, print_function

import argparse
import numpy as np
import shlex
import subprocess
import sys
import wave
import json

from deepspeech import Model, version
from timeit import default_timer as timer

# Prefer shlex.quote; fall back to the legacy pipes.quote only when shlex does
# not provide it. (The module name was previously misspelled as "shhlex", so
# the fallback was always taken; the pipes module was removed in Python 3.13.)
try:
    from shlex import quote
except ImportError:
    from pipes import quote

from deepspeech.client import convert_samplerate, metadata_to_string

In [None]:
# Model and scorer are given as bare file names, so they are resolved against
# the current working directory.
model_file = 'deepspeech-0.9.3-models.pbmm'
scorer_file = 'deepspeech-0.9.3-models.scorer'
# Join with pathlib instead of concatenating a hard-coded '/' so the path uses
# the platform's separator; kept as a str because it is passed to wave.open().
audio_file = str(audio_directory / 'econ251_01_090309.wav')

In [None]:

# Instantiate the DeepSpeech model from model_file and time how long loading takes.
print(f"Loading model from file {model_file}")
model_load_start = timer()
ds = Model(model_file)
# Despite its name, model_load_end holds the elapsed seconds, not a timestamp.
model_load_end = timer() - model_load_start
# The timing line is written to stderr, formatted to 3 significant digits.
print(f"Loaded model in {model_load_end:.3}s.", file=sys.stderr)

# if args.beam_width:
    # ds.setBeamWidth(args.beam_width)

# Sample rate reported by the loaded model; the input audio's frame rate is
# compared against this value further down in this cell.
desired_sample_rate = ds.sampleRate()

# The scorer step is skipped when scorer_file is empty/falsy.
if scorer_file:
    print(f"Loading scorer from files {scorer_file}")
    scorer_load_start = timer()
    ds.enableExternalScorer(scorer_file)
    # Elapsed seconds for enabling the scorer (a duration, not a timestamp).
    scorer_load_end = timer() - scorer_load_start
    print(f"Loaded scorer in {scorer_load_end:.3}s.")

    # if args.lm_alpha and args.lm_beta:
    #     ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta)

# if args.hot_words:
#     print('Adding hot-words', file=sys.stderr)
#     for word_boost in args.hot_words.split(','):
#         word,boost = word_boost.split(':')
#         ds.addHotWord(word,float(boost))

# Read the WAV file. The context manager closes the handle even if reading or
# resampling raises, which the previous explicit fin.close() did not guarantee.
with wave.open(audio_file, 'rb') as fin:
    fs_orig = fin.getframerate()
    if fs_orig != desired_sample_rate:
        print(f"Warning: original sample rate ({fs_orig}) is different than {desired_sample_rate}hz. Resampling might produce erratic speech recognition.")
        # convert_samplerate re-reads the file from its path rather than from fin.
        fs_new, audio = convert_samplerate(audio_file, desired_sample_rate)
    else:
        # Interpret the raw frames as 16-bit signed integer samples.
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # Duration in seconds, based on the file's original frame count and rate.
    audio_length = fin.getnframes() * (1/fs_orig)



# Performing inference
print("Running inference.")
inference_start = timer()
# Request one candidate transcript with metadata and print it as plain text.
# The former `elif args.json` / `else` branches were unreachable behind
# `if True:` and referenced `args` and `metadata_json_output`, neither of
# which is defined in this notebook, so they have been removed.
print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0]))
# Elapsed seconds for the inference call (a duration, not a timestamp).
inference_end = timer() - inference_start
print(f"Inference took {inference_end:0.3f}s for {audio_length:0.3f}s audio file.")

Using GPU

In [None]:
# TODO: run speech recognition on the GPU