<a href="https://colab.research.google.com/github/leodenale/Speech2Text/blob/master/Speech_To_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MP3 to text with the DeepSpeech model

---

![speech to text](https://uploads-ssl.webflow.com/5985ca0c9abf440001d1f4b0/5a68a52180efb200017181cf_transcription_icon_v2_EN.png =259x)

In [0]:
from __future__ import absolute_import, division, print_function

import os
import numpy as np
import shlex
import subprocess
import sys
import wave

!pip install deepspeech 
from deepspeech import Model, printVersions
from timeit import default_timer as timer

# audio converters
!apt update && apt-get install ffmpeg mpg123

# sox package for adjusting sample rate.
!apt-get install libsox-fmt-all libsox-dev sox

# neural network model for acoustic recognition
!wget -O - https://github.com/mozilla/DeepSpeech/releases/download/v0.3.0/deepspeech-0.3.0-models.tar.gz | tar xvfz -

Collecting deepspeech
[?25l  Downloading https://files.pythonhosted.org/packages/7d/28/ba0b39d65d64b43777084d6c66ba387c75ca1c4a9a28577df13ce676db3b/deepspeech-0.4.1-cp36-cp36m-manylinux1_x86_64.whl (11.8MB)
[K    100% |████████████████████████████████| 11.8MB 1.1MB/s 
Installing collected packages: deepspeech
Successfully installed deepspeech-0.4.1
Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1710/x86_64  InRelease
Get:2 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease [21.3 kB]
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1710/x86_64  Release
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64  Release
Get:8 http://archive.ubuntu.com/ubuntu bi

# Acoustic parameters

In [0]:
# Locations of the model files, relative to the working directory.
model = 'models/output_graph.pbmm'
alphabet = 'models/alphabet.txt'
lm = 'models/lm.binary'
trie = 'models/trie'

# ---------------------------------------------------------------------------
# Beam search decoder settings
# ---------------------------------------------------------------------------

# Width of the beam the CTC decoder keeps while building candidate
# transcriptions.
BEAM_WIDTH = 500

# Language model weight (the "alpha" hyperparameter of the CTC decoder).
LM_WEIGHT = 1.5

# Weight for inserting a valid word: softens the word insertion penalty when
# the inserted word belongs to the vocabulary.
VALID_WORD_COUNT_WEIGHT = 2.1

# ---------------------------------------------------------------------------
# Graph geometry
# ---------------------------------------------------------------------------
# The values below determine the shape of the graph's first layer, so they
# must be the same ones that were used when the model was trained.

# How many MFCC features are used.
N_FEATURES = 26

# Context window size used to produce timesteps in the input vector.
N_CONTEXT = 9

# Adapt Sample Rate of Audio File

In [0]:
def convert_samplerate(audio_path, desired_sample_rate=16000):
    """Resample an audio file with SoX and return its samples.

    SoX is asked to write headerless, mono, 16-bit signed little-endian PCM
    to stdout, which is then wrapped in a NumPy array.

    Args:
        audio_path: Path of the audio file to convert. May contain spaces or
            shell metacharacters; it is passed to SoX as a single argument.
        desired_sample_rate: Target sample rate in Hz (defaults to 16000).

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is
        ``desired_sample_rate`` and ``samples`` is a read-only ``np.int16``
        array over the bytes SoX produced.

    Raises:
        RuntimeError: SoX exited with a non-zero status.
        OSError: The ``sox`` executable could not be started.
    """
    # Build argv directly instead of formatting a string and shlex-splitting
    # it: a path containing spaces or quotes would otherwise be broken into
    # several arguments.
    sox_cmd = [
        'sox', audio_path,
        '--type', 'raw',
        '--bits', '16',
        '--channels', '1',
        '--rate', str(desired_sample_rate),
        '--encoding', 'signed-integer',
        '--endian', 'little',
        '--compression', '0.0',
        '--no-dither',
        '-',  # write the converted audio to stdout
    ]
    try:
        output = subprocess.check_output(sox_cmd, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use 16kHz files or install it: {}'.format(e.strerror))

    return desired_sample_rate, np.frombuffer(output, np.int16)



# Input MP3 Audio File

In [0]:
# upload mp3 audio file.

from google.colab import files
uploaded = files.upload()
for audio in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
         name=audio, length=len(uploaded[audio])))

os.rename(audio, 'speech.mp3')
audio = 'speech.wav'

# convert to wav file.  
!ffmpeg -i speech.mp3 -vn -acodec pcm_s16le -ac 1 -ar 16000 -f wav speech.wav
#!mpg123 -w speech.wav speech.mp3















Saving Recording.mp3 to Recording.mp3
User uploaded file "Recording.mp3" with length 92786 bytes
ffmpeg version 3.4.4-0ubuntu0.18.04.1 Copyright (c) 2000-2018 the FFmpeg developers
  built with gcc 7 (Ubuntu 7.3.0-16ubuntu3)
  configuration: --prefix=/usr --extra-version=0ubuntu0.18.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-lib

# Convert MP3 to Text

In [0]:
    # Transcribe the WAV file at path `audio`:
    #   1. load the acoustic model (and, when configured, the language model),
    #   2. read the samples, resampling through SoX if the file is not 16 kHz,
    #   3. run speech-to-text and report timings on stderr.
    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    # The language model is optional: it is enabled only when both paths are set.
    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_WEIGHT,
                               VALID_WORD_COUNT_WEIGHT)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    # `audio` stays the file path and the samples get their own name, so this
    # cell can be re-run without wave.open() receiving an array.
    # The `with` block closes the file even if reading or resampling fails.
    with wave.open(audio, 'rb') as fin:
        fs = fin.getframerate()
        # Duration must use the file's own frame rate, taken before `fs` is
        # replaced by the resampled rate below.
        audio_length = fin.getnframes() / float(fs)
        if fs != 16000:
            print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
            fs, audio_samples = convert_samplerate(audio)
        else:
            audio_samples = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    print('Running inference.', file=sys.stderr)
    print('================================\n')
    inference_start = timer()
    transcript = ds.stt(audio_samples, fs)
    # Stop the clock before printing so only the stt() call is measured.
    inference_end = timer() - inference_start
    print(transcript)
    print('\n================================')
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)

Loading model from file models/output_graph.pbmm
Loaded model in 0.015s.
Loading language model from files models/lm.binary models/trie
Loaded language model in 0.0891s.
Running inference.



vkblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblalblbmfoynwvxuqynejt gh' tomhjcmaanalblalblalblbmeozndldnzmdlctvpdlpynejt gh' qpalblalblalblalblalblbmfoyodlbpvqynejs'aotxuqynejswcldn'mdldnzmdlctvpdh' kzsxakblalblalblalblalblalblalblalblaqc



Inference took 5.440s for 5.909s audio file.
