In [1]:
import tensorflow as tf
import tensorflow_hub as hub ##for loading full tf model


import numpy as np
import librosa  
import soundfile as sf ##may not be needed
import os, json, re
from itertools import groupby

from scipy.io import wavfile

In [73]:
# Relative paths to the TFLite ASR model file and to the recording to transcribe.
model = '../models/lite-model_ASR_TFLite_pre_trained_models_English_1.tflite'
audio = '../test_audio/recording.wav'
# Load the recording as a mono signal at a 16 kHz sample rate; `signal` holds the
# samples and `sr` the sample rate returned by librosa.
signal, sr = librosa.load(audio, sr=16000, mono=True)

In [74]:
# Load the TFLite model from `model` and allocate its tensors.
# NOTE(review): allocate_tensors() is called again after the resize below, so this
# first call may be redundant — confirm before removing.
interpreter = tf.lite.Interpreter(model_path=model)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Resize input 0 to the shape of the loaded audio, then re-allocate before any
# tensor is set.
interpreter.resize_tensor_input(input_details[0]["index"], signal.shape)
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]["index"], signal)
# Input 1 is fed an int32 scalar 0 — presumably the blank / start token id;
# TODO confirm against the model's documentation.
interpreter.set_tensor(
    input_details[1]["index"],
    np.array(0).astype('int32')
)
# Input 2 is fed an all-zero float32 tensor of shape [1, 2, 1, 320] — presumably
# the initial recurrent state; TODO confirm.
interpreter.set_tensor(
    input_details[2]["index"],
    np.zeros([1,2,1,320]).astype('float32')
)
interpreter.invoke()
# Read output 0: the ids that are turned into characters below.
hyp = interpreter.get_tensor(output_details[0]["index"])

# Print every id as the character with that code point; an id of 0 becomes chr(0)
# (NUL), which is not stripped here.
print("".join([chr(u) for u in hyp]))

    a   little  there   i   am  spe ak  ing to  the mi  cr  op  one i   am  wat chi ng  they    sto ck  her mat ch  a   


In [75]:
class model_to_text:
    """Turn lists of integer ids into text.

    The vocabulary is the space character, ``a``-``z`` and ``0``-``9``, each
    identified by its Unicode code point, plus the special token ``<pad>``
    with id ``0``.
    """

    def __init__(self):
        self.create_mappings()

    def create_mappings(self):
        """Build the lookup tables and special-token lists used by ``decode``."""
        vocab = self._get_vocab()
        self.token_to_id_mapping = vocab  # token -> id
        self.id_to_token_mapping = dict(
            (token_id, token) for token, token_id in vocab.items()
        )  # id -> token

        self.special_tokens = ["<pad>"]
        self.special_ids = []
        for special in self.special_tokens:
            self.special_ids.append(self.token_to_id_mapping[special])

    def _get_vocab(self):
        """Return the token -> id dict: ``ord(char)`` per character, 0 for ``<pad>``."""
        characters = " abcdefghijklmnopqrstuvwxyz0123456789"
        vocab = {}
        for char in characters:
            vocab[char] = ord(char)
        vocab["<pad>"] = 0
        return vocab

    def decode(self, input_ids: list, skip_special_tokens=True, group_tokens=True):
        """
        Convert a sequence of ids into a string.

        Args:
            input_ids (:obj: `list`):
                Ids to convert.
            skip_special_tokens (:obj: `bool`, `optional`):
                Whether to drop special ids (``<pad>``) before the id -> token
                lookup. Special tokens are rendered as empty strings either way,
                so they never appear in the returned text.
            group_tokens (:obj: `bool`, `optional`):
                Whether to collapse each run of consecutive identical ids into a
                single id. This happens before special ids are dropped.

        Returns:
            The joined tokens with leading and trailing whitespace stripped.
            Ids missing from the vocabulary are rendered as ``<unk>``.
        """
        ids = input_ids
        if group_tokens:
            # groupby yields (key, run) pairs; keeping the key collapses each run.
            ids = [key for key, _run in groupby(ids)]
        if skip_special_tokens:
            ids = [token_id for token_id in ids if token_id not in self.special_ids]

        pieces = []
        for token_id in ids:
            token = self.id_to_token_mapping.get(token_id, "<unk>")
            pieces.append("" if token in self.special_tokens else token)
        return "".join(pieces).strip()

In [76]:
# Instantiate the id -> text decoder; the constructor builds its vocabulary tables.
decoder = model_to_text()

In [77]:
# Display the raw ids read from the interpreter's output tensor.
hyp

array([  0,   0,   0,   0,  97,  32,   0,   0, 108, 105, 116, 116, 108,
       101,  32,   0, 116, 104, 101, 114, 101,  32,   0,   0, 105,  32,
         0,   0,  97, 109,  32,   0, 115, 112, 101,   0,  97, 107,   0,
         0, 105, 110, 103,  32, 116, 111,  32,   0, 116, 104, 101,  32,
       109, 105,   0,   0,  99, 114,   0,   0, 111, 112,   0,   0, 111,
       110, 101,  32, 105,  32,   0,   0,  97, 109,  32,   0, 119,  97,
       116,   0,  99, 104, 105,   0, 110, 103,  32,   0, 116, 104, 101,
       121,  32,   0,   0,   0, 115, 116, 111,   0,  99, 107,  32,   0,
       104, 101, 114,  32, 109,  97, 116,   0,  99, 104,  32,   0,  97,
         0,   0,   0])

In [78]:
# Decode without collapsing repeated ids (group_tokens=False): this model emits
# genuine double letters as adjacent identical ids (e.g. 116, 116 for "tt") with no
# 0 in between, so grouping would drop a letter ("little" -> "litle").
decoder.decode(hyp.tolist(), skip_special_tokens=False, group_tokens=False)

'a little there i am speaking to the micropone i am watching they stock her match a'