In [1]:
import os
# Point the Google Cloud client libraries at the service-account key file.
# setdefault keeps a value that is already exported in the environment, so the
# notebook also runs on machines where the key lives somewhere else.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS', '/home/ubuntu/mesolitica-tpu.json'
)

In [2]:
import string

# Character vocabulary used to validate and encode labels.
# Index 0 is the empty string, followed by a-z, then 0-9, and finally a single
# space; any other character is not part of the vocabulary.
char_vocabs = ['', *string.ascii_lowercase, *string.digits, ' ']

In [3]:
# Notebook-wide constants

sr = 16_000            # divisor that turns a waveform's sample count into a duration
maxlen = 18            # clips whose len(y) / sr exceeds this are skipped
minlen_text = 1        # labels shorter than this many characters are skipped
maxlen_subwords = 100  # not referenced in the visible cells - TODO confirm whether still needed
global_count = 0       # batch counter; becomes part of each shard's file name

In [4]:
from google.cloud import storage
import numpy as np
import six

def to_example(dictionary):
    """Build a ``tf.train.Example`` from a feature dictionary.

    Args:
        dictionary: mapping of feature name -> non-empty sequence of values.
            The type of the first element decides how the whole sequence is
            stored: ints (Python or NumPy) -> ``Int64List``, floats (Python or
            NumPy) -> ``FloatList``, ``str`` (encoded as UTF-8) or ``bytes``
            -> ``BytesList``. ``map`` objects are turned into lists first.

    Returns:
        A ``tf.train.Example`` with one feature per dictionary key.

    Raises:
        ValueError: if a field is empty (or otherwise falsy), or if its first
            element is of an unsupported type.

    Note:
        ``tf`` is looked up when the function is called, so TensorFlow has to
        be imported before the first call.
    """
    features = {}
    for name, values in dictionary.items():
        # A map object is lazy, always truthy and not subscriptable, so it has
        # to be materialised *before* the emptiness check and the values[0]
        # probe below; otherwise an empty map slips through the check.
        if isinstance(values, map):
            values = list(values)

        # NumPy arrays have no unambiguous truth value; look at their size.
        if isinstance(values, np.ndarray):
            is_empty = values.size == 0
        else:
            is_empty = not values
        if is_empty:
            raise ValueError('Empty generated field: %s' % str((name, values)))

        first = values[0]
        if isinstance(first, (int, np.integer)):
            feature = tf.train.Feature(
                int64_list=tf.train.Int64List(value=values)
            )
        elif isinstance(first, (float, np.floating)):
            # np.floating covers scalars such as np.float32, which are not
            # subclasses of the built-in float.
            feature = tf.train.Feature(
                float_list=tf.train.FloatList(value=values)
            )
        elif isinstance(first, str):
            # BytesList only accepts bytes, so text is encoded as UTF-8.
            encoded = [x.encode('utf-8') for x in values]
            feature = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=encoded)
            )
        elif isinstance(first, bytes):
            feature = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=values)
            )
        else:
            raise ValueError(
                'Value for %s is not a recognized type; v: %s type: %s'
                % (name, str(first), str(type(first)))
            )
        features[name] = feature
    return tf.train.Example(features=tf.train.Features(feature=features))

In [5]:
import json

# Read the dataset manifest from disk and parse it into `data`
with open('bahasa-asr.json') as manifest:
    data = json.loads(manifest.read())

In [6]:
import tensorflow as tf
import malaya_speech
from tqdm import tqdm

  'Cannot import beam_search_ops from Tensorflow Addons, `deep_model` for stemmer will not available to use, make sure Tensorflow Addons version >= 0.12.0'


In [7]:
# Combine the parallel 'X' (audio path) and 'Y' (transcript) lists into
# a list of (audio path, transcript) tuples
files = [pair for pair in zip(data['X'], data['Y'])]

In [8]:
# Peek at the first (audio path, transcript) pair as a sanity check
files[0]

('part1/output-wav/Kita-Cepat-Sangka-Buruk-Dengan-Allah-_-Ustaz-Jafri-Abu-Bakar--UmxHgVvA50.mp3-part-9.wav',
 'hadis kullukum balon elemen hd itu hu')

In [9]:
import re

def _encode_label(text):
    """Clean a transcript and encode it as a list of ``char_vocabs`` indices.

    Every character that is not in ``char_vocabs`` is replaced by a space,
    runs of spaces are collapsed into one, and leading/trailing spaces are
    stripped. Each remaining character is mapped to its position in
    ``char_vocabs``.
    """
    cleaned = ''.join(c if c in char_vocabs else ' ' for c in text)
    cleaned = re.sub(r'[ ]+', ' ', cleaned).strip()  # collapse repeated spaces
    return [char_vocabs.index(c) for c in cleaned]


def loop(files):
    """Write one TFRecord shard for a chunk of the dataset and upload it.

    Args:
        files: a ``(records, index)`` tuple. ``records`` is an iterable of
            ``(audio path, transcript)`` pairs and ``index`` identifies the
            shard; together with the module-level ``global_count`` it forms
            the output name ``'{index}-{global_count}.tfrecord'``.

    Records are skipped when the transcript is shorter than ``minlen_text``
    (before or after cleaning) or when ``len(waveform) / sr`` exceeds
    ``maxlen``. A record that raises is reported and skipped so that one bad
    file does not abort the whole shard.

    Side effects:
        Writes the shard to the working directory, uploads it to the
        ``mesolitica-tpu-general`` bucket under ``malay/`` and then deletes
        the local copy.
    """
    files, index = files
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    # The shard name is "<index>-<global_count>"; the hyphen is a separator.
    output_file = f'{index}-{global_count}.tfrecord'

    # The context manager closes the writer even if something unexpected
    # escapes the per-record handler below.
    with tf.io.TFRecordWriter(output_file) as writer:
        for s in tqdm(files):
            try:
                f, t = s[0], s[1]  # audio path, transcript
                if len(t) < minlen_text:  # skip empty labels
                    continue

                new_t = _encode_label(t)
                # Cleaning can remove every character; skip such labels
                # explicitly instead of relying on to_example to raise.
                if len(new_t) < minlen_text:
                    continue

                y, _ = malaya_speech.load(f)  # waveform; second value unused
                if (len(y) / sr) > maxlen:  # skip clips that are too long
                    continue

                example = to_example({'waveforms': y.tolist(),
                                      'targets': new_t,
                                      'targets_length': [len(new_t)],
                                      'lang': [0]})
                writer.write(example.SerializeToString())
            except Exception as e:
                # Best effort: report which record failed and carry on.
                print(f'Skipping {s!r}: {e!r}')

    blob = bucket.blob(f'malay/{output_file}')
    blob.upload_from_filename(output_file)
    # Remove the local copy only after the upload has succeeded.
    os.remove(output_file)

In [10]:
# Smoke-test the pipeline on the first 100 pairs, using shard index 0
loop((files[:100], 0))

100%|██████████| 100/100 [00:02<00:00, 48.75it/s]


In [11]:
import mp

# Process the dataset in slices of `batch_size` records. Each slice is handed
# to mp.multiprocessing together with `loop`; presumably it splits the slice
# across 8 worker processes - confirm against the mp module.
batch_size = 20000
for start in range(0, len(files), batch_size):
    mp.multiprocessing(
        files[start: start + batch_size], loop, cores = 8, returned = False
    )
    # Advance the counter so the next slice gets distinct shard file names.
    global_count += 1

100%|██████████| 2500/2500 [01:33<00:00, 26.68it/s]
100%|██████████| 2500/2500 [01:33<00:00, 26.65it/s]
100%|██████████| 2500/2500 [01:34<00:00, 26.54it/s]
100%|██████████| 2500/2500 [01:34<00:00, 26.44it/s]
100%|██████████| 2500/2500 [01:38<00:00, 25.36it/s]
100%|██████████| 2500/2500 [01:38<00:00, 25.30it/s]
100%|██████████| 2500/2500 [01:39<00:00, 25.24it/s]
100%|██████████| 2500/2500 [01:40<00:00, 24.86it/s]
100%|██████████| 2500/2500 [01:36<00:00, 26.01it/s]
100%|██████████| 2500/2500 [01:36<00:00, 25.90it/s]
100%|██████████| 2500/2500 [01:36<00:00, 25.89it/s]
100%|██████████| 2500/2500 [01:36<00:00, 25.86it/s]
100%|██████████| 2500/2500 [01:36<00:00, 25.85it/s]
100%|██████████| 2500/2500 [01:36<00:00, 25.82it/s]
100%|██████████| 2500/2500 [01:37<00:00, 25.72it/s]
100%|██████████| 2500/2500 [01:37<00:00, 25.70it/s]
100%|██████████| 2500/2500 [01:35<00:00, 26.10it/s]
100%|██████████| 2500/2500 [01:36<00:00, 25.89it/s]
100%|██████████| 2500/2500 [01:37<00:00, 25.72it/s]
100%|███████