<a href="https://colab.research.google.com/github/iLab-DSU/Imarika_Voice_Analytics/blob/main/STT_using_CTC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Speech to Text using CTC

### Introduction 
- This notebook combines a 2D CNN, an RNN and the Connectionist Temporal Classification (CTC) loss to build a speech-to-text (STT) model.
- CTC is a loss used to train deep neural networks for sequence problems such as speech recognition and handwriting recognition.
- We will use the Swahili Common Voice dataset from [here](https://commonvoice.mozilla.org/sw/datasets).
- The quality of the model will be evaluated using the Word Error Rate (WER).



### Library Installations

In [1]:
pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
pip install -U datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Setup Required Libraries

In [29]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers


### Data Pipeline

In [4]:
# Log in to the Hugging Face Hub. The dataset download further down passes
# `use_auth_token=True`, so an access token must be available before it runs.


from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [5]:
# Fetch the Swahili ('sw') configuration of Common Voice 8.0 from the Hugging Face Hub

from datasets import load_dataset, load_metric, Audio

# The train and validation splits are merged into the training set; the test
# split is loaded separately. Both requests authenticate with the stored token.
common_voice_train, common_voice_test = (
    load_dataset("mozilla-foundation/common_voice_8_0", 'sw', split=split_name, use_auth_token=True)
    for split_name in ('train+validation', 'test')
)



In [6]:
# Metadata columns that the rest of this notebook never reads.
unused_columns = ["accent", "age", "down_votes", "gender", "locale", "segment", "up_votes"]

# `remove_columns` raises when asked for a column that is already gone, so only
# request the ones still present. This keeps the cell safe to re-run.
common_voice_train = common_voice_train.remove_columns(
    [column for column in unused_columns if column in common_voice_train.column_names]
)
common_voice_test = common_voice_test.remove_columns(
    [column for column in unused_columns if column in common_voice_test.column_names]
)

In [7]:
# Show the remaining columns and the row count of the training split
common_voice_train

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence'],
    num_rows: 28411
})

In [8]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML


def display_random_elements(dataset, num_examples=10):
  """Render a random selection of distinct rows of `dataset` as an HTML table.

  Args:
    dataset: Collection supporting `len()` and indexing with a list of row
      indices; the selection is handed to `pd.DataFrame`.
    num_examples: Number of distinct rows to show.

  Raises:
    AssertionError: If `num_examples` is larger than the number of rows.
  """
  assert num_examples <= len(dataset) # Can't pick more than there is in the dataset

  # random.sample draws without replacement, so the indices are unique without
  # the retry loop and its repeated list-membership scans.
  picks = random.sample(range(len(dataset)), num_examples)

  df = pd.DataFrame(dataset[picks])
  display(HTML(df.to_html()))

In [9]:
import re
# Punctuation stripped from every transcript. Written as a raw string because
# the backslashes are regex escapes; in a normal string literal most of them
# are invalid Python escape sequences and trigger a warning.
characters_to_ignore = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_chars(batch):
  """Clean the transcript of one example in place and return the example.

  Removes every character matched by `characters_to_ignore` from
  `batch['sentence']`, lower-cases the result and appends a single trailing
  space.

  Args:
    batch: Mapping with a string under the key 'sentence'.

  Returns:
    The same `batch` object with 'sentence' replaced by the cleaned text.
  """
  batch['sentence']=re.sub(characters_to_ignore, '', batch['sentence']).lower() + " "
  return batch

In [10]:
# Clean the transcript of every example in both splits (punctuation removed, lower-cased)
common_voice_train = common_voice_train.map(remove_special_chars)
common_voice_test = common_voice_test.map(remove_special_chars)



In [11]:
# Spot-check a few cleaned examples; client_id is left out of this preview only
display_random_elements(common_voice_train.remove_columns(['client_id']))



Unnamed: 0,path,audio,sentence
0,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29840493.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29840493.mp3', 'array': [-2.1230822e-08, 1.3050334e-08, 1.8151752e-08, -3.176808e-08, 1.0266302e-09, 4.0967482e-08, -3.344724e-08, -2.7060414e-08, 6.4254635e-08, -1.5188792e-08, -7.025631e-08, 7.359241e-08, 3.2043463e-08, -1.1722196e-07, 4.985759e-08, 1.0798698e-07, -1.4449782e-07, -2.3347907e-08, 1.9598754e-07, -1.2040503e-07, -1.4873558e-07, 2.600356e-07, -1.5707004e-08, -3.0456735e-07, 2.495301e-07, 1.8083888e-07, -4.3919104e-07, 1.1285589e-07, 4.4688537e-07, -4.7487205e-07, -1.8313725e-07, 7.166971e-07, -3.2071117e-07, -6.3443616e-07, 8.798609e-07, 1.13132046e-07, -1.1901271e-06, 7.802035e-07, 9.2296324e-07, -1.7577239e-06, 1.666664e-07, 2.3301855e-06, -2.224334e-06, -1.754231e-06, 5.764305e-06, -2.4878939e-06, -2.3877134e-05, -2.3877134e-05, -2.4878939e-06, 5.764305e-06, -1.754231e-06, -2.224334e-06, 2.3301855e-06, 1.666664e-07, -1.7577239e-06, 9.2296324e-07, 7.802035e-07, -1.1901271e-06, 1.13132046e-07, 8.798609e-07, -6.3443616e-07, -3.2071117e-07, 7.166971e-07, -1.8313725e-07, -4.7487205e-07, 4.4688537e-07, 1.1285589e-07, -4.3919104e-07, 1.8083888e-07, 2.495301e-07, -3.0456735e-07, -1.5707004e-08, 2.600356e-07, -1.4873558e-07, -1.2040503e-07, 1.9598754e-07, -2.3347907e-08, -1.4449782e-07, 1.0798698e-07, 4.985759e-08, -1.1722196e-07, 3.2043463e-08, 7.359241e-08, -7.025631e-08, -1.5188792e-08, 6.4254635e-08, -2.7060414e-08, -3.344724e-08, 4.0967482e-08, 1.0266302e-09, -3.176808e-08, 1.8151752e-08, 1.3050334e-08, -2.1230822e-08, 2.865725e-09, 1.3874331e-08, -1.0184482e-08, -4.067796e-09, 9.61578e-09, -2.662255e-09, ...], 'sampling_rate': 48000}",walikuja kufanya biashara na wenyeji wa bara la afrika
1,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29972797.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29972797.mp3', 'array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...], 'sampling_rate': 48000}",magonjwa yanayotibika na yasiyotibika kama kansa ukimwi na kifua kikuu
2,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29995022.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29995022.mp3', 'array': [-2.1230822e-08, 1.3050334e-08, 1.8151752e-08, -3.176808e-08, 1.0266302e-09, 4.0967482e-08, -3.344724e-08, -2.7060414e-08, 6.4254635e-08, -1.5188792e-08, -7.025631e-08, 7.359241e-08, 3.204123e-08, -1.1724493e-07, 4.9890982e-08, 1.080117e-07, -1.446032e-07, -2.3279101e-08, 1.961598e-07, -1.207209e-07, -1.4876133e-07, 2.6065558e-07, -1.6247341e-08, -3.051596e-07, 2.5100297e-07, 1.8050027e-07, -4.4130857e-07, 1.1527466e-07, 4.4811716e-07, -4.797041e-07, -1.8086668e-07, 7.2205455e-07, -3.286332e-07, -6.356494e-07, 8.9239023e-07, 1.04462785e-07, -1.2007823e-06, 8.0116746e-07, 9.2068166e-07, -1.7848425e-06, 1.9175856e-07, 2.3468083e-06, -2.271521e-06, -1.738739e-06, 5.814673e-06, -2.548329e-06, -2.3895374e-05, -2.3783994e-05, -2.5374661e-06, 5.6831764e-06, -1.6273492e-06, -2.2197905e-06, 2.1657702e-06, 2.8834012e-07, -1.6452257e-06, 6.840026e-07, 8.2392125e-07, -9.2712173e-07, -1.4247426e-07, 7.511934e-07, -2.2211182e-07, -4.773428e-07, 3.3192765e-07, 3.0033758e-07, -3.7348667e-07, -2.1767262e-07, 4.9322847e-07, 7.923276e-08, -6.7292643e-07, 2.6610468e-07, 7.2034163e-07, -8.1138325e-07, -3.8882916e-07, 1.318332e-06, -4.345436e-07, -1.385576e-06, 1.5770406e-06, 6.3427666e-07, -2.5539537e-06, 1.0914575e-06, 2.6762236e-06, -3.6716444e-06, -1.1186936e-06, 7.216706e-06, -4.5137594e-06, -2.4324961e-05, -2.0376408e-05, -4.907896e-06, 1.1271248e-06, 1.1916837e-05, 2.5739118e-05, 1.6018379e-05, -4.498431e-06, -4.1655094e-06, 4.4536155e-06, 2.819943e-07, -3.2109886e-06, 1.6317211e-06, 1.4514218e-06, -2.1164526e-06, ...], 'sampling_rate': 48000}",umaarufu mwingine wa hifadhi hii ni pamoja ya aina ya 
chatu
3,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_28881440.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_28881440.mp3', 'array': [2.4325322e-08, 7.9623386e-10, -3.2954084e-08, 2.4198961e-08, 2.4746962e-08, -5.042357e-08, 6.2693815e-09, 6.024351e-08, -5.3664387e-08, -3.6279292e-08, 9.548943e-08, -2.6098215e-08, -1.0046503e-07, 1.0814086e-07, 4.4186564e-08, -1.6747754e-07, 7.06683e-08, 1.537945e-07, -2.0190127e-07, -3.7768462e-08, 2.7522452e-07, -1.5952749e-07, -2.1601329e-07, 3.55512e-07, -3.7489962e-09, -4.2851147e-07, 3.2617834e-07, 2.7429545e-07, -6.0265205e-07, 1.2277754e-07, 6.404209e-07, -6.3898307e-07, -2.9438965e-07, 1.0159225e-06, -4.280466e-07, -9.362859e-07, 1.2946294e-06, 1.532418e-07, -1.8396688e-06, 1.3754078e-06, 1.38715e-06, -3.4094612e-06, 1.1990896e-06, 5.4487355e-06, -1.274301e-05, -2.974276e-05, -1.35172495e-05, 7.005467e-06, 1.8345156e-07, -5.2950677e-06, 6.258237e-06, -4.4544868e-07, -2.5400816e-05, -2.4480047e-05, -9.750925e-07, 5.3281055e-06, -2.6934192e-06, -1.3049284e-06, 2.5446307e-06, -7.27111e-07, -1.3983309e-06, 1.4674558e-06, 1.431392e-07, -1.2836856e-06, 7.3259497e-07, 5.94997e-07, -1.0301355e-06, 1.7858713e-07, 7.9389423e-07, -7.106895e-07, -2.3379812e-07, 8.3090623e-07, -3.7061895e-07, -5.398278e-07, 7.6095455e-07, -1.561454e-08, -7.715208e-07, 5.943663e-07, 3.7615504e-07, -9.3249844e-07, 2.9476894e-07, 8.259125e-07, -9.75025e-07, -2.104367e-07, 1.3256143e-06, -7.836352e-07, -1.0293126e-06, 1.8241508e-06, -1.2196027e-07, -2.4092615e-06, 2.2361326e-06, 1.8090853e-06, -5.8058313e-06, 2.4698656e-06, 2.3919629e-05, 2.3866118e-05, 2.4620138e-06, -5.741006e-06, 1.7601917e-06, 2.2035738e-06, ...], 'sampling_rate': 48000}",nje na wilaya hizo wanaishi pia sehemu za kigoma na karagwe
4,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_30068882.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_30068882.mp3', 'array': [-5.5614265e-08, 2.1777036e-09, 7.144104e-08, -5.9268285e-08, -4.6166544e-08, 1.12783624e-07, -2.8936027e-08, -1.2152321e-07, 1.3132033e-07, 5.1829e-08, -2.0545758e-07, 9.3660475e-08, 1.8433221e-07, -2.5814657e-07, -3.034451e-08, 3.4193172e-07, -2.2395132e-07, -2.4871485e-07, 4.6350425e-07, -4.8866596e-08, -5.2762886e-07, 4.5974517e-07, 2.9248187e-07, -7.7949903e-07, 2.3512622e-07, 7.679402e-07, -8.664105e-07, -2.784117e-07, 1.2668087e-06, -6.1913505e-07, -1.0868062e-06, 1.5912906e-06, 1.4190276e-07, -2.112665e-06, 1.4419098e-06, 1.6338917e-06, -3.1993852e-06, 2.6989196e-07, 4.4471244e-06, -4.143991e-06, -4.506324e-06, 1.5995647e-05, 2.5767074e-05, 1.1925429e-05, 1.0839361e-06, -4.8862203e-06, -2.0336984e-05, -2.4382653e-05, -4.520635e-06, 7.295218e-06, -1.1689409e-06, -3.7327936e-06, 2.7848562e-06, 1.0866353e-06, -2.6852035e-06, 7.378397e-07, 1.6617774e-06, -1.5755215e-06, -3.9744523e-07, 1.5225083e-06, -5.8505316e-07, -9.139248e-07, 1.0337991e-06, 1.5800879e-07, -9.712027e-07, 4.3082125e-07, 5.9154894e-07, -7.1470674e-07, -1.209426e-07, 7.114669e-07, -2.832545e-07, -5.1105656e-07, 5.538675e-07, 1.9666919e-07, -6.7144947e-07, 1.7987371e-07, 6.110082e-07, -5.650971e-07, -3.3067465e-07, 8.540779e-07, -1.8055893e-07, -8.79423e-07, 8.259038e-07, 4.7119414e-07, -1.3525553e-06, 4.2645902e-07, 1.3795944e-06, -1.6350215e-06, -4.9966286e-07, 2.6663888e-06, -1.6370848e-06, -2.617469e-06, 5.6762656e-06, -1.3248854e-06, -2.4659617e-05, -2.4779132e-05, -7.408252e-07, 5.5889695e-06, -4.0718874e-06, -1.2133545e-09, ...], 'sampling_rate': 48000}",nimepambana dhidi ya utawala wa weusi
5,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29862296.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29862296.mp3', 'array': [-2.1230822e-08, 1.3050334e-08, 1.8151752e-08, -3.176808e-08, 1.0266302e-09, 4.0967482e-08, -3.344724e-08, -2.7060414e-08, 6.4254635e-08, -1.5188792e-08, -7.025631e-08, 7.359241e-08, 3.2043463e-08, -1.1722196e-07, 4.985759e-08, 1.0798698e-07, -1.4449782e-07, -2.3347907e-08, 1.9598754e-07, -1.2040503e-07, -1.4875324e-07, 2.600343e-07, -1.5629903e-08, -3.0464926e-07, 2.4943088e-07, 1.8112154e-07, -4.3926255e-07, 1.1236438e-07, 4.4748413e-07, -4.7452636e-07, -1.8451402e-07, 7.1733865e-07, -3.1896576e-07, -6.3702385e-07, 8.7929953e-07, 1.17706904e-07, -1.1931637e-06, 7.757797e-07, 9.3126397e-07, -1.7580862e-06, 1.5478233e-07, 2.340526e-06, -2.2157838e-06, -1.7758886e-06, 5.7699017e-06, -2.4622902e-06, -2.3905472e-05, -2.3889399e-05, -2.4396754e-06, 5.7424354e-06, -1.8012689e-06, -2.1581825e-06, 2.3400314e-06, 7.240961e-08, -1.6967765e-06, 9.972168e-07, 6.4413274e-07, -1.1794496e-06, 2.7785117e-07, 7.3920285e-07, -7.334975e-07, -6.828793e-08, 6.4677147e-07, -4.4341297e-07, -1.8944962e-07, 5.504354e-07, -3.1707637e-07, -2.3933254e-07, 5.549414e-07, -2.7802218e-07, -3.603988e-07, 6.677536e-07, -1.8920633e-07, -6.377863e-07, 7.937511e-07, 9.756024e-08, -1.0574399e-06, 7.576449e-07, 6.825606e-07, -1.4870245e-06, 3.47678e-07, 1.5706963e-06, -1.6812021e-06, -6.4664835e-07, 2.6542925e-06, -1.277894e-06, -2.4726805e-06, 3.7349535e-06, 3.8638467e-07, -5.8569453e-06, 4.5791203e-06, 6.4505302e-06, -1.9421297e-05, -2.5528558e-05, 1.02099575e-05, 2.8569155e-05, 4.864264e-06, -9.29616e-06, 2.2759305e-06, 4.2323486e-06, ...], 'sampling_rate': 48000}",maziwa hayo hutumika sana kama maziwa lala
6,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_28748951.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_28748951.mp3', 'array': [-2.1230822e-08, 1.3050334e-08, 1.8151752e-08, -3.176808e-08, 1.0266302e-09, 4.0967482e-08, -3.344724e-08, -2.7060414e-08, 6.4254635e-08, -1.5188792e-08, -7.025631e-08, 7.359241e-08, 3.2043463e-08, -1.1722196e-07, 4.985759e-08, 1.0798698e-07, -1.4449782e-07, -2.3347907e-08, 1.9598754e-07, -1.2040503e-07, -1.4873558e-07, 2.600356e-07, -1.5707004e-08, -3.0456735e-07, 2.495301e-07, 1.8083888e-07, -4.3919104e-07, 1.1285589e-07, 4.4688537e-07, -4.7487205e-07, -1.8313725e-07, 7.166971e-07, -3.2071117e-07, -6.3443616e-07, 8.798609e-07, 1.13132046e-07, -1.1901271e-06, 7.802035e-07, 9.2296324e-07, -1.7577239e-06, 1.666664e-07, 2.3301855e-06, -2.224334e-06, -1.754231e-06, 5.764305e-06, -2.4878939e-06, -2.3877134e-05, -2.3877134e-05, -2.4878939e-06, 5.764305e-06, -1.754231e-06, -2.224334e-06, 2.3301855e-06, 1.666664e-07, -1.7577239e-06, 9.2296324e-07, 7.802035e-07, -1.1901271e-06, 1.13132046e-07, 8.798609e-07, -6.3443616e-07, -3.2071117e-07, 7.166971e-07, -1.8313725e-07, -4.7487205e-07, 4.4688537e-07, 1.1285589e-07, -4.3919104e-07, 1.8083888e-07, 2.495301e-07, -3.0456735e-07, -1.5707004e-08, 2.600356e-07, -1.4873558e-07, -1.2040503e-07, 1.9598754e-07, -2.3347907e-08, -1.4449782e-07, 1.0798698e-07, 4.985759e-08, -1.1722196e-07, 3.2043463e-08, 7.359241e-08, -7.025631e-08, -1.5188792e-08, 6.4254635e-08, -2.7060414e-08, -3.344724e-08, 4.0967482e-08, 1.0266302e-09, -3.176808e-08, 1.8151752e-08, 1.3050334e-08, -2.1230822e-08, 2.865725e-09, 1.3874331e-08, -1.0184482e-08, -4.067796e-09, 9.61578e-09, -2.662255e-09, ...], 'sampling_rate': 48000}",kila ukoo una alama tofauti
7,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29912431.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29912431.mp3', 'array': [-2.4220951e-06, -2.3893184e-05, -2.395144e-05, -2.407395e-06, 5.797421e-06, -1.8851663e-06, -2.1636322e-06, 2.4506592e-06, -5.5880247e-09, -1.776081e-06, 1.1553759e-06, 6.223396e-07, -1.3591018e-06, 4.412732e-07, 8.3653885e-07, -1.0078659e-06, 2.0099028e-08, 9.1494167e-07, -7.5347833e-07, -2.8150924e-07, 9.960158e-07, -5.4129157e-07, -6.0978056e-07, 1.1054076e-06, -2.4477401e-07, -1.0521114e-06, 1.1501287e-06, 2.864237e-07, -1.5870291e-06, 9.337791e-07, 1.160301e-06, -2.0479897e-06, 1.9069117e-07, 2.3836837e-06, -2.0955952e-06, -1.4141482e-06, 3.835515e-06, -1.0733619e-06, -4.5931706e-06, 5.2857695e-06, 3.7684722e-06, -1.6885997e-05, -2.40618e-05, -1.2111767e-05, -3.341724e-06, 7.0990577e-06, 2.200451e-05, 1.8687819e-05, 7.073221e-06, 1.644854e-05, 2.5059944e-05, 6.3773846e-06, -8.682225e-06, 5.735113e-07, 5.143648e-06, -3.12986e-06, -2.0668276e-06, 3.5896503e-06, -4.3168512e-07, -2.684952e-06, 1.9534352e-06, 1.1275835e-06, -2.3796435e-06, 4.358745e-07, 1.890561e-06, -1.5561952e-06, -8.3112604e-07, 2.0192167e-06, -4.354004e-07, -1.7862891e-06, 1.6042151e-06, 8.986506e-07, -2.4195535e-06, 5.9993465e-07, 2.601476e-06, -2.747377e-06, -1.6492132e-06, 6.123671e-06, -2.7963795e-06, -2.3975459e-05, -2.356822e-05, -2.6004243e-06, 5.587566e-06, -1.5558801e-06, -2.2068052e-06, 2.163452e-06, 2.5516357e-07, -1.6821003e-06, 8.0834076e-07, 7.9177187e-07, -1.1094928e-06, 5.55416e-08, 8.537421e-07, -5.7554934e-07, -3.3658137e-07, 6.826828e-07, -1.5125126e-07, -4.690685e-07, 4.2048174e-07, 1.2425365e-07, ...], 'sampling_rate': 48000}",akaamua kurejea na kuunda jeshi lake ili kupambana na mwamubambe
8,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_30196689.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_30196689.mp3', 'array': [2.6987577e-08, -8.819546e-09, -2.8886287e-08, 3.4383444e-08, 1.0872631e-08, -5.3289295e-08, 2.7500203e-08, 4.7193176e-08, -7.181614e-08, -4.5112105e-09, 9.4462806e-08, -6.70657e-08, -6.7017794e-08, 1.3520128e-07, -2.0068068e-08, -1.5228875e-07, 1.4092461e-07, 8.02021e-08, -2.3394696e-07, 7.943052e-08, 2.2540034e-07, -2.6748972e-07, -7.162085e-08, 3.78911e-07, -1.995678e-07, -3.083682e-07, 4.7483965e-07, 1.4805353e-08, -5.873081e-07, 4.2673227e-07, 3.9204093e-07, -8.198235e-07, 1.429553e-07, 9.0465403e-07, -8.734217e-07, -4.6511312e-07, 1.4785996e-06, -5.586609e-07, -1.524151e-06, 2.0071818e-06, 5.169049e-07, -3.526661e-06, 2.3790324e-06, 4.682406e-06, -1.3663108e-05, -2.8006267e-05, -1.3670866e-05, 4.6934333e-06, 2.3760174e-06, -3.5398102e-06, 5.348993e-07, 2.0089976e-06, -1.550744e-06, -5.3865796e-07, 1.4976126e-06, -5.064559e-07, -8.655958e-07, 9.5144895e-07, 9.722313e-08, -8.439198e-07, 4.6737986e-07, 3.9999478e-07, -6.6041224e-07, 1.02429155e-07, 4.990579e-07, -4.3359952e-07, -1.3566377e-07, 4.835913e-07, -2.2737021e-07, -2.7802218e-07, 4.2191283e-07, -5.5170194e-08, -3.7061895e-07, 3.462486e-07, 1.052436e-07, -4.482819e-07, 2.497437e-07, 2.8936216e-07, -5.183167e-07, 9.60981e-08, 5.2830603e-07, -5.602022e-07, -1.714736e-07, 8.5702436e-07, -5.400965e-07, -6.668264e-07, 1.4180949e-06, -4.322266e-07, -1.9497838e-06, 3.4253003e-06, -2.3657239e-07, -2.2132303e-05, -2.9655903e-05, 1.5419012e-08, 2.9642968e-05, 2.211086e-05, 2.6944792e-07, -3.4289724e-06, 1.9138079e-06, 4.6708075e-07, ...], 'sampling_rate': 48000}",tuzo hizo zilitolewa katika tamasha tanzania
9,/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29980288.mp3,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/53d30892a707d521ef30c6c48f4ea9f8232daea92c37971ac64fb135aa3d6acf/cv-corpus-8.0-2022-01-19/sw/clips/common_voice_sw_29980288.mp3', 'array': [2.5956298e-08, -2.4417197e-09, -3.221612e-08, 2.8666735e-08, 1.9267498e-08, -5.217344e-08, 1.6125e-08, 5.406853e-08, -6.262707e-08, -1.9732052e-08, 9.415882e-08, -4.808132e-08, -8.0454726e-08, 1.2173867e-07, 6.139801e-09, -1.5501836e-07, 1.1110849e-07, 1.053196e-07, -2.163368e-07, 3.6226577e-08, 2.3603387e-07, -2.2343924e-07, -1.1709239e-07, 3.5999062e-07, -1.308425e-07, -3.3758494e-07, 4.145743e-07, 9.628133e-08, -5.780236e-07, 3.2080416e-07, 4.657275e-07, -7.503705e-07, -6.572588e-09, 9.471273e-07, -7.2052757e-07, -6.656424e-07, 1.4718134e-06, -2.6139642e-07, -1.8704822e-06, 1.9459642e-06, 1.3589523e-06, -4.9567766e-06, 2.274237e-06, 2.2634986e-05, 2.6113652e-05, 2.3865548e-06, -2.496329e-05, -3.5792942e-05, -2.825878e-05, -1.0235922e-05, 2.7572498e-06, 1.9199447e-06, -2.1205642e-06, -1.0145356e-07, 1.4430802e-06, -7.1282574e-07, -6.6160254e-07, 9.236632e-07, -2.9416748e-08, -7.1176976e-07, 4.5291776e-07, 2.9834968e-07, -5.503732e-07, 9.5443205e-08, 3.9158962e-07, -3.2116023e-07, -1.2224562e-07, 3.4034136e-07, -1.11665955e-07, -2.1021731e-07, 2.2306149e-07, 3.3159594e-08, -2.0346864e-07, 9.9979275e-08, 1.0354628e-07, -1.4594417e-07, 6.9966006e-09, 1.13648746e-07, -7.634523e-08, -4.3802885e-08, 8.823563e-08, -1.9785539e-08, -5.7727924e-08, 5.126691e-08, 1.37472345e-08, -4.852899e-08, 1.9106132e-08, 2.5821047e-08, -3.0473565e-08, -1.1510733e-09, 2.3846184e-08, -1.3600261e-08, -9.643542e-09, 1.5941238e-08, -2.4668698e-09, -1.0206873e-08, 7.896811e-09, 2.6542921e-09, -7.220514e-09, 2.3926474e-09, ...], 'sampling_rate': 48000}",wameanza kuweka mikakati kuhakikisha urithi huu unatunzwa


In [15]:
# Show the remaining columns and the row count of the test split
common_voice_test

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence'],
    num_rows: 8941
})

### Data Preprocessing

#### Vocabulary to be used

In [18]:
# Characters the model works with: the 26 lower-case Latin letters, the
# apostrophe, '?', '!' and the space.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")

# Mapping characters to integers. The empty string is the out-of-vocabulary
# token, so any character outside `characters` shares that single id.
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")

# Mapping integers to characters, built from the same vocabulary so that both
# directions agree.
num_to_char = keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True)


# The message previously lacked the closing parenthesis and the space between
# its two parts.
print(
    f"The vocabulary is: {char_to_num.get_vocabulary()} "
    f"(size = {char_to_num.vocabulary_size()})"
)

The vocabulary is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', ' '](size = 31


In [35]:
# Directory the .wav clips are read from, and the tab-separated metadata table
wavs_path = "/content/wav/"
train_df_path = "/content/train.tsv"

# Only the clip file name and its transcript are needed downstream
train_df = pd.read_csv(train_df_path, sep='\t').loc[:, ['path', 'sentence']]
train_df.head()

Unnamed: 0,path,sentence
0,common_voice_sw_29914941.mp3,au mabadiliko ya utandawazi yanachangia katika...
1,common_voice_sw_29914942.mp3,james aliibuka mfungaji wa bao bora la mashind...
2,common_voice_sw_29914944.mp3,Haya yanapaswa kutunzwa na kutumiwa ipasavyo k...
3,common_voice_sw_29914946.mp3,Kitaifa maadhimisho haya yalifanyika katika Ki...
4,common_voice_sw_29914947.mp3,Ingawa ina fonimu nyingi za Kikushi


In [36]:
# Window length in samples
frame_length = 256

# Number of samples to step between consecutive windows
frame_step = 160

# Size of the Fast Fourier Transform (FFT) to apply
fft_length = 384


def encode_single_sample(wav_file, label):
  """Turn one (clip name, transcript) pair into tensors for training.

  Args:
    wav_file: String tensor naming the clip. A bare stem, a file name ending
      in `.mp3`/`.wav`, or an absolute path to such a file is accepted; in
      every case the clip is read from `wavs_path + <stem> + ".wav"`.
    label: String tensor holding the transcript.

  Returns:
    A `(spectrogram, label)` tuple: the normalised magnitude spectrogram of the
    clip and the transcript encoded as integer ids by `char_to_num`.
  """
  ## Process the Audio

  # 1. Read wav file
  # The metadata stores names such as "clip.mp3" and the Hugging Face split
  # stores absolute cache paths; appending ".wav" to those directly yields
  # paths like "<wavs_path>clip.mp3.wav". Reduce the name to its stem first.
  # NOTE(review): assumes the clips live at wavs_path/<stem>.wav - confirm.
  clip_name = tf.strings.regex_replace(wav_file, r"^/.*/", "")
  clip_name = tf.strings.regex_replace(clip_name, r"\.(mp3|wav)$", "")
  file = tf.io.read_file(wavs_path + clip_name + ".wav")

  # 2. Decode the wav file and drop the trailing channel axis
  audio, _ = tf.audio.decode_wav(file)
  audio = tf.squeeze(audio, axis=-1)

  # 3. Change type to float
  audio = tf.cast(audio, tf.float32)

  # 4. Get the spectrogram
  spectrogram = tf.signal.stft(
      audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
  )

  # 5. Keep only the magnitude (tf.abs), then take its square root
  spectrogram = tf.abs(spectrogram)
  spectrogram = tf.math.pow(spectrogram, 0.5)

  # 6. Normalisation along axis 1; the small epsilon avoids division by zero
  means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
  stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
  spectrogram = (spectrogram - means) / (stddevs + 1e-10)

  # Process the label

  # 7. Convert labels to lower case
  label = tf.strings.lower(label)

  # 8. Split the label into individual characters
  label = tf.strings.unicode_split(label, input_encoding='UTF-8')

  # 9. Map the characters in label to numbers
  label = char_to_num(label)

  # 10. Return the (spectrogram, label) pair
  return spectrogram, label


#### Create dataset objects


Create `tf.data.Dataset` objects that yield the transformed elements in the same order as they appear in the input.

In [37]:
batch_size = 32


def _as_padded_batches(file_names, transcripts):
  """Build the input pipeline for one split.

  Pairs each file name with its transcript, encodes every pair with
  `encode_single_sample`, groups the results into padded batches of
  `batch_size` and prefetches them.
  """
  pairs = tf.data.Dataset.from_tensor_slices((list(file_names), list(transcripts)))
  encoded = pairs.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
  batched = encoded.padded_batch(batch_size)
  return batched.prefetch(buffer_size=tf.data.AUTOTUNE)


# Define training dataset (file names and transcripts come from train.tsv)
train_dataset = _as_padded_batches(train_df['path'], train_df['sentence'])

# Define the validation dataset
# NOTE(review): these paths come from the Hugging Face split while the training
# names come from train.tsv - verify both resolve to files under wavs_path.
validation_dataset = _as_padded_batches(common_voice_test['path'], common_voice_test['sentence'])

In [None]:
# Peek at the first few clip paths only; the full column has thousands of entries
common_voice_test['path'][:5]

#### Visualise the Data

In [40]:
# The bare name `display` in this notebook is the function imported from
# IPython.display, so `display.display(...)` / `display.Audio(...)` would raise
# AttributeError. A module alias is used instead; it also avoids shadowing the
# `Audio` name imported from `datasets`.
import IPython.display as ipd

fig = plt.figure(figsize=(8,5))

for batch in train_dataset.take(1):
  # First example of the first batch; padded_batch pads with zeros, which
  # trim_zeros strips from both ends of every row of the transposed array.
  spectogram = batch[0][0].numpy()
  spectogram = np.array([np.trim_zeros(x) for x in np.transpose(spectogram)])
  label = batch[1][0]

  # Spectrogram, titled with the transcript decoded back from integer ids
  label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
  ax = plt.subplot(2, 1, 1)
  ax.imshow(spectogram, vmax=1)
  ax.set_title(label)
  ax.axis("off")

  # Wav
  # The metadata stores names such as "clip.mp3"; drop the extension before
  # appending ".wav" so the path is not "<wavs_path>clip.mp3.wav".
  # NOTE(review): assumes the clips live at wavs_path/<stem>.wav - confirm.
  clip_stem = re.sub(r"\.(mp3|wav)$", "", list(train_df['path'])[0])
  file = tf.io.read_file(wavs_path + clip_stem + ".wav")
  audio, sample_rate = tf.audio.decode_wav(file)
  audio = audio.numpy()
  ax = plt.subplot(2, 1, 2)
  ax.plot(audio)
  ax.set_title("Signal Wave")
  ax.set_xlim(0, len(audio))
  # Play back at the rate stored in the file rather than a hard-coded 16 kHz
  ipd.display(ipd.Audio(np.transpose(audio), rate=int(sample_rate.numpy())))

plt.show()

NotFoundError: ignored

<Figure size 576x360 with 0 Axes>