In [1]:
import os
import sys
import deepspeech
import numpy as np
import pandas as pd
import wave
from pydub import AudioSegment
import mysql.connector
import json
import io
import shutil
from datasets import load_dataset

In [2]:
from src import face_mgmt, audio_mgmt

In [3]:
# Load the pre-trained DeepSpeech model file (release 0.9.3, per the file name)
# from the local resources folder. `ds` is used below for speech-to-text.
ds = deepspeech.Model('resources/deepspeech/pretrained/deepspeech-0.9.3-models.pbmm')

In [4]:
# Constants
# Root folder under which the training audio clips and transcript files are kept.
AUDIO_TRAINING_PATH = 'resources/deepspeech/audio'

To train the DeepSpeech model on custom examples, we need to feed it audio files. Let us create some functions for reading and preprocessing audio files for training, and for storing them in the database.

Let's start with creating a cursor for testing purposes.

In [5]:
# Connect to the `chato_audio` database.
# Credentials are read from the environment so that no password is saved in
# the notebook. Export CHATO_DB_PASSWORD before starting the kernel; host and
# user fall back to the local development defaults.
conn_audio = mysql.connector.connect(
    host=os.environ.get('CHATO_DB_HOST', 'localhost'),
    user=os.environ.get('CHATO_DB_USER', 'root'),
    password=os.environ['CHATO_DB_PASSWORD'],  # raises KeyError if not set
    database='chato_audio',
    auth_plugin='mysql_native_password'
)

# Cursor handed to the audio_mgmt helpers in the cells below.
cursor_audio = conn_audio.cursor()

**Read all transcripts into a list**

In [12]:
# Read every transcript line into a list (readlines() keeps the trailing '\n').
# The file is decoded explicitly as UTF-8: with the platform default codec a
# typographic apostrophe in the text is read back as mojibake.
with open(os.path.join(AUDIO_TRAINING_PATH, 'kartik_manisha_transcripts.txt'), encoding='utf-8') as fh:
    transcripts = fh.readlines()

**Store into database**

In [13]:
# Inspect the loaded transcripts; each entry still ends with its newline.
transcripts

['I need one classic sandwich combo with fries and a drink.\n',
 'i need 2 piece signature chicken combo with fries and a bottle of water.\n',
 'I want 3-piece tender combo with fries and a drink.\n',
 'I will have one deluxe sandwich combo with fries.\n',
 'We want 12 piece nuggets combo with sweet heat sauce and fries and a fountain pop.\n',
 'We will have 8 piece chicken meal. What are the options for 1 large side? We will have 1 large fries . and it comes with 4 biscuits.\n',
 'Hey howâ€™s the day going? I will have 8 piece nuggets combo with fries. does the biscuit is included in it.and a drink.\n',
 'We will have 36 piece of nuggets only.no dipping sauce.\n',
 'Today we want to have 16 piece of chicken . it comes with 3 large sides right and 10 biscuits. We will have 2 large fries and 1 large gravy . \n',
 'I need one classic sandwich combo with gravy and a drink.\n',
 'i need 2 piece signature chicken combo with gravy and a bottle of water.\n',
 'I want 3-piece tender combo with

In [15]:
# Store one recording together with its transcript, then persist the insert.
# 004.wav is paired with transcripts[3] -- presumably clip N matches line N of
# the transcript file (1-based); confirm before storing other clips.
file = 'resources/deepspeech/audio/kartik_manisha_audio/004.wav'
audio_mgmt.store_into_database(file, transcripts[3], cursor_audio)
conn_audio.commit()

**Read from database**

In [20]:
# Fetch a stored recording through the audio cursor. The literal 6 is the first
# positional argument -- presumably the row id; confirm against
# audio_mgmt.read_wav_from_database.
row = audio_mgmt.read_wav_from_database(6, cursor=cursor_audio)

In [21]:
# First element of the fetched row: the audio samples (shown below as an int16 array).
row[0]

array([ 18770,  17990, -25692, ...,      7,      3,     -1], dtype=int16)

**Transcribe using DeepSpeech**

In [22]:
# Run speech-to-text on the samples fetched from the database.
# NOTE(review): `y` is the same audio array that is fed to the model, so the
# "Actual" line prints raw samples rather than a reference transcript. Confirm
# where read_wav_from_database returns the transcript and print that instead.
y = row[0]
y_pred = ds.stt(y)
print('Actual:\t', y)
print('Predicted:\t', y_pred)

Actual:	 [ 18770  17990 -25692 ...      7      3     -1]
Predicted:	 hey i won one capitine of the willavel


**Synthesize Text**

In [2]:
# Pass a sample order-total prompt to audio_mgmt.speak (presumably text-to-speech
# playback, per the section heading -- confirm in src/audio_mgmt).
audio_mgmt.speak('That will be 4$ 45 cents, to the window please')

**Store Image into database**

In [6]:
# Connect to the `chato_customer` database.
# Credentials are read from the environment so that no password is saved in
# the notebook. Export CHATO_DB_PASSWORD before starting the kernel; host and
# user fall back to the local development defaults.
conn_face = mysql.connector.connect(
    host=os.environ.get('CHATO_DB_HOST', 'localhost'),
    user=os.environ.get('CHATO_DB_USER', 'root'),
    password=os.environ['CHATO_DB_PASSWORD'],  # raises KeyError if not set
    database='chato_customer',
    auth_plugin='mysql_native_password'
)

# Cursor handed to the face_mgmt helpers in the cells below.
cursor_face = conn_face.cursor()

In [7]:
# Store a sample face image through the customer-database cursor.
# Nothing is committed here; the commit happens in the following cell.
test_face = 'resources/temp_faces/Anshu.jpg'
face_mgmt.store_into_database(test_face, cursor=cursor_face)

In [8]:
# Commit the pending transaction on the customer-database connection.
conn_face.commit()

**Test Listen**

In [6]:
# Capture audio with audio_mgmt.listen(); it prints "Listening..." and the
# returned value is an int16 array (see the next cell). Presumably this records
# from the microphone -- confirm in src/audio_mgmt.
speech = audio_mgmt.listen()

Listening...


In [7]:
# Inspect the captured samples.
speech

array([  0,   0,  -1, ..., -13, -39, -60], dtype=int16)

In [8]:
# Transcribe the samples returned by audio_mgmt.listen() with the pre-trained model.
ds.stt(speech)

'wit up ed anyon an i de i tet to ma'

**Prepare Data**

In [4]:
# Load the first 2400 examples of the English train split of Common Voice 11.0.
# To fetch the next batch, change the slice in `split` so that it starts at
# `current_i`, the number of audio files written so far.
common_voice_subset = load_dataset('mozilla-foundation/common_voice_11_0', 'en', split='train[:2400]')

In [8]:
# 2400 files have been written. Change `current_i` correspondingly when calling this function.
def move_audio_files(current_i, dataset):
    """Copy each audio file listed in ``dataset['path']`` into the training folder.

    Files are written as ``<AUDIO_TRAINING_PATH>/common_voice_audio/<n>.mp3``,
    where ``n`` starts at ``current_i`` and increases by one per file. The
    source files are copied, not removed.

    Args:
        current_i: Number used for the first destination file name; pass the
            count of files already written so earlier copies are not overwritten.
        dataset: Object whose ``'path'`` entry iterates over source file paths.
    """
    target_dir = f'{AUDIO_TRAINING_PATH}/common_voice_audio'
    # shutil.copy does not create missing parent folders.
    os.makedirs(target_dir, exist_ok=True)
    for index, source_path in enumerate(dataset['path'], start=current_i):
        # Copy the path taken from the dataset, not a notebook-level variable.
        shutil.copy(source_path, f'{target_dir}/{index}.mp3')

In [11]:
def write_transcripts(filename=f'{AUDIO_TRAINING_PATH}/common_voice_transcripts.txt',*, dataset):
    """Append every sentence in ``dataset['sentence']`` to a transcript file.

    Each sentence is written on its own line. The file is opened in append
    mode, so calling this again with the same data duplicates the lines.

    Args:
        filename: Transcript file to append to. The default is built from
            ``AUDIO_TRAINING_PATH`` once, when this function is defined.
        dataset: Keyword-only. Object whose ``'sentence'`` entry iterates over
            the transcript strings.
    """
    # Encode explicitly as UTF-8 so the output does not depend on the platform
    # default codec, which may not be able to represent every sentence.
    with open(filename, 'a', encoding='utf-8') as fh:
        fh.writelines(sentence + '\n' for sentence in dataset['sentence'])