<a href="https://colab.research.google.com/github/hpatil000/AI_Project/blob/main/Speech_to_text_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, utils, callbacks
import librosa
from keras.preprocessing.text import Tokenizer

import librosa.display
import matplotlib.pyplot as plt
import random



In [None]:
audio_dir = '/content/drive/My Drive/Colab Notebooks/dataset/book3/'
transcripts_file = '/content/drive/My Drive/Colab Notebooks/dataset/Book3.txt'

In [None]:
transcripts = {}
with open(transcripts_file, 'r') as f:
    for line in f:
        parts = line.strip().split('\t')
        transcripts[parts[0]] = parts[1]
print(transcripts)

{'LJ001-0001': 'Printing  in the only sense with which we are at present concerned differs from most if not from all the arts and crafts represented in the Exhibition', 'LJ001-0002': 'in being comparatively modern.in being comparatively modern.', 'LJ001-0003': 'For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands', 'LJ001-0004': 'produced the block books which were the immediate predecessors of the true printed book', 'LJ001-0005': 'the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.', 'LJ001-0006': 'And it is worth mention in passing that', 'LJ001-0007': 'the earliest book printed with movable types', 'LJ001-0008': 'has never been surpassed.', 'LJ001-0009': 'Printing may be considered as the art of making books by means of movable types.', 'LJ001-0010': 'Now as all books not primarily intended as picture-books cons

In [None]:
audio_data = []
transcription_data = []


In [None]:
for file_name in os.listdir(audio_dir):
    if file_name.endswith('.wav'):
        
        file_path = os.path.join(audio_dir, file_name)
        
        audio, sr = librosa.load(file_path, sr=None, mono=True)
    
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)

        mfccs = np.mean(mfccs.T, axis=0)

        audio_data.append(mfccs)
        file_id = os.path.splitext(file_name)[0]
        transcription = transcripts.get(file_id, '')
        
        
        transcription_data.append(transcription)
print(transcription_data)
print(audio_data)

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(transcription_data)
num_classes = len(tokenizer.word_index) + 1
transcription_data = tokenizer.texts_to_matrix(transcription_data, mode='binary')
print(transcription_data)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 1. 1. 1.]]


In [None]:
audio_data = np.array(audio_data)
transcription_data = np.array(transcription_data)
print(audio_data)
print(transcription_data)

[[-3.13856079e+02  7.97911911e+01 -1.26718531e+01 ...  1.32483339e+00
  -2.49886572e-01  1.29973650e+00]
 [-2.95998016e+02  7.07941284e+01 -9.36836052e+00 ...  2.11066651e+00
   2.65832067e-01  1.99329877e+00]
 [-2.99330139e+02  6.53351593e+01  1.08323593e+01 ...  1.36216831e+00
  -1.66670787e+00  1.94644228e-01]
 ...
 [-2.90156555e+02  7.17155533e+01  4.96193218e+00 ...  8.49238992e-01
  -2.23540640e+00  1.83953476e+00]
 [-2.91123596e+02  7.95055466e+01 -1.75643909e+00 ...  7.66244590e-01
  -9.73294556e-01  1.55638421e+00]
 [-2.81546112e+02  7.48598480e+01 -1.20056715e+01 ...  2.40486288e+00
  -7.69369006e-01  8.22053611e-01]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 1. 1. 1.]]


In [None]:
split_index = int(0.9 * len(audio_data))
train_audio = audio_data[:split_index]
print(transcription_data[:split_index])

train_transcription = transcription_data[:split_index]
test_audio = audio_data[split_index:]
test_transcription = transcription_data[split_index:]
#print(transcription_data)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]


In [None]:
model = models.Sequential()
model.add(layers.Dense(512, input_shape=(40,), activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(num_classes, activation='softmax'))

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
early_stopping_callback = callbacks.EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)

In [None]:
#print(train_audio)
#print(train_transcription)
model.fit(train_audio, train_transcription, validation_split=0.2, epochs=10, batch_size=32, callbacks=[early_stopping_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3bc1bb3fd0>

In [None]:
loss, accuracy = model.evaluate(test_audio, test_transcription)
print(f'Test loss: {loss:.3f}, test accuracy: {accuracy:.3f}')

Test loss: 14911.971, test accuracy: 0.857


In [None]:
num_samples = len(audio)
print("num_samples:", num_samples)
chunk_size = 10
num_chunks = int(np.ceil(num_samples  / ( sr * chunk_size) ))
num_chunks = 10
print("num_chunks:", num_chunks)
transcriptions = []
for i in range(num_chunks):
  start_index = i*sr*chunk_size
  end_index =  min(start_index +sr*chunk_size, num_samples)
  mfccs = librosa.feature.mfcc(y=audio[start_index:end_index], sr=sr, n_mfcc=40)
  mfccs = np.mean(mfccs.T, axis=0)
  prediction = model.predict(np.array([mfccs]))
  transcription = tokenizer.sequences_to_texts([[np.argmax(prediction)]])
  transcriptions.append(transcription)

print(transcription)

num_samples: 158109
num_chunks: 10




['the']
