<a href="https://colab.research.google.com/github/iamab3/Basics-of-Speech-Recognition/blob/main/Transcribing_Audio_with_Google_Web_Speech_Speech_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt

#Library for audio analysis
import librosa
import librosa.display
import soundfile as sf
import speech_recognition as sr

from jiwer import wer, cer
from IPython.display import Audio

import whisper

import csv
import os
import tempfile
import wave

from gtts import gTTS

In [None]:
# Load the audio file as a waveform array together with its sampling rate.
# sr=None asks librosa to keep the file's native sampling rate instead of resampling.
audio_signal, sample_rate = librosa.load('speech_01.wav', sr=None)

In [None]:
# Show the sampling rate that librosa.load returned for the file.
sample_rate

In [None]:
# Display the audio by plotting amplitude against time with matplotlib
fig, ax = plt.subplots(figsize=(12, 4))
librosa.display.waveshow(audio_signal, sr=sample_rate, ax=ax)
ax.set_title('Waveform')
ax.set_xlabel('time(s)')
ax.set_ylabel('Amplitude')
plt.show()

# Also render an inline player so the clip can be heard.
Audio('speech_01.wav')

Using SpeechRecognition Library: Google Web Speech API

In [None]:
# Recognizer instance used by transcribe_audio to record and recognize audio.
recognizer = sr.Recognizer()

In [None]:
# Path of the recording that is transcribed and played back in the cells below.
file_path = 'speech_01.wav'

In [None]:
# Function to transcribe audio to text
def transcribe_audio(file_path):
  """Transcribe an audio file with the Google Web Speech API.

  Reads the whole file through the module-level ``recognizer``, sends the
  recorded audio to ``recognize_google``, prints the transcript and returns it.

  Args:
    file_path: Path of the audio file to open with ``sr.AudioFile``.

  Returns:
    The transcript returned by ``recognizer.recognize_google``.
  """
  # Only the recording step needs the file to be open.
  with sr.AudioFile(file_path) as audio_source:
    recorded = recognizer.record(audio_source)

  transcript = recognizer.recognize_google(recorded)
  print(transcript)
  return transcript

# Transcribe the original recording; the text is scored against the ground truth below.
transcribed_text = transcribe_audio(file_path)

Word Error Rate (WER) and Character Error Rate (CER)

In [None]:
# Reference text that every transcription in this notebook is scored against (WER / CER).
ground_truth = """My name is Ivan and I am excited to have you as part of our learning community!
Before we get started, I’d like to tell you a little bit about myself. I’m a sound engineer turned data scientist,
curious about machine learning and Artificial Intelligence. My professional background is primarily in media production,
with a focus on audio, IT, and communications"""

In [None]:
# Score the Google Web Speech transcript against the reference text.
# NOTE(review): the raw strings are passed straight to jiwer, so case and
# punctuation differences presumably count as errors — confirm whether
# normalization is wanted before comparing engines.
calculated_wer, calculated_cer = (
    metric(ground_truth, transcribed_text) for metric in (wer, cer)
)
print(f"Word Error Rate (WER): {calculated_wer:.4f}")
print(f"Character Error Rate (CER): {calculated_cer:.4f}")

Now, let's remove the noise. To do that, we first convert the amplitude-time signal into the frequency domain to obtain a spectrogram.

In [None]:
# Short-time Fourier transform of the waveform: moves the signal from the time domain
# to a time-frequency representation.
S = librosa.stft(audio_signal)

In [None]:
# Convert the STFT magnitudes to decibels, using the largest magnitude as the reference.
S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)

In [None]:
# Sanity check: S_db was computed with ref=np.max, so its maximum is expected to be 0.
np.max(S_db)

In [None]:
# Plot the spectrogram of the original signal (time on x, log-scaled frequency on y).
plt.figure(figsize=(12,4))
librosa.display.specshow(data=S_db, sr = sample_rate, x_axis='time', y_axis='log')
plt.colorbar(format = '%+2.0f db')
plt.title('Spectrogram')
# pyplot has no xaxis()/yaxis() functions (calling them raises AttributeError);
# axis labels are set with xlabel()/ylabel().
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.show()

Dealing with Background Noise

In [None]:
# Apply a pre-emphasis filter (coefficient 0.97) to cut down the low frequencies,
# then save the filtered signal at the original sampling rate.
signal_filtered = librosa.effects.preemphasis(audio_signal, coef=0.97)
output_file = 'filtered_speech_01.wav'
sf.write(output_file, signal_filtered, sample_rate)

In [None]:
# Play the original file (inline player) for comparison with the filtered version.
print('Playing the original audio:')
Audio(file_path)

In [None]:
# Play the filtered file (inline player); output_file is the path the filtered signal was written to.
print('Playing the filtered audio:')
Audio(output_file)

In [None]:
# Recompute the dB-scaled spectrogram, this time for the filtered signal.
S = librosa.stft(signal_filtered)
S_db = librosa.amplitude_to_db(abs(S), ref = np.max)

# Plot the spectrogram of the filtered signal (time on x, log-scaled frequency on y).
plt.figure(figsize=(12,4))
librosa.display.specshow(data=S_db, sr = sample_rate, x_axis='time', y_axis='log')
plt.colorbar(format = '%+2.0f db')
plt.title('Spectrogram')
# pyplot has no xaxis()/yaxis() functions (calling them raises AttributeError);
# axis labels are set with xlabel()/ylabel().
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Transcribe the filtered recording so its error rates can be compared with the original's.
transcribed_text_filtered = transcribe_audio('filtered_speech_01.wav')

In [None]:
# Quantitative assessment of the results for the filtered recording
calculated_wer, calculated_cer = (
    metric(ground_truth, transcribed_text_filtered) for metric in (wer, cer)
)
print(f"Word Error Rate (WER): {calculated_wer:.4f}")
print(f"Character Error Rate (CER): {calculated_cer:.4f}")

Transcribing Audio with OpenAI's Whisper

In [None]:
# Load Whisper's "base" model; it is reused by every Whisper transcription below.
model = whisper.load_model("base")

In [None]:
# Transcribe the original recording with Whisper; the result is indexed by 'text' and 'language' below.
result = model.transcribe(file_path)

In [None]:
# Keep the transcript text for scoring and display it.
transcribed_text_whisper = result['text']
transcribed_text_whisper

In [None]:
# Show the 'language' entry of the Whisper result.
result['language']

In [None]:
# Quantitative assessment of the results for the Whisper transcript
calculated_wer, calculated_cer = (
    metric(ground_truth, transcribed_text_whisper) for metric in (wer, cer)
)
print(f"Word Error Rate (WER): {calculated_wer:.4f}")
print(f"Character Error Rate (CER): {calculated_cer:.4f}")

Transcribing multiple audio files from a directory

In [None]:
# Folder containing the .wav files to transcribe in bulk.
# The location can be overridden with the SPEECH_RECOGNITION_DIR environment variable
# so the notebook is not tied to one machine; the original path remains the default.
directory_path = os.environ.get("SPEECH_RECOGNITION_DIR", "C:/Users/Downloads/Speech Recognition")

In [None]:
def transcribe_directory_whisper(directory_path):
  """Transcribe every WAV file in a directory with the loaded Whisper model.

  Files are processed in sorted name order so the output (and any numbering
  derived from it) is the same on every run; ``os.listdir`` alone returns
  entries in arbitrary order. The extension check ignores case, so ``.WAV``
  files are picked up as well.

  Args:
    directory_path: Directory to scan (not recursive).

  Returns:
    A list of ``{"file_name": ..., "transcription": ...}`` dicts, one per
    WAV file, ordered by file name.
  """
  transcriptions = []
  for file_name in sorted(os.listdir(directory_path)):
    if not file_name.lower().endswith(".wav"):
      continue
    files_path = os.path.join(directory_path, file_name)
    # Relies on the module-level Whisper `model`.
    result = model.transcribe(files_path)
    transcriptions.append({"file_name": file_name, "transcription": result["text"]})
  return transcriptions

In [None]:
# Transcribe every WAV file in directory_path and display the resulting list of records.
transcriptions = transcribe_directory_whisper(directory_path)
transcriptions

Saving audio transcriptions to csv file

In [None]:
output_file = "transcriptions.csv"

# Write a header plus one row per transcribed file, numbered from 1.
# encoding="utf-8" is explicit so non-ASCII characters in a transcription do not
# depend on (or fail under) the platform's default text encoding.
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
  writer = csv.writer(file)
  writer.writerow(["Track Number", "File Name", "Transcription"])
  writer.writerows(
    [number, entry["file_name"], entry["transcription"]]
    for number, entry in enumerate(transcriptions, start=1)
  )

Reversing the process: Text to Speech

In [None]:
text = """ Thank you for taking the time out to learn about speech recognition! I'm excited. But this concludes our lesson. See you soon! """

# Synthesize the text as English speech and save it as an MP3 file.
tts = gTTS(text = text, lang = 'en')
tts.save("output.mp3")

# Play the result with the inline player used elsewhere in this notebook.
# The previous os.system("start ...") call only works on Windows and does not
# raise when the command is missing, so on Colab or Linux nothing was played.
Audio("output.mp3")