**Step 1: Installing Required Libraries and Dependencies**

In [1]:
!pip install transformers datasets torchaudio phonemizer jiwer
!pip install datasets torchaudio
!apt-get install -y espeak
!pip install ipython
!pip install gtts

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting segments (from phonemizer)
  Downloading segments-2.2.1-py2.py3-none-any.whl.metadata (3.3 kB)
Coll

**Step 2: Imports and Initial Setup**

In [3]:
import re
from itertools import zip_longest

import IPython.display as ipd
import numpy as np
import torch
import torchaudio
from gtts import gTTS
from jiwer import wer, cer
from phonemizer import phonemize
from phonemizer.separator import Separator
from transformers import Trainer, TrainingArguments
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC


**Step 3: Load Wav2Vec2 Processor and Model**

In [4]:
# Single source of truth for the checkpoint id, so the processor (feature extractor +
# tokenizer) and the CTC model are always loaded from the same pretrained weights.
MODEL_CHECKPOINT = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(MODEL_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_CHECKPOINT)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def compute_metrics(pred):
    """Compute word and character error rates for a batch of CTC predictions.

    Args:
        pred: Object exposing ``predictions`` (per-frame logits) and ``label_ids``
            (reference token ids), as handed to ``Trainer``'s ``compute_metrics``.

    Returns:
        dict: ``{"wer": <word error rate>, "cer": <character error rate>}``.
    """
    # Greedy CTC decoding: pick the most likely token for every frame.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Assumes padded label positions carry the -100 "ignore" sentinel (no data collator
    # is visible in this notebook -- TODO confirm). -100 is not a valid vocabulary id,
    # so map it to the pad token before decoding. np.where builds a new array, leaving
    # the caller's ``label_ids`` untouched.
    label_ids = np.asarray(pred.label_ids)
    label_ids = np.where(label_ids == -100, processor.tokenizer.pad_token_id, label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # Reference labels are plain token sequences, so repeated tokens must not be collapsed.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    return {
        "wer": wer(label_str, pred_str),
        "cer": cer(label_str, pred_str),
    }


In [6]:
# Trainer configuration for the pretrained model.
# NOTE(review): no train/eval dataset is passed and `trainer` is not used by any of the
# inference cells visible in this notebook.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer` to `processing_class` -- confirm against the
# installed version before switching.
training_args = TrainingArguments(
    output_dir="./wav2vec2-librispeech",  # Where checkpoints and logs are written
    evaluation_strategy="no",  # No evaluation dataset is supplied, so never run evaluation
    num_train_epochs=1,
    per_device_train_batch_size=1,  # Per-device *training* batch size
    logging_steps=10,
    learning_rate=1e-4,
    save_steps=500,
    save_total_limit=2,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes this cell
    # fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [7]:
def load_audio(file_path, target_sample_rate=16000):
    """Load an audio file as a mono, 1-D waveform at ``target_sample_rate``.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sample_rate: Sample rate (Hz) of the returned waveform. Defaults to
            16000, the rate the rest of this notebook passes to the processor.

    Returns:
        The waveform with the channel dimension removed (shape ``[T]``).
    """
    waveform, sample_rate = torchaudio.load(file_path)
    print("Original Waveform shape:", waveform.shape)
    print("Original Sample rate:", sample_rate)

    # Resample only when the file's rate differs from the requested one.
    if sample_rate != target_sample_rate:
        resample_transform = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resample_transform(waveform)

    # Printed unconditionally, i.e. also when no resampling was necessary.
    print("Resampled Waveform shape:", waveform.shape)

    # Down-mix multi-channel audio to mono by averaging over the channel dimension.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Drop the leading channel dimension: [1, T] -> [T].
    return waveform.squeeze(0)


In [8]:
def process_audio(file_path):
    """Load an audio file and convert it into inputs for the Wav2Vec2 model.

    Args:
        file_path: Path of the audio file, forwarded to ``load_audio``.

    Returns:
        The processor output; its ``input_values`` attribute holds the PyTorch
        tensor requested via ``return_tensors="pt"``.
    """
    features = processor(
        load_audio(file_path),
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
    )
    print("Processed Input shape:", features.input_values.shape)
    return features


In [9]:
def predict_text(audio_path):
    """Transcribe an audio file with the Wav2Vec2 CTC model.

    Args:
        audio_path: Path of the audio file, forwarded to ``process_audio``.

    Returns:
        str: The decoded transcription of the (single) input utterance.
    """
    inputs = process_audio(audio_path)

    # Debugging inputs
    print("Shape of input_values:", inputs.input_values.shape)
    print("Data type of input_values:", inputs.input_values.dtype)

    # The model may no longer live on the CPU (e.g. after being handed to a Trainer on a
    # GPU runtime); feeding it CPU tensors would then raise a device-mismatch error.
    input_values = inputs.input_values.to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame; batch_decode yields the text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]


In [10]:
def text_to_phonemes(text):
    """Return the eSpeak phonemization of ``text``.

    A ``Separator`` built from a single space is passed to ``phonemize``; in the
    sample run below this yields one space-delimited phoneme group per word.
    """
    word_separator = Separator(" ")
    return phonemize(text, separator=word_separator, backend='espeak')


In [11]:
def generate_audio_feedback(text, filename=None):
    """Synthesize ``text`` to an MP3 file with gTTS and return the file's path.

    Args:
        text: Text to speak.
        filename: Output path. When omitted, a name is derived from ``text``
            (``"<sanitized text>.mp3"``), which is what callers that pass only
            ``text`` have always received. An explicitly supplied name is now
            honored instead of being silently overwritten.

    Returns:
        str: Path of the written MP3 file.
    """
    if filename is None:
        # Replace every character that is not a word character or '-' (spaces, path
        # separators, punctuation, ...) so the text cannot produce an invalid path.
        sanitized_text = re.sub(r"[^\w\-]", "_", text)
        filename = f"{sanitized_text}.mp3"

    tts = gTTS(text)
    tts.save(filename)
    return filename


***Providing Feedback***

In [12]:
def provide_feedback(audio_path, reference_text):
    """Transcribe a recording, score it against a reference and play corrections.

    Prints the transcription, the phonemizations of both texts and the WER/CER,
    plays a synthesized rendition of the full reference, then compares the
    phonemizations word by word and plays audio for every mismatching entry.

    Args:
        audio_path: Path of the recorded audio file.
        reference_text: The sentence the speaker was supposed to say.

    Returns:
        None. All results are printed or displayed.
    """
    transcription = predict_text(audio_path)
    reference_phonemes = text_to_phonemes(reference_text)
    predicted_phonemes = text_to_phonemes(transcription)

    # The model's transcription is upper-case (see the sample output), so score
    # case-insensitively; otherwise a lower-case reference counts every word as wrong.
    scored_reference = reference_text.upper()
    scored_transcription = transcription.upper()
    word_error_rate = wer(scored_reference, scored_transcription)
    char_error_rate = cer(scored_reference, scored_transcription)

    print("Transcription:", transcription)
    print("Reference Text:", reference_text)
    print("Reference Phonemes:", reference_phonemes)
    print("Predicted Phonemes:", predicted_phonemes)
    print(f"Word Error Rate (WER): {word_error_rate:.2f}")
    print(f"Character Error Rate (CER): {char_error_rate:.2f}")

    # Generate and play the correct pronunciation
    print("\nPlaying the correct pronunciation:")
    audio_file = generate_audio_feedback(reference_text)
    ipd.display(ipd.Audio(audio_file))

    # Word-by-word comparison of the two phonemizations.
    print("\nPronunciation Feedback:")
    reference_units = reference_phonemes.split()
    predicted_units = predicted_phonemes.split()
    reference_words = reference_text.split()
    # Only map a phoneme group back to its word when the counts line up one-to-one.
    words_aligned = len(reference_words) == len(reference_units)

    # zip_longest (not zip) so words the speaker omitted or added are still reported
    # instead of being silently dropped from the comparison.
    for index, (ref, pred) in enumerate(zip_longest(reference_units, predicted_units)):
        feedback = "Correct" if ref == pred else "Incorrect"
        shown_ref = "<missing>" if ref is None else ref
        shown_pred = "<missing>" if pred is None else pred
        print(f"Expected: {shown_ref}, Predicted: {shown_pred} - {feedback}")
        # An extra predicted word has no reference entry, hence nothing to play.
        if feedback == "Incorrect" and ref is not None:
            print(f"Playing correct pronunciation for phoneme: {ref}")
            # Speak the reference *word*: the TTS engine is fed ordinary text, and
            # the IPA string is not ordinary text. Fall back to the phoneme string
            # when words and phoneme groups cannot be aligned.
            spoken_text = reference_words[index] if words_aligned else ref
            phoneme_audio = generate_audio_feedback(spoken_text)
            ipd.display(ipd.Audio(phoneme_audio))


In [14]:
from IPython.display import display, HTML


# In-notebook audio recorder: two buttons driving the browser's MediaRecorder API.
# After stopping, an audio player and a download link for 'recorded_audio.wav' are
# appended to the cell output.
# NOTE(review): the link saves the file on the *browser's* machine; it presumably has to
# be uploaded to the runtime before the next cell can read it -- confirm.
html_code = '''
<!DOCTYPE html>
<html>
<body>

<h2>Click to Record Audio</h2>
<button onclick="startRecording()">Start Recording</button>
<button onclick="stopRecording()">Stop Recording</button>

<script>
let mediaRecorder;
let audioChunks = [];

function startRecording() {
    navigator.mediaDevices.getUserMedia({ audio: true })
    .then(stream => {
        // Start from an empty buffer so a second take does not contain the first one.
        audioChunks = [];
        const recorder = new MediaRecorder(stream);
        mediaRecorder = recorder;
        recorder.ondataavailable = event => {
            audioChunks.push(event.data);
        };
        recorder.onstop = () => {
            // Release the microphone once the recording is finished.
            stream.getTracks().forEach(track => track.stop());
            // Label the blob with the container the recorder actually produced.
            const audioBlob = new Blob(audioChunks, { type: recorder.mimeType || 'audio/wav' });
            const audioUrl = URL.createObjectURL(audioBlob);
            const audio = new Audio(audioUrl);
            audio.controls = true;
            document.body.appendChild(audio);
            const link = document.createElement('a');
            link.href = audioUrl;
            link.download = 'recorded_audio.wav';
            link.textContent = 'Download Recorded Audio';
            document.body.appendChild(link);
        };
        recorder.start();
    })
    .catch(error => {
        console.error("Error accessing the microphone", error);
    });
}

function stopRecording() {
    // Ignore clicks when nothing is being recorded (never started / already stopped).
    if (mediaRecorder && mediaRecorder.state !== 'inactive') {
        mediaRecorder.stop();
    }
}
</script>

</body>
</html>
'''

display(HTML(html_code))


In [15]:
# Sentence the speaker was asked to read, and the recording to score against it.
# NOTE(review): the path presumably points at the file saved from the recorder cell and
# uploaded to the runtime -- confirm it exists before running.
reference_text = "CAN YOU GIVE ME THE RECEIPT"
audio_path = "/content/recorded_audio.wav"
provide_feedback(audio_path=audio_path, reference_text=reference_text)


Original Waveform shape: torch.Size([1, 256320])
Original Sample rate: 48000
Resampled Waveform shape: torch.Size([1, 85440])
Processed Input shape: torch.Size([1, 85440])
Shape of input_values: torch.Size([1, 85440])
Data type of input_values: torch.float32
Transcription: CAN YOU GIVE ME THE DICIPT
Reference Text: CAN YOU GIVE ME THE RECEIPT
Reference Phonemes: kæn juː ɡɪv miː ðə ɹɪsiːt 
Predicted Phonemes: kæn juː ɡɪv miː ðə dɪsɪpt 
Word Error Rate (WER): 0.17
Character Error Rate (CER): 0.11

Playing the correct pronunciation:



Pronunciation Feedback:
Expected: kæn, Predicted: kæn - Correct
Expected: juː, Predicted: juː - Correct
Expected: ɡɪv, Predicted: ɡɪv - Correct
Expected: miː, Predicted: miː - Correct
Expected: ðə, Predicted: ðə - Correct
Expected: ɹɪsiːt, Predicted: dɪsɪpt - Incorrect
Playing correct pronunciation for phoneme: ɹɪsiːt
