In [None]:
# Python packages used by the ASR / phonemization cells in this notebook.
!pip install torch torchaudio transformers protobuf phonemizer
# System package: eSpeak is the phonemizer backend and is also invoked directly to synthesize audio.
!apt-get update && apt-get install -y espeak
# pypinyin is used by the Mandarin cell; phonemizer is already requested by the first command.
!pip install phonemizer pypinyin

Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11

In [None]:
#Krish's code:

import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa
import numpy as np

# Load Pretrained Wav2Vec2.0 Model
def load_wav2vec_model(model="facebook/wav2vec2-large-960h"):
    """Load a Wav2Vec2 processor and CTC model from a single checkpoint.

    Args:
        model: Checkpoint identifier handed to both ``from_pretrained`` calls.

    Returns:
        Tuple ``(processor, ctc_model)``; the processor is loaded first.
    """
    checkpoint = model
    return (
        Wav2Vec2Processor.from_pretrained(checkpoint),
        Wav2Vec2ForCTC.from_pretrained(checkpoint),
    )

# Load MMS Model (Uses Wav2Vec2ForCTC)
def load_mms_model(model="facebook/mms-1b-all"):
    """Load the MMS processor together with its Wav2Vec2-based CTC model.

    Args:
        model: Checkpoint identifier handed to both ``from_pretrained`` calls.

    Returns:
        Tuple ``(processor, ctc_model)``.
    """
    # AutoProcessor is not among this cell's imports, so import it locally;
    # otherwise the call fails with NameError unless another cell that
    # imports it happens to have been executed first.
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained(model)
    ctc_model = Wav2Vec2ForCTC.from_pretrained(model)  # MMS checkpoints load through Wav2Vec2ForCTC
    return processor, ctc_model

# Load and Preprocess Audio
def preprocess_audio(audio_path):
    """Read an audio file through librosa, requesting a 16 kHz sample rate.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        The sample array returned by ``librosa.load``; the accompanying
        sample-rate value is discarded.
    """
    waveform, _rate = librosa.load(audio_path, sr=16000)
    return waveform

# Run Wav2Vec2.0 CTC Model
def run_wav2vec(audio_path, model=""):
    """Transcribe an audio file with a Wav2Vec2 CTC model (greedy decoding).

    Args:
        audio_path: Path of the audio file, read via ``preprocess_audio``.
        model: Checkpoint identifier; the empty string means "use the
            default of ``load_wav2vec_model``".

    Returns:
        The first decoded string from ``processor.batch_decode``.
    """
    # Forward the checkpoint only when the caller supplied one.
    loader_args = (model,) if model != "" else ()
    processor, ctc_model = load_wav2vec_model(*loader_args)

    waveform = preprocess_audio(audio_path)
    input_values = processor(
        waveform,
        return_tensors="pt",
        padding=True,
        sampling_rate=16000,
    ).input_values

    with torch.no_grad():
        frame_logits = ctc_model(input_values).logits

    # Pick the highest-scoring token per frame, then let the processor decode.
    best_ids = torch.argmax(frame_logits, dim=-1)
    decoded = processor.batch_decode(best_ids)
    return decoded[0]

# Run MMS Model (Wav2Vec2ForCTC-based)
def run_mms(audio_path):
    """Transcribe an audio file with the MMS CTC model (greedy decoding).

    Args:
        audio_path: Path of the audio file, read via ``preprocess_audio``.

    Returns:
        The first decoded string from ``processor.batch_decode``.
    """
    processor, ctc_model = load_mms_model()
    waveform = preprocess_audio(audio_path)
    encoded = processor(waveform, return_tensors="pt", padding=True, sampling_rate=16000)
    input_values = encoded.input_values

    with torch.no_grad():
        frame_logits = ctc_model(input_values).logits

    # Highest-scoring token per frame, decoded by the processor.
    best_ids = torch.argmax(frame_logits, dim=-1)
    return processor.batch_decode(best_ids)[0]

# Testing the models for ASR
# (the transcription calls below are temporarily commented out)

print("\n\n\n")
audio_path = "CorrectSentence.wav"  # Replace with audio path
# print("Wav2Vec2.0 Transcription:", run_wav2vec(audio_path))
# print("MMS Transcription:", run_mms(audio_path))

# Testing the models for mispronunciations
print("\n\n")
audio_path = "Misprounciations.wav"  # Replace with audio path
# print("Wav2Vec2.0 Transcription:", run_wav2vec(audio_path))
# print("MMS Transcription:", run_mms(audio_path))










#MMS (Alternative to Wav2Vec2 for Non-English Languages)

In [None]:
import torch
import librosa
import os
import subprocess
from transformers import AutoProcessor, AutoModelForCTC
from phonemizer import phonemize
from difflib import SequenceMatcher

# Load MMS Phoneme-Based ASR Model
def load_mms_model():
    """Load the ``facebook/mms-1b-all`` processor and CTC model.

    Progress and failure messages are printed to stdout.

    Returns:
        Tuple ``(processor, model)``.

    Raises:
        SystemExit: With status 1 when either ``from_pretrained`` call fails;
            the original error is attached as the cause.
    """
    checkpoint = "facebook/mms-1b-all"
    try:
        print("[INFO] Loading MMS Model...")
        processor = AutoProcessor.from_pretrained(checkpoint)
        model = AutoModelForCTC.from_pretrained(checkpoint)
        print("[SUCCESS] MMS Model Loaded Successfully!")
        return processor, model
    except Exception as e:
        print(f"[ERROR] Failed to load MMS model: {e}")
        # Raise SystemExit directly instead of calling the exit() helper,
        # which is intended for interactive shells; chaining keeps the
        # underlying error available to whoever inspects the exit.
        raise SystemExit(1) from e

# Convert Audio to Text Using MMS
def audio_to_text(audio_path, processor, model):
    """Transcribe an audio file with a CTC model using greedy decoding.

    Args:
        audio_path: Path of the audio file; read through librosa at 16 kHz.
        processor: Callable whose result exposes ``input_values``; must also
            provide ``batch_decode``.
        model: Callable whose result exposes ``logits``.

    Returns:
        The first decoded string.

    Raises:
        SystemExit: With status 1 when the file is missing, cannot be loaded,
            or inference fails. A message is printed first and, where there
            is an underlying error, it is attached as the cause.
    """
    if not os.path.exists(audio_path):
        print(f"[ERROR] Audio file not found: {audio_path}")
        # SystemExit is raised directly rather than through the exit()
        # helper, which is intended for interactive shells.
        raise SystemExit(1)

    print(f"[INFO] Processing audio file: {audio_path}")

    try:
        speech_array, _ = librosa.load(audio_path, sr=16000)
    except Exception as e:
        print(f"[ERROR] Failed to load audio: {e}")
        raise SystemExit(1) from e

    try:
        input_values = processor(speech_array, return_tensors="pt", sampling_rate=16000).input_values
        with torch.no_grad():
            logits = model(input_values).logits
        # Highest-scoring token per frame, decoded by the processor.
        predicted_ids = torch.argmax(logits, dim=-1)
        text_output = processor.batch_decode(predicted_ids)[0]  # MMS outputs words
        print(f"[SUCCESS] Extracted Text from Audio: {text_output}")
        return text_output
    except Exception as e:
        print(f"[ERROR] Failed to process audio: {e}")
        raise SystemExit(1) from e

# Convert Text to Phonemes Using G2P Model
def text_to_phonemes(text):
    """Phonemize English text with the eSpeak backend.

    Args:
        text: Text handed to ``phonemize`` (``language="en-us"``).

    Returns:
        The phonemizer output split on whitespace, as a list of strings.

    Raises:
        SystemExit: With status 1 when phonemization fails; the message is
            printed and the original error is attached as the cause.
    """
    print(f"[INFO] Converting text to phonemes: {text}")
    try:
        phonemes = phonemize(text, backend="espeak", language="en-us").split()
        print(f"[SUCCESS] Extracted Phonemes: {phonemes}")
        return phonemes
    except Exception as e:
        print(f"[ERROR] Phonemization failed: {e}")
        # SystemExit is raised directly rather than through the exit()
        # helper, which is intended for interactive shells.
        raise SystemExit(1) from e

# Compare Phoneme Sequences
def compare_phonemes(audio_phonemes, text_phonemes):
    """Describe how the audio token sequence differs from the expected one.

    Args:
        audio_phonemes: Sequence obtained from the recording.
        text_phonemes: Sequence obtained from the expected text.

    Returns:
        One message per non-matching ``SequenceMatcher`` opcode, in order:
        a replace, insert or delete line. Matching spans produce nothing.
    """
    opcodes = SequenceMatcher(None, audio_phonemes, text_phonemes).get_opcodes()
    differences = []

    for op, a_lo, a_hi, t_lo, t_hi in opcodes:
        if op == "equal":
            continue
        heard = audio_phonemes[a_lo:a_hi]
        wanted = text_phonemes[t_lo:t_hi]
        if op == "replace":
            message = f"🔄 Replace: {heard} → {wanted}"
        elif op == "insert":
            message = f"➕ Insert: {wanted}"
        elif op == "delete":
            message = f"❌ Delete: {heard}"
        else:
            continue
        differences.append(message)

    return differences

# Compare Words for Mismatches
def compare_words(audio_text, expected_text, not_english=False):
    """Align recognised words with the expected words and mark the differences.

    Args:
        audio_text: Recognised text. Upper-cased and split on whitespace
            unless ``not_english`` is set.
        expected_text: Reference text, treated the same way.
        not_english: When true, both inputs are passed through ``list()``
            instead: a list keeps its items as tokens, a string becomes
            one token per character.

    Returns:
        Tuple ``(mismatched_indices, corrected_sentence)``.
        ``mismatched_indices`` are sorted positions in the expected token
        list covered by a replace or insert opcode. ``corrected_sentence``
        is a space-joined rendering in which an expected token appears as
        ``[token]``, followed by the recognised token it is paired with.

    Both inputs and the two result lines are printed to stdout.
    """
    if not_english:
        expected_words = list(expected_text)
        audio_words = list(audio_text)
    else:
        audio_words = audio_text.upper().split()
        expected_words = expected_text.upper().split()

    print(expected_text, audio_text)

    matcher = SequenceMatcher(None, audio_words, expected_words)
    mismatched_indices = []
    corrected_sentence = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        heard = audio_words[i1:i2]
        wanted = expected_words[j1:j2]

        if tag == "equal":
            corrected_sentence.extend(heard)
            continue

        # For a delete the expected span is empty, so nothing is recorded here.
        mismatched_indices.extend(range(j1, j2))

        # Pair tokens position by position, then emit whatever is left over
        # on either side. A replace span can have different lengths on the
        # two sides, and zipping alone would silently drop the surplus.
        paired = min(len(heard), len(wanted))
        corrected_sentence.extend(f"[{w}] {h}" for w, h in zip(wanted, heard))
        corrected_sentence.extend(f"[{w}]" for w in wanted[paired:])
        corrected_sentence.extend(heard[paired:])

    mismatched_indices = sorted(set(mismatched_indices))
    corrected_sentence_output = " ".join(corrected_sentence)

    print(f"\n🔹 Mismatched Word Indices: {mismatched_indices}")
    print(f"🔹 Sentence Correction: {corrected_sentence_output}")

    return mismatched_indices, corrected_sentence_output

# Generate Correct Pronunciation Audio Using eSpeak
def generate_correct_pronunciation(text, output_audio_path="correct_pronunciation.wav", voice=None):
    """Synthesize ``text`` to a WAV file by running the ``espeak`` command.

    Args:
        text: Text to speak.
        output_audio_path: Destination file passed to ``espeak -w``.
        voice: Optional voice name passed to ``espeak -v``. When omitted,
            no ``-v`` option is given and eSpeak picks its own default.
            The name must be one the installed eSpeak recognises.

    Failures (including a missing ``espeak`` executable or a non-zero exit
    status) are reported on stdout and not re-raised.
    """
    print(f"\n[INFO] Generating correct pronunciation for: {text}")

    command = ["espeak"]
    if voice:
        command += ["-v", voice]
    command += ["-w", output_audio_path, text]

    try:
        subprocess.run(command, check=True)
        print(f"[SUCCESS] Correct pronunciation audio saved: {output_audio_path}")
    except Exception as e:
        print(f"[ERROR] Failed to generate pronunciation audio: {e}")

# Main Function
def main():
    """Run the MMS pronunciation check on one fixed sentence and recording.

    Steps: load the model, transcribe the recording, phonemize both the
    transcription and the expected sentence, print the token differences,
    print the word-level comparison, and synthesize the expected sentence.
    """
    expected_sentence = "The quick brown fox jumps over the lazy dog"
    recording = "CorrectSentence.wav"

    processor, model = load_mms_model()

    # Recognised text first, then both token sequences (audio before reference).
    audio_text = audio_to_text(recording, processor, model)
    audio_phonemes = text_to_phonemes(audio_text)
    text_phonemes = text_to_phonemes(expected_sentence)

    print(f"\n[Audio Phonemes] {audio_phonemes}")
    print(f"[Expected Phonemes] {text_phonemes}")

    # Header line followed by one line per reported difference.
    report = ["\n📝 [Phoneme Differences]"]
    report.extend(compare_phonemes(audio_phonemes, text_phonemes))
    for line in report:
        print(line)

    # Word-level comparison; it prints its own summary.
    compare_words(audio_text, expected_sentence)

    # Reference audio for the expected sentence.
    generate_correct_pronunciation(expected_sentence)

#temporarily commented
# Run the script
# if __name__ == "__main__":
#     main()


#Wav2Vec2 (MMS Alternative)

In [None]:
import torch
import librosa
import os
import subprocess
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from phonemizer import phonemize
from difflib import SequenceMatcher

# Load Wav2Vec2 Model
def load_wav2vec2_model():
    """Load the ``facebook/wav2vec2-large-960h`` processor and CTC model.

    Progress and failure messages are printed to stdout.

    Returns:
        Tuple ``(processor, model)``.

    Raises:
        SystemExit: With status 1 when either ``from_pretrained`` call fails;
            the original error is attached as the cause.
    """
    checkpoint = "facebook/wav2vec2-large-960h"
    try:
        print("[INFO] Loading Wav2Vec2 Model...")
        processor = Wav2Vec2Processor.from_pretrained(checkpoint)
        model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
        print("[SUCCESS] Wav2Vec2 Model Loaded Successfully!")
        return processor, model
    except Exception as e:
        print(f"[ERROR] Failed to load Wav2Vec2 model: {e}")
        # Raise SystemExit directly instead of calling the exit() helper,
        # which is intended for interactive shells.
        raise SystemExit(1) from e

# Convert Audio to Text Using Wav2Vec2
def audio_to_text(audio_path, processor, model):
    """Transcribe an audio file with a CTC model using greedy decoding.

    Args:
        audio_path: Path of the audio file; read through librosa at 16 kHz.
        processor: Callable whose result exposes ``input_values``; must also
            provide ``batch_decode``.
        model: Callable whose result exposes ``logits``.

    Returns:
        The first decoded string.

    Raises:
        SystemExit: With status 1 when the file is missing, cannot be loaded,
            or inference fails. A message is printed first and, where there
            is an underlying error, it is attached as the cause.
    """
    if not os.path.exists(audio_path):
        print(f"[ERROR] Audio file not found: {audio_path}")
        # SystemExit is raised directly rather than through the exit()
        # helper, which is intended for interactive shells.
        raise SystemExit(1)

    print(f"[INFO] Processing audio file: {audio_path}")

    try:
        speech_array, _ = librosa.load(audio_path, sr=16000)
    except Exception as e:
        print(f"[ERROR] Failed to load audio: {e}")
        raise SystemExit(1) from e

    try:
        input_values = processor(speech_array, return_tensors="pt", sampling_rate=16000).input_values
        with torch.no_grad():
            logits = model(input_values).logits
        # Highest-scoring token per frame, decoded by the processor.
        predicted_ids = torch.argmax(logits, dim=-1)
        text_output = processor.batch_decode(predicted_ids)[0]  # Wav2Vec2 outputs words
        print(f"[SUCCESS] Extracted Text from Audio: {text_output}")
        return text_output
    except Exception as e:
        print(f"[ERROR] Failed to process audio: {e}")
        raise SystemExit(1) from e

# Convert Text to Phonemes Using G2P Model
def text_to_phonemes(text):
    """Phonemize English text with the eSpeak backend.

    Args:
        text: Text handed to ``phonemize`` (``language="en-us"``).

    Returns:
        The phonemizer output split on whitespace, as a list of strings.

    Raises:
        SystemExit: With status 1 when phonemization fails; the message is
            printed and the original error is attached as the cause.
    """
    print(f"[INFO] Converting text to phonemes: {text}")
    try:
        phonemes = phonemize(text, backend="espeak", language="en-us").split()
        print(f"[SUCCESS] Extracted Phonemes: {phonemes}")
        return phonemes
    except Exception as e:
        print(f"[ERROR] Phonemization failed: {e}")
        # SystemExit is raised directly rather than through the exit()
        # helper, which is intended for interactive shells.
        raise SystemExit(1) from e

# Compare Phoneme Sequences
def compare_phonemes(audio_phonemes, text_phonemes):
    """List the edits that turn the audio token sequence into the expected one.

    Args:
        audio_phonemes: Sequence obtained from the recording.
        text_phonemes: Sequence obtained from the expected text.

    Returns:
        Messages for the replace / insert / delete opcodes reported by
        ``SequenceMatcher``, in opcode order. Equal spans are skipped.
    """
    # One formatter per opcode that should be reported.
    describe = {
        "replace": lambda heard, wanted: f"🔄 Replace: {heard} → {wanted}",
        "insert": lambda heard, wanted: f"➕ Insert: {wanted}",
        "delete": lambda heard, wanted: f"❌ Delete: {heard}",
    }
    matcher = SequenceMatcher(None, audio_phonemes, text_phonemes)
    return [
        describe[tag](audio_phonemes[i1:i2], text_phonemes[j1:j2])
        for tag, i1, i2, j1, j2 in matcher.get_opcodes()
        if tag in describe
    ]

# Compare Words for Mismatches
def compare_words(audio_text, expected_text, not_english=False):
    """Align recognised words with the expected words and mark the differences.

    Args:
        audio_text: Recognised text. Upper-cased and split on whitespace
            unless ``not_english`` is set.
        expected_text: Reference text, treated the same way.
        not_english: When true, both inputs are passed through ``list()``
            instead: a list keeps its items as tokens, a string becomes
            one token per character. Accepted here because the Mandarin
            cell of this notebook calls ``compare_words`` with
            ``not_english=True`` on pre-tokenised pinyin lists.

    Returns:
        Tuple ``(mismatched_indices, corrected_sentence)``.
        ``mismatched_indices`` are sorted positions in the expected token
        list covered by a replace or insert opcode. ``corrected_sentence``
        is a space-joined rendering in which an expected token appears as
        ``[token]``, followed by the recognised token it is paired with.

    The two result lines are printed to stdout.
    """
    if not_english:
        audio_words = list(audio_text)
        expected_words = list(expected_text)
    else:
        audio_words = audio_text.upper().split()
        expected_words = expected_text.upper().split()

    matcher = SequenceMatcher(None, audio_words, expected_words)
    mismatched_indices = []
    corrected_sentence = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        heard = audio_words[i1:i2]
        wanted = expected_words[j1:j2]

        if tag == "equal":
            corrected_sentence.extend(heard)
            continue

        # For a delete the expected span is empty, so nothing is recorded here.
        mismatched_indices.extend(range(j1, j2))

        # Pair tokens position by position, then emit whatever is left over
        # on either side. A replace span can have different lengths on the
        # two sides, and zipping alone would silently drop the surplus.
        paired = min(len(heard), len(wanted))
        corrected_sentence.extend(f"[{w}] {h}" for w, h in zip(wanted, heard))
        corrected_sentence.extend(f"[{w}]" for w in wanted[paired:])
        corrected_sentence.extend(heard[paired:])

    mismatched_indices = sorted(set(mismatched_indices))
    corrected_sentence_output = " ".join(corrected_sentence)

    print(f"\n🔹 Mismatched Word Indices: {mismatched_indices}")
    print(f"🔹 Sentence Correction: {corrected_sentence_output}")

    return mismatched_indices, corrected_sentence_output

# Generate Correct Pronunciation Audio Using eSpeak
def generate_correct_pronunciation(text, output_audio_path="correct_pronunciation.wav", voice=None):
    """Synthesize ``text`` to a WAV file by running the ``espeak`` command.

    Args:
        text: Text to speak.
        output_audio_path: Destination file passed to ``espeak -w``.
        voice: Optional voice name passed to ``espeak -v``. When omitted,
            no ``-v`` option is given and eSpeak picks its own default.
            The name must be one the installed eSpeak recognises.

    Failures (including a missing ``espeak`` executable or a non-zero exit
    status) are reported on stdout and not re-raised.
    """
    print(f"\n[INFO] Generating correct pronunciation for: {text}")

    command = ["espeak"]
    if voice:
        command += ["-v", voice]
    command += ["-w", output_audio_path, text]

    try:
        subprocess.run(command, check=True)
        print(f"[SUCCESS] Correct pronunciation audio saved: {output_audio_path}")
    except Exception as e:
        print(f"[ERROR] Failed to generate pronunciation audio: {e}")

# Main Function
def main(
    text_input="The musician plays a beautiful tune.",
    audio_file="/Users/krish/Downloads/asr-inference/audio/E10.wav",
):
    """Run the Wav2Vec2 pronunciation check for one sentence and recording.

    Args:
        text_input: Sentence the speaker was expected to say.
        audio_file: Path of the recording. The default is an absolute path
            on the original author's machine; pass your own path when
            running anywhere else.

    Steps: load the model, transcribe the recording, phonemize both the
    transcription and the expected sentence, print the token differences,
    print the word-level comparison, and synthesize the expected sentence.
    """
    # Load Wav2Vec2 model
    processor, model = load_wav2vec2_model()

    # Convert audio to text
    audio_text = audio_to_text(audio_file, processor, model)

    # Phonemize the recognised text, then the expected text
    audio_phonemes = text_to_phonemes(audio_text)
    text_phonemes = text_to_phonemes(text_input)

    print(f"\n[Audio Phonemes] {audio_phonemes}")
    print(f"[Expected Phonemes] {text_phonemes}")

    # Token-level differences
    discrepancies = compare_phonemes(audio_phonemes, text_phonemes)
    print("\n📝 [Phoneme Differences]")
    for diff in discrepancies:
        print(diff)

    # Word-level comparison; it prints its own summary, so the return value
    # is not needed here.
    compare_words(audio_text, text_input)

    # Reference audio for the expected sentence
    generate_correct_pronunciation(text_input)

# Run the script.
# NOTE: in a notebook kernel __name__ is "__main__", so executing this cell calls main(),
# which by default reads a machine-specific absolute audio path.
if __name__ == "__main__":
    main()


In [None]:
# Whisper

from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa

# Load the openai/whisper-small processor and model once, at module level.
# whisper_model() below reads these two globals by name.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

def whisper_model(file_path):
    """Transcribe an audio file with the module-level Whisper processor and model.

    Args:
        file_path: Path of the audio file; read through librosa at 16 kHz.

    Returns:
        The first decoded transcription (special tokens skipped). The same
        text is also printed, prefixed with ``Transcription:``.
    """
    audio, _ = librosa.load(file_path, sr=16000)

    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        predicted_ids = model.generate(inputs["input_features"])

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    print("Transcription:", transcription)
    # Hand the text back so callers can use it, not only read it on screen.
    return transcription

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [None]:
# Raise the threshold of the "transformers.modeling_utils" logger to ERROR so that
# lower-severity records from it are hidden (per the original note, these showed up
# in the output of a later cell).
import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

import os

# Drive folder that later cells build their audio file paths from.
folder_path = "/content/drive/My Drive/sunny days week 4"
# Listing the folder fails early if it does not exist; the listing itself is not used in this cell.
files = os.listdir(folder_path)
print(folder_path)

Mounted at /content/drive
/content/drive/My Drive/sunny days week 4


In [None]:
#Wav2Vec function for Mandarin

from pypinyin import pinyin, lazy_pinyin, Style

import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa
import numpy as np

def wav2vec2_mdd_mandarin(audio_path, correct, language, model=""):
    """Compare a Mandarin recording with the expected sentence, syllable by syllable.

    The recording is transcribed with ``run_wav2vec``; both the transcription
    and the expected sentence are converted to tone-numbered pinyin
    (``Style.TONE3``) and compared. When they differ, the two sequences are
    printed, ``compare_words`` reports the differences, and reference audio
    for the expected sentence is generated.

    Args:
        audio_path: Path of the recording.
        correct: Expected sentence.
        language: Not used by this function; kept so existing callers
            keep working.
        model: Checkpoint forwarded to ``run_wav2vec``; the empty string
            means "use its default".

    Returns:
        Tuple ``(pinyin_correct, pinyin_wrong)`` of flat token lists for the
        expected sentence and the transcription. Punctuation characters in
        ``correct`` are not part of ``pinyin_correct``.
    """
    import unicodedata

    if model != "":
        wrong = run_wav2vec(audio_path, model=model)
    else:
        wrong = run_wav2vec(audio_path)

    # Show unknown-token markers as a short visible placeholder.
    wrong = wrong.replace("<unk>", "?")

    # Punctuation in the reference (for example a full-width comma) is not
    # something the speaker pronounces. Left in, it becomes a token of its
    # own and is reported as a mismatch, so drop it before conversion.
    spoken_reference = "".join(
        ch for ch in correct if not unicodedata.category(ch).startswith("P")
    )

    # pinyin() yields one list per token; flatten to a single list of tokens.
    pinyin_correct = [
        item for sublist in pinyin(spoken_reference, style=Style.TONE3) for item in sublist
    ]
    pinyin_wrong = [
        item for sublist in pinyin(wrong, style=Style.TONE3) for item in sublist
    ]

    if pinyin_correct != pinyin_wrong:
        print("Pinyin correct: ", pinyin_correct)
        print("Pinyin wrong: ", pinyin_wrong)

        # Output for the learner
        compare_words(pinyin_wrong, pinyin_correct, not_english=True)

        # The proper way to pronounce the sentence.
        # NOTE(review): no voice/language is passed to the synthesizer here;
        # whether its default voice handles Chinese text is not verified — confirm.
        generate_correct_pronunciation(correct, output_audio_path="correct_pronunciation.wav")
    else:
        print("No mispronunciation")

    print("\n \n")

    return pinyin_correct, pinyin_wrong

In [None]:
# Expected sentences; the k-th entry (1-based) belongs to recording M{k}.aifc.
correct_text = [
    "妈妈骂马吗",
    "四十四是四十，十四是十四",
    "吃葡萄不吐葡萄皮",
    "上山打老虎",
    "这个自行车是谁的",
    "中国文化很有趣",
    "你喜欢什么颜色",
    "今天的天气怎么样",
    "北京是中国的首都",
    "他的汉语说得很好"
]

# One [audio_path, expected_sentence, language_code] entry per sentence.
# Numbering follows the list itself, so adding or removing a sentence
# needs no separate range to be kept in sync.
pairs = [
    [f"{folder_path}/mandarin chinese/M{index}.aifc", sentence, "cmn"]
    for index, sentence in enumerate(correct_text, start=1)
]

for audio_file, expected_sentence, language_code in pairs:
    wav2vec2_mdd_mandarin(
        audio_file,
        expected_sentence,
        language_code,
        model="jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn",
    )

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/44.4k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.28G [00:00<?, ?B/s]

Pinyin correct:  ['ma1', 'ma1', 'ma4', 'ma3', 'ma']
Pinyin wrong:  ['ma3', 'ma3', 'ma3', '?', 'ma3']
['ma1', 'ma1', 'ma4', 'ma3', 'ma'] ['ma3', 'ma3', 'ma3', '?', 'ma3']

🔹 Mismatched Word Indices: [0, 1, 2, 4]
🔹 Sentence Correction: [ma1] [ma1] [ma4] ma3 [ma] ma3

[INFO] Generating correct pronunciation for: 妈妈骂马吗
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav




Pinyin correct:  ['si4', 'shi2', 'si4', 'shi4', 'si4', 'shi2', '，', 'shi2', 'si4', 'shi4', 'shi2', 'si4']
Pinyin wrong:  ['shi2', 'si4', 'shi2', 'shi4', 'si4', 'shi2', 'su4', 'shi2', 'si3', 'si4', 'shi2']
['si4', 'shi2', 'si4', 'shi4', 'si4', 'shi2', '，', 'shi2', 'si4', 'shi4', 'shi2', 'si4'] ['shi2', 'si4', 'shi2', 'shi4', 'si4', 'shi2', 'su4', 'shi2', 'si3', 'si4', 'shi2']

🔹 Mismatched Word Indices: [0, 6, 9, 11]
🔹 Sentence Correction: [si4] shi2 si4 shi2 shi4 si4 shi2 [，] su4 shi2 si3 si4 [shi4] shi2 [si4]

[INFO] Generating correct pronunciation for: 四十四是四十，十四是十四
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav




Pinyin correct:  ['chi1', 'pu2', 'tao2', 'bu4', 'tu3', 'pu2', 'tao2', 'pi2']
Pinyin wrong:  ['ke1', 'pu3', 'tao2', 'bu4', 'tu3', 'pu3', 'tao2', 'ni2']
['chi1', 'pu2', 'tao2', 'bu4', 'tu3', 'pu2', 'tao2', 'pi2'] ['ke1', 'pu3', 'tao2', 'bu4', 'tu3', 'pu3', 'tao2', 'ni2']

🔹 Mismatched Word Indices: [0, 1, 5, 7]
🔹 Sentence Correction: [chi1] ke1 [pu2] pu3 tao2 bu4 tu3 [pu2] pu3 tao2 [pi2] ni2

[INFO] Generating correct pronunciation for: 吃葡萄不吐葡萄皮
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav




Pinyin correct:  ['shang4', 'shan1', 'da3', 'lao3', 'hu3']
Pinyin wrong:  ['sheng4', 'shan1', 'da3', 'lao2', 'hu2']
['shang4', 'shan1', 'da3', 'lao3', 'hu3'] ['sheng4', 'shan1', 'da3', 'lao2', 'hu2']

🔹 Mismatched Word Indices: [0, 3, 4]
🔹 Sentence Correction: [shang4] sheng4 shan1 da3 [lao3] lao2 [hu3] hu2

[INFO] Generating correct pronunciation for: 上山打老虎
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav




Pinyin correct:  ['zhe4', 'ge4', 'zi4', 'xing2', 'che1', 'shi4', 'shei2', 'de']
Pinyin wrong:  ['shi4', 'ge4', 'si4', 'xing2', 'che1', 'shi4', 'shei2', 'de']
['zhe4', 'ge4', 'zi4', 'xing2', 'che1', 'shi4', 'shei2', 'de'] ['shi4', 'ge4', 'si4', 'xing2', 'che1', 'shi4', 'shei2', 'de']

🔹 Mismatched Word Indices: [0, 2]
🔹 Sentence Correction: [zhe4] shi4 ge4 [zi4] si4 xing2 che1 shi4 shei2 de

[INFO] Generating correct pronunciation for: 这个自行车是谁的
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav




Pinyin correct:  ['zhong1', 'guo2', 'wen2', 'hua4', 'hen3', 'you3', 'qu4']
Pinyin wrong:  ['song1', 'guo2', 'ren2', 'hua4', 'hua4', 'you3', 'qu4']
['zhong1', 'guo2', 'wen2', 'hua4', 'hen3', 'you3', 'qu4'] ['song1', 'guo2', 'ren2', 'hua4', 'hua4', 'you3', 'qu4']

🔹 Mismatched Word Indices: [0, 2, 4]
🔹 Sentence Correction: [zhong1] song1 guo2 [wen2] ren2 hua4 [hen3] hua4 you3 qu4

[INFO] Generating correct pronunciation for: 中国文化很有趣
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav




Pinyin correct:  ['ni3', 'xi3', 'huan1', 'shen2', 'me', 'yan2', 'se4']
Pinyin wrong:  ['ni2', 'xi1', 'huan2', 'shen2', 'ma', 'yan2', 'ci2', 'de']
['ni3', 'xi3', 'huan1', 'shen2', 'me', 'yan2', 'se4'] ['ni2', 'xi1', 'huan2', 'shen2', 'ma', 'yan2', 'ci2', 'de']

🔹 Mismatched Word Indices: [0, 1, 2, 4, 6]
🔹 Sentence Correction: [ni3] ni2 [xi3] xi1 [huan1] huan2 shen2 [me] ma yan2 [se4] ci2

[INFO] Generating correct pronunciation for: 你喜欢什么颜色
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav




Pinyin correct:  ['jin1', 'tian1', 'de', 'tian1', 'qi4', 'zen3', 'me', 'yang4']
Pinyin wrong:  ['ji1', 'tian1', 'tian1', 'qi4', 'zen3', 'me', 'yan3']
['jin1', 'tian1', 'de', 'tian1', 'qi4', 'zen3', 'me', 'yang4'] ['ji1', 'tian1', 'tian1', 'qi4', 'zen3', 'me', 'yan3']

🔹 Mismatched Word Indices: [0, 2, 7]
🔹 Sentence Correction: [jin1] ji1 tian1 [de] tian1 qi4 zen3 me [yang4] yan3

[INFO] Generating correct pronunciation for: 今天的天气怎么样
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav




Pinyin correct:  ['bei3', 'jing1', 'shi4', 'zhong1', 'guo2', 'de', 'shou3', 'du1']
Pinyin wrong:  ['bei3', 'jing3', 'shi4', 'zhong1', 'guo2', 'de', 'shou3', 'du4', 'le']
['bei3', 'jing1', 'shi4', 'zhong1', 'guo2', 'de', 'shou3', 'du1'] ['bei3', 'jing3', 'shi4', 'zhong1', 'guo2', 'de', 'shou3', 'du4', 'le']

🔹 Mismatched Word Indices: [1, 7]
🔹 Sentence Correction: bei3 [jing1] jing3 shi4 zhong1 guo2 de shou3 [du1] du4

[INFO] Generating correct pronunciation for: 北京是中国的首都
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav




Pinyin correct:  ['ta1', 'de', 'han4', 'yu3', 'shuo1', 'de2', 'hen3', 'hao3']
Pinyin wrong:  ['cha4', 'tian2', 'nv3', 'shuo1', 'le', 'fei1', 'chang2', 'hao3', 'de']
['ta1', 'de', 'han4', 'yu3', 'shuo1', 'de2', 'hen3', 'hao3'] ['cha4', 'tian2', 'nv3', 'shuo1', 'le', 'fei1', 'chang2', 'hao3', 'de']

🔹 Mismatched Word Indices: [0, 1, 2, 3, 5, 6]
🔹 Sentence Correction: [ta1] cha4 [de] tian2 [han4] nv3 shuo1 [de2] le [hen3] fei1 hao3 de

[INFO] Generating correct pronunciation for: 他的汉语说得很好
[SUCCESS] Correct pronunciation audio saved: correct_pronunciation.wav
