**Make sure you are in your virtaul env and install below dependencies ! Uncomment !!**

In [1]:
# !pip install speechrecognition librosa numpy transformers pronouncing nltk soundfile wget   

Collecting speechrecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Collecting pronouncing
  Downloading pronouncing-0.2.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cmudict>=0.4.0 (from pronouncing)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pronouncing
  Building wheel for pronouncing (setup.py) ... [?25l[?25hdone
  Created wheel for pronouncing: filename=pronouncing-0.2.0-py2.py3-none-any.whl size=6233 sha256=e4d1339bb9a391702a262a478de421158c6096a0a09312bcd1003a6dd6928b7

In [6]:
import sys

# Check and install missing modules
try:
    import speech_recognition as sr
    import librosa
    import numpy as np
    import transformers
    import pronouncing
    import nltk
    import soundfile as sf
    import os
    import wget
    import tarfile
    import shutil
except ModuleNotFoundError as e:
    print(f"Missing module: {e}")
    print("Please install it using: pip install {module_name}".format(module_name=str(e).split("'")[1]))
    sys.exit(1)

# Download required NLTK data
nltk.download('cmudict')

class EnglishLearningTool:
    def __init__(self):
        self.recognizer = sr.Recognizer()
        try:
            self.nlp = transformers.pipeline("text-classification", model="distilbert-base-uncased")
        except Exception as e:
            print(f"Error loading NLP model: {e}")
            sys.exit(1)
        self.pronunciation_dict = cmudict.dict()
        self.results = []

    def download_dataset(self):
        """Download and extract a small LibriSpeech dataset"""
        dataset_url = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
        dataset_path = "dev-clean.tar.gz"
        extract_path = "LibriSpeech"

        if not os.path.exists(extract_path):
            print("Downloading LibriSpeech dataset...")
            try:
                wget.download(dataset_url, dataset_path)
                print("\nExtracting dataset...")
                with tarfile.open(dataset_path, "r:gz") as tar:
                    tar.extractall()
                os.remove(dataset_path)
            except Exception as e:
                print(f"Failed to download or extract dataset: {e}")
                return None
        return extract_path

    def analyze_audio(self, audio_path):
        try:
            audio_data, sample_rate = librosa.load(audio_path)
            with sr.AudioFile(audio_path) as source:
                audio = self.recognizer.record(source)
                text = self.recognizer.recognize_google(audio)
        except sr.UnknownValueError:
            return "Could not understand audio", None
        except sr.RequestError:
            return "API request failed", None
        except Exception as e:
            return f"Error loading audio: {e}", None

        stressed_words = self.detect_stress(audio_data, sample_rate, text)
        return text, stressed_words

    def detect_stress(self, audio_data, sample_rate, text):
        pitches = librosa.pitch_tuning(audio_data)
        intensity = librosa.feature.rms(y=audio_data)[0]
        
        words = text.split()
        stressed_words = []
        intensity_threshold = np.mean(intensity) + np.std(intensity)
        chunks = len(intensity) // len(words)
        
        for i, word in enumerate(words):
            chunk_start = i * chunks
            chunk_end = (i + 1) * chunks
            chunk_intensity = np.mean(intensity[chunk_start:chunk_end])
            if chunk_intensity > intensity_threshold:
                stressed_words.append(word)
                
        return stressed_words

    def get_pronunciation(self, word):
        if word.lower() in self.pronunciation_dict:
            phones = self.pronunciation_dict[word.lower()][0]
            return self.format_pronunciation(phones)
        return "Pronunciation not found"

    def format_pronunciation(self, phones):
        stress_markers = {"0": "", "1": "'", "2": "ˌ"}
        result = ""
        for phone in phones:
            if phone[-1].isdigit():
                result += stress_markers[phone[-1]] + phone[:-1].lower()
            else:
                result += phone.lower()
        return result

    def correct_homophones(self, text):
        homophones = {
            "their": ["there", "they're"],
            "to": ["too", "two"],
            "right": ["write", "rite"],
        }
        
        words = text.split()
        corrected_text = []
        for word in words:
            if word.lower() in homophones:
                context_score = self.nlp(f"Is '{word}' appropriate in: {text}")
                if context_score[0]['score'] < 0.7:
                    for alternative in homophones[word.lower()]:
                        new_text = text.replace(word, alternative)
                        score = self.nlp(f"Is '{alternative}' appropriate in: {new_text}")
                        if score[0]['score'] > context_score[0]['score']:
                            word = alternative
                            break
            corrected_text.append(word)
            
        return " ".join(corrected_text)

    def process_audio(self, audio_path):
        text, stressed_words = self.analyze_audio(audio_path)
        if not stressed_words:
            return f"Error processing {audio_path}: {text}"

        corrected_text = self.correct_homophones(text)
        
        output = [f"File: {audio_path}"]
        output.append(f"Original Text: {text}")
        output.append(f"Corrected Text: {corrected_text}")
        output.append("Stressed Words and Pronunciation:")
        
        for word in stressed_words:
            pronunciation = self.get_pronunciation(word)
            output.append(f"- {word.upper()} : /{pronunciation}/")
            output.append(f"  Breakdown: Pronounce as '{pronunciation.replace('ˈ', 'STRESS-')}'")
        
        result = "\n".join(output)
        self.results.append(result)
        return result

    def test_dataset(self, dataset_path, max_files=5):
        if not dataset_path:
            print("Dataset not available. Aborting test.")
            return
        
        audio_files = []
        for root, _, files in os.walk(dataset_path):
            for file in files:
                if file.endswith('.flac'):
                    audio_files.append(os.path.join(root, file))
                    if len(audio_files) >= max_files:
                        break
            if len(audio_files) >= max_files:
                break

        print(f"Processing {len(audio_files)} audio files...")
        for audio_file in audio_files:
            try:
                result = self.process_audio(audio_file)
                print("\n" + "="*50)
                print(result)
                print("="*50)
            except Exception as e:
                print(f"Error processing {audio_file}: {str(e)}")

        print("\nSummary:")
        print(f"Total files processed: {len(self.results)}")
        print(f"Success rate: {(len(self.results) / len(audio_files)) * 100:.2f}%")

def main():
    tool = EnglishLearningTool()
    
    dataset_path = tool.download_dataset()
    tool.test_dataset(dataset_path)
    
    cleanup = input("\nRemove dataset files? (y/n): ")
    if cleanup.lower() == 'y' and dataset_path:
        shutil.rmtree(dataset_path)

if __name__ == "__main__":
    main()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[nltk_data] Downloading package cmudict to /usr/share/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


Device set to use cpu


Processing 5 audio files...

File: LibriSpeech/dev-clean/5536/43363/5536-43363-0001.flac
Original Text: therefore he courts death in Battle on the other hand he would regard it as disgraceful to be killed in a private quarrel
Corrected Text: therefore he courts death in Battle on the other hand he would regard it as disgraceful to be killed in a private quarrel
Stressed Words and Pronunciation:
- THE : /dhah/
  Breakdown: Pronounce as 'dhah'
- IT : /'iht/
  Breakdown: Pronounce as ''iht'

File: LibriSpeech/dev-clean/5536/43363/5536-43363-0009.flac
Original Text: it is well known that the American Indian had somehow developed a cold power and although in the latter days there have been many imposters and allowing for the vanity and weakness of Human Nature it is fair to assume that there must have been some even in the old days yet there are well attested instances of remarkable prophecies and other mystic practice
Corrected Text: it is well known that the American Indian had somehow de


Remove dataset files? (y/n):  n
