<a href="https://colab.research.google.com/github/hrk022/ASR_model/blob/main/ASR_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install system dependencies for audio
!apt-get update && apt-get install -y libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg

# 2. Install all Python packages
!pip install torchaudio transformers speechbrain datasets pyaudio soundfile

# 3. Additional dependencies for Hugging Face datasets
!pip install librosa pyarrow

In [None]:
!apt-get install -y libasound2-dev portaudio19-dev
!pip install git+https://github.com/Uberi/speech_recognition.git
!pip install pyaudio

In [None]:
from datasets import load_dataset


# Load the first 100 rows of the "train" split of the SUPERB "asr" configuration.
# trust_remote_code=True is forwarded unchanged to `load_dataset`.
dataset = load_dataset(
    path="superb",
    name="asr",
    split="train[:100]",
    trust_remote_code=True,
)

# Sanity check: report the row count, then show the first example.
print(f"Loaded {len(dataset)} samples")
print(dataset[0])


In [None]:
import numpy as np
from datasets import Dataset, concatenate_datasets

def preprocess_common_voice(batch):
    """Reduce one example to its waveform, transcript and a fixed accent tag.

    Args:
        batch: Mapping that provides ``batch["audio"]["array"]`` (the samples)
            and ``batch["text"]`` (the transcript).

    Returns:
        dict with keys ``"audio"`` (a new float32 NumPy array built from the
        samples), ``"text"`` (passed through unchanged) and ``"accent"``
        (always the string ``"us"``).
    """
    waveform = np.array(batch["audio"]["array"], dtype=np.float32)
    example = dict(audio=waveform, text=batch["text"])
    example["accent"] = "us"  # Default tag applied to every example
    return example

from collections import Counter  # used only for the accent tally at the end of this cell

# Apply preprocessing to every loaded example.
processed_common_voice = dataset.map(preprocess_common_voice)
# NOTE(review): presumably `map` keeps the source columns that the function does not
# overwrite, while `synthetic_samples` below only has audio/text/accent. Confirm that
# `concatenate_datasets` accepts this with the installed `datasets` version; if not,
# pass `remove_columns=` to `map` so both datasets share one schema.

# Create synthetic data with varied accents.
# The generator is seeded so the placeholder "audio" is identical on every run.
SYNTHETIC_SEED = 0
SYNTHETIC_NUM_SAMPLES = 16000  # samples per synthetic clip
synthetic_accents = (
    ["us"] * 2
    + ["uk"] * 2
    + ["indian", "australian", "canadian"]
    + ["african"] * 3
)
synthetic_rng = np.random.default_rng(SYNTHETIC_SEED)
synthetic_samples = Dataset.from_dict({
    # Uniform values in [0, 1), one float32 array per accent label.
    "audio": [
        synthetic_rng.random(SYNTHETIC_NUM_SAMPLES, dtype=np.float32)
        for _ in synthetic_accents
    ],
    "text": [f"Sample {i}" for i in range(len(synthetic_accents))],
    "accent": synthetic_accents,
})

# Merge datasets
final_dataset = concatenate_datasets([processed_common_voice, synthetic_samples])

# Show accent distribution: count in a single pass over the column (works for any
# iterable column type) and print in sorted order so the output is stable.
accent_counts = Counter(final_dataset["accent"])
print(f"\nFinal dataset contains {len(final_dataset)} samples:")
for accent, count in sorted(accent_counts.items()):
    print(f"- {accent}: {count}")


In [None]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, pipeline

class AccentAdaptiveASR:
    """Wav2Vec2 CTC transcriber with an additive accent embedding and a text-correction pass.

    The constructor loads the ``facebook/wav2vec2-large-960h`` processor and CTC model,
    a ``grammarly/coedit-large`` text2text-generation pipeline, and creates an embedding
    table with one row per accent.

    NOTE(review): the accent embedding is freshly initialised here and no training step
    is visible in this notebook, so the offset it adds to the hidden states is not a
    learned adaptation — confirm whether trained weights are meant to be loaded.
    """

    ASR_CHECKPOINT = "facebook/wav2vec2-large-960h"
    CORRECTOR_CHECKPOINT = "grammarly/coedit-large"
    SAMPLE_RATE = 16000    # Hz; value passed to the processor
    NUM_ACCENTS = 5        # rows in the accent embedding table
    INT16_SCALE = 32768.0  # divisor mapping 16-bit PCM onto [-1, 1)

    def __init__(self):
        self.processor = Wav2Vec2Processor.from_pretrained(self.ASR_CHECKPOINT)
        self.model = Wav2Vec2ForCTC.from_pretrained(self.ASR_CHECKPOINT)
        self.corrector = pipeline("text2text-generation", model=self.CORRECTOR_CHECKPOINT)
        # Take the width from the loaded model instead of hard-coding it, so the
        # embedding always matches the hidden states it is added to.
        self.accent_embedding = torch.nn.Embedding(
            self.NUM_ACCENTS, self.model.config.hidden_size
        )

    @staticmethod
    def _to_float_waveform(audio_data):
        """Return ``audio_data`` as a float32 NumPy array.

        Only integer-typed NumPy arrays are rescaled (treated as 16-bit PCM and divided
        by 32768). Float arrays, tensors and sequences are converted without rescaling,
        so audio that is already in floating point is not scaled a second time.
        """
        if isinstance(audio_data, np.ndarray) and np.issubdtype(audio_data.dtype, np.integer):
            return audio_data.astype(np.float32) / AccentAdaptiveASR.INT16_SCALE
        if isinstance(audio_data, torch.Tensor):
            audio_data = audio_data.detach().cpu().numpy()
        return np.asarray(audio_data, dtype=np.float32)

    def transcribe(self, audio_data, accent_id=0):
        """Transcribe one utterance and run the transcript through the corrector.

        Args:
            audio_data: Waveform samples. An integer NumPy array is treated as 16-bit
                PCM and scaled by 1/32768; anything else is converted to float32 as is.
            accent_id: Row of the accent embedding to add to the final hidden states.

        Returns:
            dict with ``"raw"`` (the decoded CTC output) and ``"corrected"`` (the
            ``generated_text`` returned by the correction pipeline).

        Raises:
            IndexError: if ``accent_id`` is not a valid row of the embedding table.
        """
        num_accents = self.accent_embedding.num_embeddings
        if not 0 <= accent_id < num_accents:
            raise IndexError(
                f"accent_id must be between 0 and {num_accents - 1}, got {accent_id}"
            )

        waveform = self._to_float_waveform(audio_data)
        inputs = self.processor(waveform, return_tensors="pt", sampling_rate=self.SAMPLE_RATE)
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
            hidden_states = outputs.hidden_states[-1]
            accent_features = self.accent_embedding(torch.tensor([accent_id]))
            # unsqueeze(1) inserts an axis so the single accent vector broadcasts
            # across dimension 1 of the hidden states.
            adapted = hidden_states + accent_features.unsqueeze(1)
            logits = self.model.lm_head(adapted)
            predicted_ids = torch.argmax(logits, dim=-1)
            raw_text = self.processor.batch_decode(predicted_ids)[0]

        corrected = self.corrector(raw_text)[0]['generated_text']
        return {"raw": raw_text, "corrected": corrected}


In [None]:
import speech_recognition as sr

def check_microphone():
    """Report whether a microphone can be opened and read.

    Returns:
        True if ``sr.Microphone()`` could be created, opened, and used for an
        ambient-noise calibration; False otherwise.
    """
    recognizer = sr.Recognizer()
    try:
        microphone = sr.Microphone()
    except (OSError, AttributeError):
        # OSError: no usable input device.
        # AttributeError: how SpeechRecognition reports that PyAudio is not installed.
        return False

    try:
        with microphone as source:
            print("Testing microphone...")
            recognizer.adjust_for_ambient_noise(source)
    except OSError:
        return False
    return True


In [None]:
import torchaudio

def _load_sample_waveform(sample_path, target_rate):
    """Load an audio file and return its first channel as a NumPy array at ``target_rate`` Hz."""
    waveform, source_rate = torchaudio.load(sample_path)
    if source_rate != target_rate:
        # The ASR call below declares `target_rate`, so bring the file to that rate.
        waveform = torchaudio.functional.resample(waveform, source_rate, target_rate)
    return waveform.numpy()[0]


def _prompt_accent_id(num_accents=5):
    """Ask for an accent index on stdin; return 0 when the answer is unusable."""
    try:
        accent_id = int(input("Select accent (0–4): "))
    except (ValueError, EOFError, NotImplementedError):
        # Non-numeric text, or stdin unavailable (presumably NotImplementedError in
        # non-interactive notebook runs — TODO confirm). Ctrl-C still propagates.
        return 0
    if not 0 <= accent_id < num_accents:
        print(f"Accent {accent_id} is out of range; using 0.")
        return 0
    return accent_id


def live_accent_recognition(sample_path="sample.wav", listen_seconds=5):
    """Transcribe speech from the microphone, or from a sample file if none is available.

    Args:
        sample_path: Audio file used when ``check_microphone()`` returns False.
        listen_seconds: Maximum time to wait for speech to start and, separately,
            maximum length of the captured phrase.

    Returns:
        None. Results are printed.
    """
    from pathlib import Path  # local import: only needed for the fallback existence check

    target_rate = 16000  # Hz; the rate `AccentAdaptiveASR.transcribe` passes to its processor

    if not check_microphone():
        print(f"\nNo microphone detected. Using test audio '{sample_path}'...")
        # Check explicitly instead of relying on the loader raising FileNotFoundError.
        if not Path(sample_path).is_file():
            print(f"No {sample_path} file found.")
            return

        audio_data = _load_sample_waveform(sample_path, target_rate)
        asr_system = AccentAdaptiveASR()
        result = asr_system.transcribe(audio_data, accent_id=2)

        print(f"\nResults from {sample_path}:")
        print(f"Raw: {result['raw']}")
        print(f"Corrected: {result['corrected']}")
        return

    recognizer = sr.Recognizer()
    asr_system = AccentAdaptiveASR()

    print("\n=== Real-Time Accent-Adaptive ASR ===")
    print("Accent Options: 0-US, 1-UK, 2-Indian, 3-Australian, 4-African")
    accent_id = _prompt_accent_id()

    with sr.Microphone() as source:
        print(f"Listening for {listen_seconds} seconds...")
        recognizer.adjust_for_ambient_noise(source)
        try:
            # `timeout` only bounds the wait for speech to begin; `phrase_time_limit`
            # bounds the recording itself.
            audio = recognizer.listen(
                source, timeout=listen_seconds, phrase_time_limit=listen_seconds
            )
        except sr.WaitTimeoutError:
            print("No speech detected before the timeout.")
            return

    # Convert to 16-bit samples at the model's rate; the capture device's native
    # rate is not guaranteed to be 16 kHz.
    pcm_bytes = audio.get_raw_data(convert_rate=target_rate, convert_width=2)
    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16)

    result = asr_system.transcribe(audio_data, accent_id)

    print("\nResults:")
    print(f"Raw Transcription: {result['raw']}")
    if result['raw'].lower() != result['corrected'].lower():
        print(f"Corrected Version: {result['corrected']}")
    else:
        print("(No corrections needed)")


In [None]:
# Run the demo: microphone capture when one is available, otherwise the sample.wav fallback.
live_accent_recognition()
