## Configuration Setup

In [1]:
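# Install the dependencies from the virtual environment's terminal before running this notebook.
# The transcription cells additionally import torch, torchaudio, transformers and librosa,
# and the MP3 extraction step relies on yt-dlp's FFmpeg post-processor (FFmpeg must be installed).

#pip install yt-dlp
#pip install "numpy<2.0"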

In [4]:
import os
import yt_dlp

# Video used to exercise the download cells of this notebook.
TEST_LINK = "https://www.youtube.com/watch?v=w4sJoZ9D1YM"

# Directory that receives every downloaded / converted file.
OUTPUT_DIR = "downloads"
# Backward-compatible alias: the constant was originally misspelled and the
# download cells still build their output templates from this name.
OUTPUR_DIR = OUTPUT_DIR

# File name of the downloaded video inside OUTPUT_DIR.
OUTPUT_VIDEO = "video.mp4"
# Base name (no extension) of the extracted audio inside OUTPUT_DIR.
OUTPUT_AUDIO = "audio"

# Create the directory from the constant so changing OUTPUT_DIR in one place
# is enough; exist_ok keeps the cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Download Video

In [None]:
def _report_video_progress(status):
    """Print the download percentage when yt-dlp supplies one.

    Args:
        status: Progress dictionary that yt-dlp passes to every progress hook.

    The '_percent_str' entry is an underscore-prefixed convenience field, so it
    is read with .get(): a callback that lacks it (for example an error
    notification) is skipped instead of raising KeyError inside the download.
    """
    percent = status.get('_percent_str')
    if percent is not None:
        print(f"Downloading: {percent} complete")


ydl_opts = {
    # Best pre-merged format (single file containing video and audio).
    # NOTE(review): the container of 'best' is not guaranteed to be MP4 even
    # though OUTPUT_VIDEO ends in .mp4 - confirm this is acceptable.
    'format': 'best',
    'outtmpl': f"{OUTPUR_DIR}/{OUTPUT_VIDEO}",
    'noplaylist': True,  # Download the single video, not the playlist it may belong to
    'progress_hooks': [_report_video_progress],
}

# The context manager releases yt-dlp's resources once the download finishes.
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([TEST_LINK])

print("Download complete!")

## Download Audio as .mp3

In [None]:
def _report_audio_progress(status):
    """Print the download percentage when yt-dlp supplies one.

    Args:
        status: Progress dictionary that yt-dlp passes to every progress hook.

    The '_percent_str' entry is an underscore-prefixed convenience field, so it
    is read with .get(): a callback that lacks it (for example an error
    notification) is skipped instead of raising KeyError inside the download.
    """
    percent = status.get('_percent_str')
    if percent is not None:
        print(f"Downloading: {percent} complete")


ydl_opts = {
    'format': 'bestaudio/best',  # Prefer an audio-only stream, fall back to the best combined one
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',  # Extract audio using FFmpeg
        'preferredcodec': 'mp3',      # Convert to MP3
        'preferredquality': '192',    # Target bitrate (kbps)
    }],
    # Keep yt-dlp's %(ext)s placeholder in the template: the raw download gets
    # its real extension and the extractor swaps it for the target codec, so
    # the final file is expected to be <OUTPUR_DIR>/<OUTPUT_AUDIO>.mp3 - the
    # path the transcription cell loads ("downloads/audio.mp3").
    'outtmpl': f"{OUTPUR_DIR}/{OUTPUT_AUDIO}.%(ext)s",
    'noplaylist': True,               # Download the single video, not the playlist
    'progress_hooks': [_report_audio_progress],
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([TEST_LINK])

print("Audio download complete!")

## Transcription

In [None]:
import os
import torch
from transformers import AutoProcessor, AutoModelForCTC
import librosa
import numpy as np

# Input audio produced by the audio-download cell and the transcript destination.
AUDIO_PATH = "downloads/audio.mp3"
OUTPUT_DIR = "downloads"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "transcription.txt")

# Tunables for the chunked inference loop.
SAMPLE_RATE = 16000        # Hz; the audio is resampled to this rate when loaded
CHUNK_SECONDS = 15         # Length of each chunk fed to the model
MIN_CHUNK_SAMPLES = 1000   # Shorter trailing chunks are skipped

print("Loading model and processor...")
model_name = "auditi41/wav2vec2-large-xlsr-53-Bengali"  # Update the model if required
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForCTC.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(f"Using device: {device}")

try:
    print(f"Loading audio file: {AUDIO_PATH}")
    speech_array, sampling_rate = librosa.load(AUDIO_PATH, sr=SAMPLE_RATE)
    print(f"Audio length: {len(speech_array)/sampling_rate:.2f} seconds")

    chunk_length = SAMPLE_RATE * CHUNK_SECONDS
    transcription = []

    for i in range(0, len(speech_array), chunk_length):
        chunk_number = i // chunk_length + 1
        print(f"Processing chunk {chunk_number}...")
        chunk = speech_array[i:i + chunk_length]

        # Only the final chunk can be this short; it is skipped.
        if len(chunk) < MIN_CHUNK_SAMPLES:
            continue

        inputs = processor(
            chunk,
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt",
            padding="max_length",
            max_length=chunk_length,
        )

        inputs = {k: v.to(device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}

        with torch.no_grad():
            # Forward every tensor the processor produced, not just
            # input_values: when the processor returns an attention_mask, the
            # model can then ignore the zero padding added to a short chunk.
            logits = model(**inputs).logits

        # Greedy CTC decoding: most likely token per frame, collapsed by the processor.
        predicted_ids = torch.argmax(logits, dim=-1)
        chunk_text = processor.batch_decode(predicted_ids)[0]
        transcription.append(chunk_text)
        print(f"Chunk {chunk_number} transcribed")

    full_transcript = " ".join(transcription)

    # Make sure the destination exists even if OUTPUT_DIR is changed to a
    # directory other than the one holding the audio file.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.write(full_transcript)

    print(f"Transcription saved to {OUTPUT_FILE}")
    print(f"Preview: {full_transcript[:200]}...")

except Exception as e:
    # Best-effort cell: report the failure with its traceback instead of
    # aborting the notebook run.
    import traceback
    print(f"Error during transcription: {str(e)}")
    print(traceback.format_exc())

## Using BanglaASR

In [None]:

import os
import librosa
import torch
import torchaudio
import numpy as np

from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sample clip hosted next to the checkpoint, and the checkpoint itself.
mp3_path = "https://huggingface.co/bangla-speech-processing/BanglaASR/resolve/main/mp3/common_voice_bn_31515636.mp3"
model_path = "bangla-speech-processing/BanglaASR"

# Load every component of the checkpoint: feature extractor, tokenizer,
# combined processor, and the model (moved to the selected device).
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path)
tokenizer = WhisperTokenizer.from_pretrained(model_path)
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path)
model = model.to(device)

# Decode the clip, keep row 0 of the loaded tensor as a NumPy array, and
# resample it from its native rate to 16 kHz.
waveform, sampling_rate = torchaudio.load(mp3_path, format="mp3")
speech_array = librosa.resample(
    waveform[0].numpy(),
    orig_sr=sampling_rate,
    target_sr=16000,
)

# Turn the waveform into model input features.
extracted = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt")
input_features = extracted.input_features

# Generate token ids for the single clip in the batch and decode them to text.
generated = model.generate(inputs=input_features.to(device))
predicted_ids = generated[0]
transcription = processor.decode(predicted_ids, skip_special_tokens=True)

print(transcription)


In [1]:
# Environment check: show the installed torch version, then the torchaudio one.
import torch

torch_version = torch.__version__
print(torch_version)

import torchaudio

torchaudio_version = torchaudio.__version__
print(torchaudio_version)


2.1.0
2.1.0
