# Set up the environment

In [1]:
!pip install git+https://github.com/openai/whisper.git -q
!pip install torchaudio datasets jiwer gradio transformers noisereduce -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.0 MB/s[0m eta [36m0:

# Import libraries

In [None]:
from google.colab import drive, files
from datasets import Dataset
from tqdm import tqdm
from whisper.normalizers import EnglishTextNormalizer
from IPython.display import display, clear_output
import os
import glob
import whisper
import torchaudio
import torch
import pandas as pd
import jiwer
import noisereduce as nr

# Connect to Drive

In [2]:
# Mount Google Drive at /content/drive and point WORK_DIR at the project folder.
# The dataset path and the result files written later in this notebook are all
# built from WORK_DIR.
drive.mount('/content/drive')
WORK_DIR = '/content/drive/MyDrive/Intership-Assignment/BLULEAP-AI'
print(f"Working directory set to: {WORK_DIR}")

Mounted at /content/drive
Working directory set to: /content/drive/MyDrive/Intership-Assignment/BLULEAP-AI


# Prepare the dataset

In [3]:
DATA_DIR = f"{WORK_DIR}/Datasets/LibriSpeech"

# Get all .flac files (sorted so the sample order is stable between runs)
audio_files = sorted(glob.glob(f"{DATA_DIR}/**/*.flac", recursive=True))
print(f"Found {len(audio_files)} audio files.")


def _read_transcripts(folder):
    """Parse the first ``*.trans.txt`` file in ``folder`` into ``{utterance_id: text}``.

    Each non-blank line is split once on whitespace into an ID and its text;
    lines without both parts are skipped. Returns an empty dict when the
    folder contains no transcript file.
    """
    # Escape the folder so characters such as "[" in a path are not treated as glob patterns
    trans_files = sorted(glob.glob(os.path.join(glob.escape(folder), "*.trans.txt")))
    if not trans_files:
        return {}
    transcripts = {}
    with open(trans_files[0], "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                transcripts[parts[0]] = parts[1]
    return transcripts


# Read and merge text based on file ID.
# Every audio file in a folder shares the same transcript file, so each folder is
# parsed once and cached instead of being re-read for every .flac file.
transcripts_by_folder = {}
texts = []
for audio_file in audio_files:
    folder = os.path.dirname(audio_file)
    if folder not in transcripts_by_folder:
        transcripts_by_folder[folder] = _read_transcripts(folder)

    # Extract ID from the .flac file name (strip only the extension)
    flac_id = os.path.splitext(os.path.basename(audio_file))[0]
    texts.append(transcripts_by_folder[folder].get(flac_id, "No text available"))

# Create dataset: one row per audio file, with its path and reference text
dataset = Dataset.from_dict({"audio": audio_files, "text": texts})
print(f"Loaded {len(dataset)} samples from {DATA_DIR}.")

Found 2703 audio files.
Loaded 2703 samples from /content/drive/MyDrive/Intership-Assignment/BLULEAP-AI/Datasets/LibriSpeech.


# Build an STT system with Whisper

In [None]:
# Load model (base.en)
# The resulting global `model` is read by prepare_batch (for its device) and by
# the transcription loop further down in this cell (for decoding).
model = whisper.load_model("base.en")
print(f"Model loaded on {model.device}")

# Process audio
def process_audio(audio_path, target_sr=16000):
    """Load one audio file as a mono waveform padded/trimmed for Whisper.

    Args:
        audio_path: Path of the audio file to read with torchaudio.
        target_sr: Sample rate the waveform is converted to. The default of
            16000 Hz is the rate Whisper's audio front end is built around.

    Returns:
        ``(waveform, sample_rate)`` where ``waveform`` is a 1-D tensor passed
        through ``whisper.pad_or_trim`` and ``sample_rate`` is the rate of that
        returned waveform. Returns ``(None, None)`` if anything goes wrong
        while loading or converting; the error is printed, not raised.
    """
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        # torchaudio yields (channels, frames). Average the channels rather than
        # flattening, which would place the channels one after another in time.
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0)
        # Bring audio recorded at any other rate to the rate the model expects.
        if sample_rate != target_sr:
            waveform = torchaudio.functional.resample(waveform, sample_rate, target_sr)
            sample_rate = target_sr
        return whisper.pad_or_trim(waveform), sample_rate
    except Exception as e:
        # Best effort: report the file and let the caller skip it.
        print(f"Error loading {audio_path}: {e}")
        return None, None

# Prepare batch
def prepare_batch(audio_paths, texts):
    """Load a batch of audio files and stack them into one tensor on the model device.

    Files that fail to load (``process_audio`` returns ``None``) are skipped
    together with their reference text, so the returned texts always line up
    one-to-one with the rows of the returned tensor.

    Args:
        audio_paths: Iterable of audio file paths.
        texts: Reference texts, in the same order as ``audio_paths``.

    Returns:
        ``(batch_audios, kept_texts)`` where ``batch_audios`` is the stacked
        tensor moved to ``model.device`` and ``kept_texts`` is the list of
        texts for the files that loaded. Returns ``(None, None)`` when no file
        in the batch could be loaded (including an empty batch).
    """
    audios = []
    kept_texts = []
    for audio_path, text in zip(audio_paths, texts):
        audio, _ = process_audio(audio_path)
        if audio is not None:
            audios.append(audio)
            # Keep the text of *this* file; slicing texts[:len(audios)] would
            # pair the wrong texts whenever an earlier file was skipped.
            kept_texts.append(text)
    if not audios:
        return None, None
    batch_audios = torch.stack(audios).to(model.device)
    return batch_audios, kept_texts

BATCH_SIZE = 16

# STT
# The decoding options are identical for every batch, so build them once.
decode_options = whisper.DecodingOptions(language="en", without_timestamps=True)

transcriptions = []
for i in tqdm(range(0, len(dataset), BATCH_SIZE), desc="Transcribing"):
    # Slice the rows of this batch once (a dict of column -> list) instead of
    # indexing the whole 'audio' and 'text' columns on every iteration.
    batch = dataset[i:i + BATCH_SIZE]
    batch_audios, valid_texts = prepare_batch(batch['audio'], batch['text'])
    # prepare_batch returns None when no file of the batch could be loaded
    if batch_audios is not None:
        mels = whisper.log_mel_spectrogram(batch_audios)
        results = model.decode(mels, options=decode_options)
        transcriptions.extend(result.text.strip() for result in results)

# Save results: one transcription per line, in dataset order
transcriptions_path = f"{WORK_DIR}/transcriptions.txt"
with open(transcriptions_path, "w", encoding="utf-8") as f:
    f.writelines(f"{t}\n" for t in transcriptions)
print(f"Transcriptions saved to {transcriptions_path}. Total: {len(transcriptions)} samples.")

Model loaded on cuda:0


Transcribing: 100%|██████████| 169/169 [37:21<00:00, 13.26s/it]


Transcriptions saved to /content/drive/MyDrive/Intership-Assignment/BLULEAP-AI/transcriptions.txt. Total: 2703 samples.


# Test and Evaluate

In [None]:
# Sanity check: the number of transcriptions should equal the number of dataset rows
print(f"Transcriptions: {len(transcriptions)}, Dataset: {len(dataset)}")
if len(transcriptions) == len(dataset):
    print("Success!")

# Print up to five reference ("GT") / hypothesis ("Trans") pairs for a quick visual check
for i in range(min(5, len(dataset))):
    print(f"Sample {i+1}: GT: {dataset[i]['text']}, Trans: {transcriptions[i]}")

# Put references and hypotheses side by side in one table
# (the printed label below is Vietnamese for "first 5 rows")
data = pd.DataFrame({"reference": dataset['text'], "hypothesis": transcriptions})
print("\n5 dòng đầu:", data.head())

# Run Whisper's EnglishTextNormalizer over both columns; the cleaned columns are
# what the WER computation reads later
# (the printed label below is Vietnamese for "first 5 rows (normalized)")
normalizer = EnglishTextNormalizer()
data["ref_clean"] = list(map(normalizer, data["reference"]))
data["hyp_clean"] = list(map(normalizer, data["hypothesis"]))
print("\n5 dòng đầu (đã chuẩn hóa):", data[["ref_clean", "hyp_clean"]].head())

# Read the saved transcription file back and show its first five lines
# (the printed label below is Vietnamese for "5 file lines")
with open(f"{WORK_DIR}/transcriptions.txt", "r", encoding="utf-8") as f:
    head_lines = f.readlines()[:5]
print("\n5 dòng file:", [line.strip() for line in head_lines])

Transcriptions: 2703, Dataset: 2703
Success!
Sample 1: GT: MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL, Trans: Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.
Sample 2: GT: NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER, Trans: Nor is Mr. Quilter's manner less interesting than his matter.
Sample 3: GT: HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND, Trans: He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind.
Sample 4: GT: HE HAS GRAVE DOUBTS WHETHER SIR FREDERICK LEIGHTON'S WORK IS REALLY GREEK AFTER ALL AND CAN DISCOVER IN IT BUT LITTLE OF ROCKY ITHACA, Trans: He has grieved doubts whether Sir Frederick Layton's work is really Greek after

In [None]:
# WER of the base model: score every utterance on its own using the normalized
# text, then take the plain (unweighted) mean of those per-utterance scores.
# An empty score list yields 0 instead of dividing by zero.
wers = list(map(jiwer.wer, data["ref_clean"], data["hyp_clean"]))
if wers:
    average_wer = sum(wers) / len(wers)
else:
    average_wer = 0
print(f"\nAverage WER with base model: {average_wer * 100:.2f}%")


Average WER with base model: 5.27%


# Improve WER

In [None]:
# Improvement: Try the small model
model_small = whisper.load_model("small.en")
print(f"Model small loaded on {model_small.device}")

# The decoding options are identical for every batch, so build them once.
decode_options = whisper.DecodingOptions(language="en", without_timestamps=True)

transcriptions_small = []
for i in tqdm(range(0, len(dataset), BATCH_SIZE), desc="Transcribing small"):
    # Slice the rows of this batch once (a dict of column -> list) instead of
    # indexing the whole 'audio' and 'text' columns on every iteration.
    batch = dataset[i:i + BATCH_SIZE]
    batch_audios, valid_texts = prepare_batch(batch['audio'], batch['text'])
    # prepare_batch returns None when no file of the batch could be loaded
    if batch_audios is not None:
        mels = whisper.log_mel_spectrogram(batch_audios)
        results = model_small.decode(mels, options=decode_options)
        transcriptions_small.extend(result.text.strip() for result in results)

# Calculate WER with the small model: normalize the hypotheses, score each
# utterance against the already-normalized references, then average.
# zip() stops at the shorter of the two sequences.
hyp_clean_small = [normalizer(t) for t in transcriptions_small]
wers_small = [jiwer.wer(gt, hyp) for gt, hyp in zip(data["ref_clean"], hyp_clean_small)]
average_wer_small = sum(wers_small) / len(wers_small) if wers_small else 0
print(f"Average WER with small model: {average_wer_small * 100:.2f}%")

# Save result (explicit encoding, consistent with the other files this notebook writes)
with open(f"{WORK_DIR}/wer_results.txt", "w", encoding="utf-8") as f:
    f.write(f"Base WER: {average_wer * 100:.2f}%\nSmall WER: {average_wer_small * 100:.2f}%\n")

100%|███████████████████████████████████████| 461M/461M [00:08<00:00, 55.2MiB/s]


Model small loaded on cuda:0


Transcribing small: 100%|██████████| 169/169 [04:31<00:00,  1.61s/it]


Average WER with small model: 3.84%


# Handle noise and multilingual processing

In [9]:
# Reload the model
# Rebinds the global `model` to a fresh "base.en" instance; the helper and the
# test code in this cell read `model` for its device and for decoding.
model = whisper.load_model("base.en")
print(f"Model loaded on {model.device}")

# Audio processing function with noise reduction and shape normalization
def process_audio_with_noise(audio_path, target_sr=16000):
    """Load an audio file, denoise it, and pad/trim it for Whisper.

    Args:
        audio_path: Path of the audio file to read with torchaudio.
        target_sr: Sample rate the waveform is converted to before noise
            reduction. The default of 16000 Hz is the rate Whisper's audio
            front end is built around.

    Returns:
        ``(audio, sample_rate)`` where ``audio`` is a 1-D float32 tensor on
        ``model.device`` passed through ``whisper.pad_or_trim`` and
        ``sample_rate`` is the rate of that returned audio.

    Raises:
        Whatever ``torchaudio.load`` / ``noisereduce`` raise; errors are not
        caught here, so this function never returns ``None``.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    # torchaudio yields (channels, frames): mix down to a single channel
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)
    # Bring audio recorded at any other rate to the rate the model expects
    if sample_rate != target_sr:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_sr)
        sample_rate = target_sr
    noise_reduced = nr.reduce_noise(y=waveform.cpu().numpy(), sr=sample_rate)
    # Pin float32 so the tensor dtype does not depend on what noisereduce returns
    audio_tensor = torch.tensor(noise_reduced, dtype=torch.float32).to(model.device)
    return whisper.pad_or_trim(audio_tensor), sample_rate

# Test noise reduction on the first dataset sample
audio, sample_rate = process_audio_with_noise(dataset[0]['audio'])
if audio is not None:
    # A single mel spectrogram (batch of one) is shared by both decoding runs
    mels = whisper.log_mel_spectrogram(audio.unsqueeze(0))

    result_noise = model.decode(
        mels,
        options=whisper.DecodingOptions(language="en", without_timestamps=True),
    )[0]
    print(f"Transcription with noise reduction: {result_noise.text}")

    # Test multilingual: decode the same spectrogram again, asking for French.
    # NOTE(review): the recorded output of this cell is identical for "en" and
    # "fr", and `model` is the "base.en" checkpoint, so the language option
    # appears to have no effect here. Confirm whether a multilingual
    # checkpoint was intended.
    result_multi = model.decode(
        mels,
        options=whisper.DecodingOptions(language="fr", without_timestamps=True),
    )[0]
    print(f"Transcription in French: {result_multi.text}")

Model loaded on cuda:0
Transcription with noise reduction: Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.
Transcription in French: Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.


# Demo

In [16]:
# Load model
# Rebinds the global `model` to "base.en"; transcribe_audio below reads this
# global for its device and for decoding.
model = whisper.load_model("base.en")
print(f"Model loaded on {model.device}")

# Processing
def transcribe_audio(audio_path, reduce_noise=False, language="en"):
    """Transcribe one audio file with the globally loaded Whisper ``model``.

    Args:
        audio_path: Path of the audio file to read with torchaudio.
        reduce_noise: When truthy, run ``noisereduce`` on the waveform before
            transcription.
        language: Language code forwarded to ``whisper.DecodingOptions``.
            NOTE(review): the notebook loads the "base.en" checkpoint and its
            recorded outputs are identical for "en" and "fr", so this option
            appears to have no effect with that model. Confirm before relying
            on it.

    Returns:
        The decoded text of the (padded/trimmed) audio.
    """
    target_sr = 16000  # sample rate Whisper's audio front end is built around

    waveform, sample_rate = torchaudio.load(audio_path)
    # torchaudio yields (channels, frames). Both paths need a 1-D mono signal:
    # keeping the channel axis and then adding a batch axis gives a 3-D tensor,
    # which the STFT-based mel front end does not accept.
    audio = waveform.mean(dim=0)
    # Uploaded files may use any sample rate; convert to what the model expects
    if sample_rate != target_sr:
        audio = torchaudio.functional.resample(audio, sample_rate, target_sr)
        sample_rate = target_sr
    if reduce_noise:
        noise_reduced = nr.reduce_noise(y=audio.cpu().numpy(), sr=sample_rate)
        audio = torch.tensor(noise_reduced, dtype=torch.float32)
    audio_padded = whisper.pad_or_trim(audio.to(model.device))
    mels = whisper.log_mel_spectrogram(audio_padded.unsqueeze(0))
    result = model.decode(mels, options=whisper.DecodingOptions(language=language, without_timestamps=True))[0]
    return result.text

# Colab Forms
# files.upload() opens the browser upload dialog and returns {filename: content}
uploaded_file = files.upload()
# A boolean form field needs a real bool. The previous string value was always
# truthy, so True keeps the behaviour the recorded demo actually ran with.
reduce_noise = True #@param {type:"boolean"}
language = "en" #@param ["en", "fr"]

if uploaded_file:
    # Only the first uploaded file is transcribed
    audio_path = list(uploaded_file.keys())[0]
    transcription = transcribe_audio(audio_path, reduce_noise, language)
    # Drop the upload widget output so only the result remains visible
    clear_output()
    print(f"Transcription: {transcription}")
    print(f"Options: Reduce Noise={reduce_noise}, Language={language}")
else:
    print("Please upload an audio file (.wav, .flac) to proceed.")

Transcription: Therefore, my answer is with greater care that he may hear me who is weeping yonder so that the sin and dull be of one measure.
Options: Reduce Noise=Reduce Noise, Language=en
