<a href="https://colab.research.google.com/github/lilfetz22/audiobook_validation/blob/main/audiobook_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install -U openai-whisper &> /dev/null

In [10]:
# --- Step 2: Imports and Memory Cleanup ---
import torch
import gc
import whisper # This is the official library
import json
import os
from google.colab import drive

# Reclaim memory up front: run the Python garbage collector first, then ask
# PyTorch to release the CUDA memory it is holding in its cache.
print("Clearing memory...")
for release in (gc.collect, torch.cuda.empty_cache):
    release()
print("✅ Memory cleared.")

Clearing memory...
✅ Memory cleared.


In [11]:
# Attach Google Drive under /content/drive so the notebook can read the audio
# files and write the generated JSON next to them.
from google.colab import drive

drive.mount("/content/drive")
print("✅ Google Drive Mounted. You can now access your files.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive Mounted. You can now access your files.


In [None]:
#@title Step 4: Specify the Folder That Contains Your Audiobooks
#@markdown 1. Mount your Google Drive in the cell above.
#@markdown 2. Find the folder that holds your translated audiobooks in the file browser on the left.
#@markdown 3. Right-click the folder and select "Copy path".
#@markdown 4. Paste the path into the `path_to_translated_audiobooks` field below (keep the trailing `/`).
#@markdown 5. Fill in `book_path` and `language` with the sub-folders (each ending in `/`) that lead to the chapter MP3s, or leave them empty if the path above already points at the chapter folder. Then run this cell.

import os

# Each `#@param` line creates a Colab form field; the value typed into the form
# is stored in the variable on the left-hand side.
path_to_translated_audiobooks = "/content/drive/MyDrive/" #@param {type:"string"}
# The transcription cell further down concatenates these two values onto the
# root path, so they have to be defined before that cell runs.
# NOTE(review): the recorded output suggests "Leadership/" and "French/" were
# used for the saved run — confirm the intended defaults.
book_path = "" #@param {type:"string"}
language = "" #@param {type:"string"}

# Folder the chapter MP3s are expected in, built the same way the
# transcription cell builds it.
chapter_folder = path_to_translated_audiobooks + book_path + language

# --- Validation ---
# A directory is required here, so check with isdir rather than exists
# (exists would also accept a path that points at a single file).
if not path_to_translated_audiobooks.strip():
  print("❌ ERROR: The audiobook folder path cannot be empty. Please paste the path and run again.")
elif not os.path.isdir(path_to_translated_audiobooks):
  print(f"❌ ERROR: No folder was found at the specified path: {path_to_translated_audiobooks}")
  print("Please check the path is correct and that your Google Drive is mounted.")
elif not os.path.isdir(chapter_folder):
  print(f"❌ ERROR: The chapter folder was not found: {chapter_folder}")
  print("Please check the `book_path` and `language` values (each should end with '/').")
else:
  print("✅ Folder found successfully!")
  print(f"   -> {chapter_folder}")

In [13]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
    print("✅ GPU found. Using CUDA for processing.")
else:
    device = "cpu"
    print("⚠️ GPU not found. Using CPU. Processing will be very slow.")

# Name of the Whisper checkpoint to load; this notebook uses 'medium'.
model_name = "medium"
print(f"Loading official Whisper '{model_name}' model...")
# Load the checkpoint onto the device selected above.
model = whisper.load_model(model_name, device=device)
print("✅ Model Loaded.")

✅ GPU found. Using CUDA for processing.
Loading official Whisper 'medium' model...
✅ Model Loaded.


In [14]:
# --- Step 5: Transcribe and Generate Timestamp File ---
# `path_to_translated_audiobooks`, `book_path` and `language` must be set by an
# earlier cell before this one runs.
# os.path.join gives the same result as plain concatenation when every part ends
# in "/", and still builds a valid path when a trailing slash is missing.
chapter_dir = os.path.join(path_to_translated_audiobooks, book_path, language)

# Chapter numbers to process; add more numbers to transcribe several chapters.
for chap_num in [2]:
    # Alternative file-name pattern kept from the original cell:
    #   f"Capitulo {chap_num}.mp3"
    audio_file_path = os.path.join(chapter_dir, f"{chap_num} - ADP.mp3")

    # Fail early with a readable message instead of letting the audio decoder
    # raise a much less obvious error after the call has started.
    if not os.path.isfile(audio_file_path):
        raise FileNotFoundError(
            f"Audio file not found: {audio_file_path}. "
            "Check the chapter number, the file-name pattern and the folder settings."
        )

    print(f"\nStarting transcription of: {os.path.basename(audio_file_path)}...")

    # word_timestamps=True asks Whisper to attach a "words" list to every segment.
    result = model.transcribe(audio_file_path, word_timestamps=True)
    detected_language = result["language"]
    print(f"Transcription complete. Detected language: {detected_language.upper()}")

    # Reshape Whisper's per-segment output into one flat, ordered word list.
    # Each entry is passed through unchanged, e.g.
    # {'word': ' Chapitre', 'start': 0.6, 'end': 1.08, 'probability': 0.93}
    final_output = {
        "language": detected_language,
        "transcription_text": result["text"],
        "words": [
            word_info
            for segment in result["segments"]
            for word_info in segment["words"]
        ],
    }

    # Save the JSON next to the audio file: "<audio name>_timestamps.json".
    output_path = os.path.splitext(audio_file_path)[0] + "_timestamps.json"

    # ensure_ascii=False keeps accented characters readable in the saved file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, indent=2, ensure_ascii=False)

    print("\n✅ All Done!")
    print(f"Timestamp file saved successfully to: {output_path}")

    # Optional: Display the first 20 words from the output for verification
    print("\n--- Verification: First 20 words ---")
    print(json.dumps(final_output["words"][:20], indent=2, ensure_ascii=False))


Starting transcription of: 2 - ADP.mp3...
Transcription complete. Detected language: FR

✅ All Done!
Timestamp file saved successfully to: /content/drive/MyDrive/AIA/Translated Audiobooks/Leadership/French/2 - ADP_timestamps.json

--- Verification: First 20 words ---
[
  {
    "word": " Chapitre",
    "start": 0.6000000000000063,
    "end": 1.0800000000000032,
    "probability": 0.935927172501882
  },
  {
    "word": " 2.",
    "start": 1.0800000000000032,
    "end": 1.56,
    "probability": 0.871484100818634
  },
  {
    "word": " Ce",
    "start": 1.74,
    "end": 1.9,
    "probability": 0.8392883539199829
  },
  {
    "word": " que",
    "start": 1.9,
    "end": 2.12,
    "probability": 0.9950376152992249
  },
  {
    "word": " vous",
    "start": 2.12,
    "end": 2.3,
    "probability": 0.9964411854743958
  },
  {
    "word": " dites",
    "start": 2.3,
    "end": 2.62,
    "probability": 0.9755383133888245
  },
  {
    "word": " quand",
    "start": 2.62,
    "end": 3.04,
    "pr