# WhisperX
Requisitos:
- Python 3.10 o superior
- CUDA 12.1
- Torch 2.5.1+cu121
- ffmpeg
- cuDNN - libcudnn8

In [None]:
# Check whether the GPU is enabled for this runtime.
!nvidia-smi

### Instalar ffmpeg

In [None]:
# Install ffmpeg with apt-get; -y answers the confirmation prompt automatically.
!apt-get install -y ffmpeg

In [None]:
# Print the ffmpeg version to confirm the binary is installed and on PATH.
!ffmpeg -version

### Instalar PyTorch + CUDA 12.1

In [None]:
# Install pinned torch / torchvision / torchaudio versions from the PyTorch cu121 wheel index.
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121


### Instalar WhisperX - Commit específico del 2 de julio de 2025
https://github.com/m-bain/whisperX/commit/2d9ce44329ae73af2520196d31cd14b6192ace44


In [None]:
# Install WhisperX straight from GitHub, pinned to an exact commit hash.
!pip install git+https://github.com/m-bain/whisperx.git@2d9ce44329ae73af2520196d31cd14b6192ace44


### Verificación de la instalación

In [None]:
import torch

# Report the installed PyTorch build, the CUDA version it was built against,
# and whether a CUDA device is usable from this runtime.
print("Torch:", torch.__version__)
print("CUDA:", torch.version.cuda)
print("Disponible:", torch.cuda.is_available())

# get_device_name(0) raises when no CUDA device is available, so only query it
# on a GPU runtime; otherwise this diagnostic cell would crash instead of
# reporting the problem.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no disponible"
print("GPU:", gpu_name)

### cuDNN - libcudnn8
https://developer.nvidia.com/rdp/cudnn-archive

In [None]:
# Refresh the apt package index, then install the libcudnn8 and libcudnn8-dev packages.
!sudo apt update
!sudo apt install libcudnn8 libcudnn8-dev -y

### Probar WhisperX

In [None]:
# Upload an audio file through Colab's file-upload widget.
# NOTE(review): `uploaded` presumably maps each uploaded file name to its
# contents — confirm against the google.colab documentation.
from google.colab import files
uploaded = files.upload()

In [None]:
import whisperx
import torch

# Model quality settings: change these to trade accuracy against speed/memory.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU when no CUDA device is available
COMPUTE_TYPE = "int8"
BATCH_SIZE = 10
MODEL_SIZE = "medium"

# For better quality (but slower), use instead:
# COMPUTE_TYPE = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
# BATCH_SIZE = 16 # reduce if low on GPU mem
# MODEL_SIZE = "large-v2"

# Load the WhisperX model with the settings above.
model = whisperx.load_model(MODEL_SIZE, DEVICE, compute_type=COMPUTE_TYPE)



In [None]:
# Try the model end to end. The path is fixed, so a file named sample.mp3
# must be present in the working directory.
audio = whisperx.load_audio("sample.mp3")
result = model.transcribe(audio, batch_size=BATCH_SIZE)
# Uncomment to inspect the raw segments:
# print(result['segments'])

# Build the full transcript by joining the text of every segment with spaces.
segment_texts = (seg["text"] for seg in result["segments"])
full_transcription_text = " ".join(segment_texts)
print(full_transcription_text)

In [None]:
# Align the transcript against the audio to obtain per-word timings.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=DEVICE)
result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)


def _optional_float(value):
    """Return ``value`` converted to float, or None when it is missing."""
    return None if value is None else float(value)


# Flatten the aligned segments into one list of word entries.
# A word entry may lack "start", "end" or "score" (see the WhisperX README on
# words that cannot be aligned); calling float() on the resulting None raises
# TypeError, so missing values are kept as None instead.
transcription = []
for seg in result["segments"]:
    for word in seg.get("words", []):
        transcription.append({
            "word": word.get("word"),
            "start": _optional_float(word.get("start")),
            "end": _optional_float(word.get("end")),
            "score": _optional_float(word.get("score")),
        })

print(transcription)

# FASTAPI


In [None]:
# Install the packages used by the API cells below: FastAPI, uvicorn, pyngrok and nest-asyncio.
!pip install fastapi uvicorn pyngrok nest-asyncio

In [None]:
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
import tempfile
import shutil

# Application instance that the route decorators below register their endpoints on.
app = FastAPI()

@app.get("/")
def root():
    """Return a fixed greeting so callers can check that the API is reachable."""
    greeting = {"mensaje": "Hola desde Google Colab"}
    return greeting

def _word_entry(word):
    """Convert one aligned word dict into a JSON-serializable entry.

    Timing or score fields that are absent from ``word`` are reported as None
    rather than passed to float(), which would raise TypeError.
    """
    def to_float(value):
        return None if value is None else float(value)

    return {
        "word": word.get("word"),
        "start": to_float(word.get("start")),
        "end": to_float(word.get("end")),
        "score": to_float(word.get("score")),
    }


@app.post("/asr-analyze")
async def analyze_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return word-level timings.

    Args:
        file: The uploaded audio file (multipart form field ``file``).

    Returns:
        A JSONResponse with:
        - 200: ``transcripcion_completa`` (segment texts joined with spaces)
          and ``palabras`` (a list of word/start/end/score entries; start, end
          and score are null for words that came back without them).
        - 400: an ``error`` message when no words were produced.
        - 500: an ``error`` message carrying the text of any exception raised
          while processing.
    """
    # Function-scope import: this cell's import list does not include os.
    import os

    tmp_path = None
    try:
        # Persist the upload to disk so it can be opened by path.
        # delete=False keeps the file after the handle is closed; it is
        # removed explicitly in the finally clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(file.file, tmp)

        # Load and transcribe.
        audio = whisperx.load_audio(tmp_path)
        result = model.transcribe(audio, batch_size=BATCH_SIZE)

        # Capture the full text before `result` is replaced by the aligned output.
        full_transcription_text = " ".join(seg["text"] for seg in result["segments"])

        # Align to obtain per-word timings.
        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=DEVICE)
        result = whisperx.align(result["segments"], model_a, metadata, audio, DEVICE, return_char_alignments=False)

        # Flatten every segment's words into a single list.
        transcription = [
            _word_entry(word)
            for seg in result["segments"]
            for word in seg.get("words", [])
        ]

        if not transcription:
            return JSONResponse(status_code=400, content={"error": "Audio vacío o sin contenido transcribible."})

        return JSONResponse(content={
            "transcripcion_completa": full_transcription_text,
            "palabras": transcription
        })

    except Exception as e:
        # Top-level boundary of the endpoint: report the failure to the client
        # as a 500 instead of letting the exception propagate.
        return JSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # Always remove the temporary upload, on success and on failure, so
        # repeated requests do not fill the disk.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)


### NGROK Auth Token
https://dashboard.ngrok.com/get-started/your-authtoken

Define acá tu propio token de ngrok.

In [None]:
# Placeholder ngrok auth token: replace "token" with your own value.
NGROK_TOKEN = "token"

In [None]:
import nest_asyncio
import uvicorn
from pyngrok import conf, ngrok

# One definition of the port, so the tunnel and the server cannot drift apart.
API_PORT = 8000

# Patch asyncio so an event loop can run inside the notebook's own loop.
nest_asyncio.apply()

conf.get_default().auth_token = NGROK_TOKEN

# Open a public tunnel to the local port.
public_url = ngrok.connect(API_PORT)
print(f"🚀 API disponible en: {public_url}")

try:
    # Run the server; this call blocks until the server stops or the cell is interrupted.
    uvicorn.run(app, host="0.0.0.0", port=API_PORT)
finally:
    # Close the tunnel when the server exits so re-running this cell does not
    # accumulate open tunnels. ngrok.connect may return a tunnel object or a
    # plain URL string depending on the pyngrok version, so handle both.
    ngrok.disconnect(getattr(public_url, "public_url", public_url))
