In [1]:
!pip install -q faster-whisper librosa pandas
!apt-get -y install -qq ffmpeg

# Ver GPU y memoria (deberías ver NVIDIA A100-SXM4-40GB)
!nvidia-smi

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.8/38.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hTue Sep  9 02:13:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+-----------------------

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pathlib import Path

# 👇 Cambia esta ruta a tu carpeta real con los .gsm
#INPUT_DIR = Path("/content/drive/MyDrive/Bootcamp Llamadas Cobranza/Audios Llamadas Cobranza gsm Completo")
INPUT_DIR = Path("/content/drive/MyDrive/Bootcamp Llamadas Cobranza/Audios Adicionales")
OUTPUT_DIR = Path("/content/drive/MyDrive/Bootcamp Llamadas Cobranza/Transcripciones Llamadas Cobranza")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_CSV = OUTPUT_DIR / "transcripciones_gsm_adicionales.csv"

print("📂 INPUT_DIR:", INPUT_DIR)
print("📂 OUTPUT_DIR:", OUTPUT_DIR)
print("📄 CSV:", OUT_CSV)

📂 INPUT_DIR: /content/drive/MyDrive/Bootcamp Llamadas Cobranza/Audios Adicionales
📂 OUTPUT_DIR: /content/drive/MyDrive/Bootcamp Llamadas Cobranza/Transcripciones Llamadas Cobranza
📄 CSV: /content/drive/MyDrive/Bootcamp Llamadas Cobranza/Transcripciones Llamadas Cobranza/transcripciones_gsm_adicionales.csv


In [4]:
import subprocess, shlex, tempfile, os, time
import pandas as pd
import numpy as np
import librosa
from faster_whisper import WhisperModel

def ffmpeg_to_wav16k_mono(src_path: Path) -> Path:
    """
    Convierte cualquier audio (incl. .gsm) a WAV 16 kHz mono con ffmpeg.
    Devuelve la ruta de un archivo temporal .wav (debe eliminarse luego).
    """
    tmp_wav = Path(tempfile.mkstemp(suffix=".wav")[1])
    cmd = f'ffmpeg -y -loglevel error -i {shlex.quote(str(src_path))} -ac 1 -ar 16000 -f wav {shlex.quote(str(tmp_wav))}'
    subprocess.run(cmd, shell=True, check=True)
    return tmp_wav

def transcribir_archivo(path: Path, model: WhisperModel, lang: str = "es") -> tuple:
    """
    Transcribe un archivo .gsm.
    Flujo: .gsm -> (ffmpeg) WAV16k mono -> librosa -> faster-whisper
    Devuelve (duracion_seg, texto).
    """
    wav_tmp = ffmpeg_to_wav16k_mono(path)
    try:
        duracion = librosa.get_duration(filename=str(wav_tmp))
        audio, _ = librosa.load(str(wav_tmp), sr=16000, mono=True)

        segments, _ = model.transcribe(
            audio=audio,
            language=lang,   # "es" recomendado; None = autodetección
            vad_filter=True,
            beam_size=1,     # rápido y buena calidad
            best_of=1
        )
        texto = " ".join(s.text.strip() for s in segments)
    finally:
        try:
            os.remove(wav_tmp)
        except:
            pass
    return duracion, texto

# ---- Cargar modelo preferentemente en GPU A100 con float16 ----
try:
    import torch
    if torch.cuda.is_available():
        device, compute_type = "cuda", "float16"
        print("🚀 GPU detectada. Usando CUDA + float16.")
    else:
        device, compute_type = "cpu", "int8"
        print("⚠️ No se detectó GPU; usando CPU + int8.")
except Exception as e:
    device, compute_type = "cpu", "int8"
    print("⚠️ torch no disponible; usando CPU + int8.", e)

MODEL_SIZE = "medium"  # si quieres aún más precisión (más lento): "large-v3"
model = WhisperModel(MODEL_SIZE, device=device, compute_type=compute_type)
print(f"✅ Modelo '{MODEL_SIZE}' cargado en {device} ({compute_type})")

🚀 GPU detectada. Usando CUDA + float16.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

✅ Modelo 'medium' cargado en cuda (float16)


In [5]:
# Listar .gsm
files = sorted(INPUT_DIR.glob("*.gsm"))
print(f"🎧 Audios .gsm encontrados: {len(files)}")

# Reanudar si ya existe CSV
ya_hechos = set()
if OUT_CSV.exists():
    try:
        prev = pd.read_csv(OUT_CSV)
        if {"archivo","transcripcion","duracion_seg"}.issubset(prev.columns):
            ya_hechos = set(prev["archivo"].astype(str).tolist())
            print(f"🔁 Reanudando: {len(ya_hechos)} ya procesados")
    except Exception as e:
        print("⚠️ No pude leer CSV previo; se creará de nuevo.", e)

pendientes = [p for p in files if p.name not in ya_hechos]
print(f"👉 Por procesar ahora: {len(pendientes)}")

# Crear CSV con encabezado si no existe
if not OUT_CSV.exists():
    pd.DataFrame(columns=["archivo","duracion_seg","transcripcion"]).to_csv(OUT_CSV, index=False, encoding="utf-8")

# Procesar uno por uno guardando incrementalmente
start_all = time.time()
ok, err = 0, 0

with open(OUT_CSV, "a", encoding="utf-8") as f:
    for i, p in enumerate(pendientes, start=1):
        t0 = time.time()
        try:
            dur, txt = transcribir_archivo(p, model, lang="es")
            safe_txt = txt.replace('"','""')  # escapar comillas dobles
            f.write(f"\"{p.name}\",{dur:.1f},\"{safe_txt}\"\n")
            f.flush()
            ok += 1
            print(f"[{i}/{len(pendientes)}] {p.name} ✔️ {dur:.1f}s audio, {time.time()-t0:.1f}s proc")
        except Exception as e:
            err += 1
            print(f"[{i}/{len(pendientes)}] {p.name} ❌ ERROR {e}")

elapsed = time.time() - start_all
print(f"\n✅ Terminado. {ok} OK, {err} errores. Tiempo total: {elapsed/60:.1f} min")
print("📄 CSV final:", OUT_CSV)

🎧 Audios .gsm encontrados: 374
👉 Por procesar ahora: 374


	This alias will be removed in version 1.0.
  duracion = librosa.get_duration(filename=str(wav_tmp))


[1/374] agent.12302.date.2024-03-15-14-07-38.tel.12730.id.1710529239.97149.035713.hung_up.customer.gsm ✔️ 224.0s audio, 21.8s proc


	This alias will be removed in version 1.0.
  duracion = librosa.get_duration(filename=str(wav_tmp))


[2/374] agent.12327.date.2024-05-15-16-18-18.cod.2002.tel.93053236091.id.12327.9612.1715807654.69771.hung_up.customer.gsm ✔️ 243.7s audio, 8.3s proc
[3/374] agent.12328.date.2024-08-05-10-57-33.tel.12781.id.1722872776.19700.510459.hung_up.customer.gsm ✔️ 392.4s audio, 12.4s proc
[4/374] agent.12328.date.2024-08-06-11-34-13.tel.12730.id.1722961214.28086.253394.hung_up.customer.gsm ✔️ 524.2s audio, 18.8s proc
[5/374] agent.12328.date.2024-09-10-10-14-10.tel.12730.id.1725980774.26222.100219.hung_up.customer.gsm ✔️ 250.7s audio, 15.4s proc
[6/374] agent.12347.date.2024-01-05-16-31-16.cod.2001.tel.93228860007.id.12347.9626.1704490089.70824.hung_up.customer.gsm ✔️ 187.0s audio, 6.2s proc
[7/374] agent.12351.date.2024-05-29-09-40-47.tel.12346.id.1716992799.19859.312511.hung_up.customer.gsm ✔️ 573.2s audio, 25.7s proc
[8/374] agent.12351.date.2024-10-16-16-09-38.tel.12941.id.1729112210.83541.001787.hung_up.customer.gsm ✔️ 563.0s audio, 18.3s proc
[9/374] agent.12536.date.2024-06-24-16-39-06.co