In [51]:
import whisper
import torch
import os
import ffmpeg
import numpy as np
import pandas as pd
from pyannote.audio import Pipeline
from moviepy.editor import VideoFileClip

def extract_audio(video_path, audio_path):
    """Extract the audio track of a video and save it as a .wav file.

    Args:
        video_path: Path of the input video file.
        audio_path: Destination path for the extracted audio
            (written with the ``pcm_s16le`` codec).

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        video.audio.write_audiofile(audio_path, codec='pcm_s16le')
    finally:
        # Always release the clip's underlying readers, even when the
        # export fails; otherwise the handles stay open.
        video.close()

def transcribe_audio(audio_path, model_name="medium"):
    """Transcribe an audio file with OpenAI Whisper.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name of the Whisper model to load.

    Returns:
        The result object returned by the model's ``transcribe`` call.
    """
    return whisper.load_model(model_name).transcribe(audio_path)

def diarize_audio(audio_path, diarization_pipeline):
    """Run speaker diarization on an audio file.

    Args:
        audio_path: Path of the audio file to analyse.
        diarization_pipeline: Callable that takes the audio path and returns
            an annotation object exposing ``itertracks(yield_label=True)``.

    Returns:
        A list of ``(start, end, speaker)`` tuples, one per speaker turn.
    """
    diarization = diarization_pipeline(audio_path)
    # With yield_label=True each item is a (segment, track, label) triple;
    # the track identifier is not needed here.
    return [
        (segment.start, segment.end, speaker)
        for segment, _track, speaker in diarization.itertracks(yield_label=True)
    ]

def merge_transcription_and_diarization(transcription, diarization):
    """Combine a transcription with diarization to label who said what.

    Each transcription segment is assigned the speaker whose diarization
    turns overlap it the most in total. Ties go to the speaker seen first.
    Segments that no turn overlaps or touches are labelled ``"Unknown"``.

    Args:
        transcription: Mapping with a ``"segments"`` list; each segment has
            ``"start"``, ``"end"`` and ``"text"`` keys.
        diarization: Iterable of ``(start, end, speaker)`` tuples.

    Returns:
        A list of ``(start, end, speaker, text)`` tuples, one per segment.
    """
    final_transcription = []

    for segment in transcription["segments"]:
        start_time, end_time = segment["start"], segment["end"]

        # Total overlap per speaker. Using interval intersection also catches
        # turns that lie entirely inside the transcription segment, which an
        # endpoint-containment check would miss.
        overlap_by_speaker = {}
        for d_start, d_end, spk in diarization:
            overlap = min(end_time, d_end) - max(start_time, d_start)
            if overlap >= 0:  # >= keeps turns that merely touch the segment
                overlap_by_speaker[spk] = overlap_by_speaker.get(spk, 0.0) + overlap

        speaker = max(
            overlap_by_speaker, key=overlap_by_speaker.get, default="Unknown"
        )
        final_transcription.append((start_time, end_time, speaker, segment["text"]))

    return final_transcription

def _srt_timestamp(seconds):
    """Format a time in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Work in whole milliseconds so float error cannot drop a millisecond
    # (e.g. 1.001 % 1 * 1000 evaluates to 0.9999...) and so rounding carries
    # correctly into seconds, minutes and hours.
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def format_srt(transcription, output_path):
    """Write the transcription to a file in SRT format.

    Args:
        transcription: Iterable of ``(start, end, speaker, text)`` tuples,
            with ``start`` and ``end`` expressed in seconds.
        output_path: Path of the SRT file to write (UTF-8).
    """
    with open(output_path, "w", encoding="utf-8") as f:
        for index, (start, end, speaker, text) in enumerate(transcription, start=1):
            f.write(
                f"{index}\n"
                f"{_srt_timestamp(start)} --> {_srt_timestamp(end)}\n"
                f"{speaker}: {text}\n\n"
            )

# Input file path (audio or video)
input_path = "prueba1.mp3"  # May be .mp3, .wav, .mp4, etc.
audio_path = "temp_audio.wav"
srt_output = "output.srt"

# 1. Extract the audio if the input is a video (extension check ignores case)
if input_path.lower().endswith(('.mp4', '.avi', '.mov')):
    extract_audio(input_path, audio_path)
else:
    audio_path = input_path  # Already an audio file

# 2. Transcribe the audio with Whisper
transcription = transcribe_audio(audio_path)

# 3. Diarization with pyannote (requires a Hugging Face token).
# The token is read from the environment instead of being hard-coded:
# secrets must not live in source. Any token previously committed to this
# file should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token"
    )
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
diarization = diarize_audio(audio_path, pipeline)

# 4. Merge transcription and diarization
final_transcription = merge_transcription_and_diarization(transcription, diarization)

# 5. Save in SRT format. Write the merged (speaker-labelled) result, not the
# raw Whisper output.
format_srt(final_transcription, srt_output)

print(f"Transcripción guardada en {srt_output}")


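# Cell output: ModuleNotFoundError: No module named 'pyannote.audio'
# (the pyannote.audio package is not installed in this environment; install it with `pip install pyannote.audio`).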