In [None]:
pip install langgraph openai ffmpeg-python requests python-dotenv

In [None]:
import subprocess
import os
import json
from langchain.tools import Tool
from langchain.tools import StructuredTool
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
import requests
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI

# Load environment variables
_ = load_dotenv()
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")  # Set your ElevenLabs API key in the .env file

In [9]:
# First create the schema
class ExtractAudioSchema(BaseModel):
    video_path: str
    output_audio_path: str


def extract_audio(video_path: str, output_audio_path: str = None) -> str:
    """Extracts audio from a video file using FFmpeg and ensures it exists before returning."""
    
    if output_audio_path is None:
        base_name = os.path.splitext(os.path.basename(video_path))[0]
        output_audio_path = os.path.join("downloads", f"{base_name}_audio.wav")

    print(f"Extracting audio from {video_path} to {output_audio_path}")

    subprocess.run(["ffmpeg", "-i", video_path, "-q:a", "0", "-map", "a", output_audio_path], check=True)

    # Wait for file to exist (retry for 5 seconds)
    for _ in range(10):  # Check 10 times (every 0.5s)
        if os.path.exists(output_audio_path) and os.path.getsize(output_audio_path) > 0:
            print(f"✅ Audio extraction successful: {output_audio_path}")
            return output_audio_path
        time.sleep(0.5)

    raise FileNotFoundError(f"Audio extraction failed: {output_audio_path} was not created or is empty.")

# Register extraction tool
extract_audio_tool = StructuredTool(
    name="ExtractAudio",
    func=extract_audio,
    description="Extracts audio from a video file using FFmpeg. Ensures the extracted file exists before returning.",
    args_schema=ExtractAudioSchema
)

# Initialize OpenAI Client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Define Schema
class TranscribeAudioSchema(BaseModel):
    audio_path: str

# Transcribe Audio Tool using OpenAI Whisper API: https://platform.openai.com/docs/guides/speech-to-text
# File Size Limit: 25 MB (for a single request)
# Duration Limit: ~30 minutes (varies depending on bitrate and file compression)
# If your audio file exceeds 25 MB, you’ll need to chunk it into smaller segments before processing!    
def transcribe_audio(audio_path: str) -> str:
    """Transcribes audio using OpenAI Whisper API and saves the transcript with timestamps in a new file."""

    # Ensure file exists
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    # Ensure file format is correct
    if not audio_path.endswith(".wav"):
        raise ValueError(f"Expected a WAV file, but got: {audio_path}")

    with open(audio_path, "rb") as audio_file:
        # Call OpenAI Whisper API with word-level timestamps
        transcription = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            response_format="verbose_json",
            timestamp_granularities=["word"]  # Supports [word, segment] granularity
        )

    # Create a new file for the transcript (without replacing anything)
    new_filename = f"{os.path.splitext(os.path.basename(audio_path))[0]}_transcript.json"
    transcript_path = os.path.join(os.path.dirname(audio_path), new_filename)

    # Save full API response as a new JSON file
    with open(transcript_path, "w") as f:
        json.dump(transcription.model_dump(), f, indent=2)

    return transcript_path

# Register as a StructuredTool
transcribe_audio_tool = StructuredTool(
    name="TranscribeAudio",
    func=transcribe_audio,
    description="Transcribes an audio file into text using OpenAI Whisper API and saves the transcript with timestamps in a new file.",
    args_schema=TranscribeAudioSchema
)


llm = ChatOpenAI(model="gpt-4o", temperature=0)

class CleanTranscriptSchema(BaseModel):
    transcript_path: str

def clean_transcript(transcript_path: str) -> str:
    """Cleans up the transcript while preserving word timestamps and inserting placeholders for removed filler words."""

    # Ensure file exists
    if not os.path.exists(transcript_path):
        raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

    # Load the original transcript JSON
    with open(transcript_path, "r") as file:
        transcript_data = json.load(file)

    if "words" not in transcript_data:
        raise ValueError("Invalid transcript format: Missing 'words' field.")

    words = transcript_data["words"]
    original_text = " ".join([w["word"] for w in words])
    print(f"✅ Transcript passed to LLM for cleaning: {original_text}")

    # Update prompt to insert placeholders where filler words were removed
    prompt = f"""
    Clean up this transcript while preserving its structure:

    - Remove filler words (e.g., "uh", "um", "mm-hmm").
    - Wherever a filler word is removed, insert "......" as a placeholder.
    - Improve readability and coherence.
    - Do NOT modify word timestamps.
    - Output only the cleaned words and placeholders.

    Transcript:
    {original_text}
    """

    response = llm.invoke(prompt)
    cleaned_text = response.content.strip()

    # Split cleaned text into words (including placeholders)
    cleaned_words = cleaned_text.split()
    cleaned_word_list = []
    original_word_iter = iter(words)

    for cleaned_word in cleaned_words:
        while True:
            original_word = next(original_word_iter, None)
            if original_word is None:
                break  # No more words to process

            if cleaned_word == "......":
                # Insert a placeholder with no timestamp
                cleaned_word_list.append({"word": "......"})
                break

            if original_word["word"].lower() == cleaned_word.lower():
                cleaned_word_list.append({
                    "word": original_word["word"],
                    "start": original_word["start"],
                    "end": original_word["end"]
                })
                break  # Move to the next cleaned word

    # Save cleaned transcript with timestamps
    cleaned_transcript_path = transcript_path.replace("_transcript.json", "_cleaned_transcript.json")

    cleaned_transcript_data = {
        "duration": transcript_data.get("duration"),
        "language": transcript_data.get("language"),
        "text": " ".join([w["word"] for w in cleaned_word_list]),
        "words": cleaned_word_list
    }

    with open(cleaned_transcript_path, "w") as f:
        json.dump(cleaned_transcript_data, f, indent=2)

    return cleaned_transcript_path


# Register as a StructuredTool
clean_transcript_tool = StructuredTool(
    name="CleanTranscript",
    func=clean_transcript,
    description="Cleans up a transcript using GPT-4o while preserving timestamps for the remaining words and adding pauses where words were removed.",
    args_schema=CleanTranscriptSchema
)


class GenerateAIVoiceSchema(BaseModel):
    cleaned_transcript_path: str
    output_audio_path: str = None
    slow_down: bool = True  # Option to slow down the audio

def generate_ai_voice_elevenlabs(
    cleaned_transcript_path: str, 
    output_audio_path: str = None,
    slow_down: bool = True
    ) -> str:
    
    """Generates AI voice from the cleaned transcript JSON using ElevenLabs."""
    ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")  # Load API key from env

    if not ELEVENLABS_API_KEY:
        raise ValueError("ElevenLabs API key is missing. Set it in the .env file.")

    # Load cleaned transcript JSON
    with open(cleaned_transcript_path, "r") as file:
        transcript_data = json.load(file)

    # Ensure "text" field exists
    if "text" not in transcript_data:
        raise ValueError("Invalid cleaned transcript: Missing 'text' field.")

    text = transcript_data["text"]  # Extract cleaned text
    print(f"✅ Transcript passed to ElevenLabs: {text}")

    # Ensure the output path is set and within the same folder as the cleaned transcript
    if output_audio_path is None:
        filename = os.path.basename(cleaned_transcript_path).replace("_cleaned_transcript.json", "_ai.wav")
        output_audio_path = os.path.join(os.path.dirname(cleaned_transcript_path), filename)

    # ElevenLabs API settings
    voice_id = "pMsXgVXv3BLzUgSXRplE"  # ElevenLabs VOICE ID !!!! You must provide your desired voice ID here !!!!
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"

    headers = {
        "xi-api-key": ELEVENLABS_API_KEY,
        "Content-Type": "application/json"
    }

    payload = {
        "text": text,
        "model_id": "eleven_multilingual_v2", # eleven_multilingual_v2, eleven_mlingual_v1
        "voice_settings": {
            "stability": 0.9,
            "similarity_boost": 0.9
        }
    }

    # Make API request
    response = requests.post(url, headers=headers, json=payload)
    response.raise_for_status()  # Raise error if request fails

    # Save the generated AI audio file
    with open(output_audio_path, "wb") as audio_file:
        audio_file.write(response.content)

    print(f"✅ AI voice successfully generated: {output_audio_path}")

    # If slow_down is enabled, override the file with slowed-down audio
    if slow_down:
        print(f"✅ Slowing down the audio!")
        slow_down_audio(output_audio_path)

    return output_audio_path
    

def slow_down_audio(audio_path: str, slowdown_factor: float = 0.90):
    """Slows down the AI-generated voice recording while preserving pitch."""
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    temp_output_path = audio_path.replace(".wav", "_temp.wav")

    # Construct FFmpeg command
    command = [
        "ffmpeg", "-i", audio_path,
        "-filter:a", f"atempo={slowdown_factor}",
        "-y", temp_output_path
    ]

    # Execute FFmpeg
    subprocess.run(command, check=True)

    # Replace original file with slowed-down version
    os.replace(temp_output_path, audio_path)
    print(f"✅ AI voice successfully slowed down: {audio_path}")

# Register as a StructuredTool
ai_voice_tool = StructuredTool(
    name="GenerateAIVoice",
    func=generate_ai_voice_elevenlabs,
    description="Generates AI voice from the cleaned transcript using ElevenLabs, with optional slowdown feature.",
    args_schema=GenerateAIVoiceSchema
)


def remove_old_voice(video_path: str, output_video_path: str = None) -> str:
    """Removes all audio from the original video using FFmpeg."""
    
    # Ensure the function does not create endless `_cleaned.mov` files
    if "_cleaned" in video_path:
        print(f"⚠️ Skipping already cleaned video: {video_path}")
        return video_path  # Return the same path if it's already cleaned

    if output_video_path is None:
        base_name = os.path.splitext(video_path)[0]
        output_video_path = f"{base_name}_cleaned.mov"

    print(f"🧹 Removing all audio from {video_path} -> {output_video_path}")

    # FFmpeg command to remove all audio
    result = subprocess.run([
        "ffmpeg", "-i", video_path, "-c:v", "copy", "-an", "-y", output_video_path
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if result.returncode != 0:
        print(f"❌ FFmpeg error while removing audio: {result.stderr}")
        return None

    if not os.path.exists(output_video_path):
        print(f"❌ Error: Cleaned video {output_video_path} was not created.")
        return None

    print(f"✅ Successfully created cleaned video: {output_video_path}")
    return output_video_path


# Register the tool in LangGraph
remove_old_voice_tool = Tool(
    name="RemoveOldVoice",
    func=remove_old_voice,
    description="Removes all audio from a video file using FFmpeg, creating a silent version."
)


def merge_audio(video_path: str, ai_voice_path: str, output_video_path: str = None):
    """
    Merges the new AI-generated voice into a cleaned version of the video,
    WITHOUT applying any timing adjustments to the AI voice.
    If the passed video_path does not end with "_cleaned.mov", it appends "_cleaned.mov".
    """

    # If user passes e.g. "myvideo_cleaned.mov", use that directly.
    # Otherwise, assume the cleaned version is "myvideo_cleaned.mov".
    if video_path.endswith("_cleaned.mov"):
        cleaned_video_path = video_path
        # Remove "_cleaned" to get the base for naming final output if needed
        base_name = os.path.splitext(video_path)[0].replace("_cleaned", "")
    else:
        base_name = os.path.splitext(video_path)[0]
        cleaned_video_path = f"{base_name}_cleaned.mov"

    # Ensure the cleaned video exists
    if not os.path.exists(cleaned_video_path):
        print(f"❌ Error: Cleaned video file {cleaned_video_path} does not exist.")
        return None

    # If user didn't supply output, create it as "myvideo_final.mov" or similar
    if output_video_path is None:
        output_video_path = f"{base_name}_final.mov"

    print(f"🎬 Merging (no timing adjust) video: {cleaned_video_path} + AI voice: {ai_voice_path}")

    # FFmpeg command: Replace the cleaned video audio track with the AI-generated voice
    result = subprocess.run([
        "ffmpeg", "-i", cleaned_video_path, "-i", ai_voice_path,
        "-c:v", "copy", "-c:a", "aac",
        "-map", "0:v:0", "-map", "1:a:0",
        "-movflags", "faststart", "-y", output_video_path
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if result.returncode != 0:
        print(f"❌ FFmpeg error while merging audio and video: {result.stderr}")
        return None

    print(f"✅ Successfully created final video: {output_video_path}")
    return output_video_path


# Registering MergeAudio as a StructuredTool
class MergeAudioSchema(BaseModel):
    video_path: str
    ai_voice_path: str
    output_video_path: str = None  # Optional parameter

merge_audio_tool = StructuredTool(
    name="MergeAudio",
    func=merge_audio,
    description="Merges the new AI-generated voice into the original .mov video using FFmpeg.",
    args_schema=MergeAudioSchema
)

# Tools for LangGraph
tools = [
    extract_audio_tool,
    transcribe_audio_tool,
    clean_transcript_tool,
    ai_voice_tool,
    remove_old_voice_tool,
    merge_audio_tool
]

# System Prompt for LangGraph
system_prompt = """You are an AI assistant that processes videos by replacing their original voice with an AI-generated voice. 
Your workflow includes: 
extracting audio from the video, 
transcribing it with timestamps, 
cleaning the transcript to remove filler words, 
generating a new AI voice from the cleaned text, 
and merging it back into the video while removing the old voice. 
Ensure high accuracy and natural-sounding output.

**IMPORTANT:**  
- `RemoveOldVoice` **must always run before** `MergeAudio`.  
- `TranscribeAudio` should only run **after** `ExtractAudio` has successfully completed.  
- Do not attempt to merge the AI voice until `RemoveOldVoice` has successfully produced a cleaned video.  
"""

# Create LangGraph Agent
graph = create_react_agent(
    model=ChatOpenAI(model="gpt-4o"),
    tools=tools,
    prompt=system_prompt,
    debug=False
)

# Process Video Function
def process_video(video_path):
    """Runs the complete AI workflow for voice replacement."""
    
    inputs = {"messages": [("user", f"Help me record a clean voice over for my {video_path} video and make sure to merge the clean AI voice back in the video.")]}
    
    for step in graph.stream(inputs, stream_mode="steps"):
        message = step["messages"][-1]
        if isinstance(message, tuple):
            print(message)
        else:
            message.pretty_print()

# Example Usage
process_video("downloads/CleanVoice.mov")


Extracting audio from downloads/CleanVoice.mov to downloads/CleanVoice_audio.wav


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.4)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/7.1_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enab

✅ Audio extraction successful: downloads/CleanVoice_audio.wav


  Expected `str` but got `float` with value `262.57000732421875` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


✅ Transcript passed to LLM for cleaning: Hi This video showcases an AI agent designed to produce a clean voiceover by replacing the original recorded voice with an AI generated one The agent is built on the LANGRAF framework and operates autonomously using six tools You can see the tools here ExtractAudioTool is using FFmpeg TranscribeAudioTool uses WhisperAI CleanTranscriptTool is using OpenAI's GPT 4 0 SOR language model AIVoiceTool uses LevelLabs where I have cloned my voice And the AIVoiceTool obviously generates the AI voice RemoveAudioVoiceTool is again utilizing FFmpeg And finally UnmergeTool which is using again FFmpeg Since it's a reasoning action agent it decides on its own the sequence of two executions in order to complete the task efficiently So when it comes to building the agent the steps go like this First I've decided on which tools to use and have created the tools definition Everything from the beginning is simply tools definition extract audio The tools are defined 

ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.4)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/7.1_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enab

✅ AI voice successfully slowed down: downloads/CleanVoice_AI_audio.wav
🧹 Removing all audio from downloads/CleanVoice.mov -> downloads/CleanVoice_cleaned.mov
✅ Successfully created cleaned video: downloads/CleanVoice_cleaned.mov
🎬 Merging (no timing adjust) video: downloads/CleanVoice_cleaned.mov + AI voice: downloads/CleanVoice_AI_audio.wav
✅ Successfully created final video: downloads/CleanVoice_final.mov
