# Load and analyze video file

In [2]:
%pip install ffmpeg-python openai --quiet

68.78s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting future (from ffmpeg-python)
  Downloading future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Downloading future-1.0.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.3/491.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: future, ffmpeg-python
Successfully installed ffmpeg-python-0.2.0 future-1.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import ffmpeg
import math
from openai import OpenAI

class Redubber:
    """Helpers for re-dubbing a video: extract audio, translate, synthesize, reassemble.

    The visible workflow is:

    1. ``extract_audio_chunks`` splits a video's audio track into fixed-length files.
    2. ``transcribe_audio`` sends one chunk to the OpenAI audio *translations*
       endpoint and returns the text plus timed segments.
    3. ``write_srt`` writes the segments as an SRT subtitle file.
    4. ``tts_segments`` synthesizes speech for each segment.
    5. ``assemble_audio`` places every synthesized clip at its start time and
       mixes them into one track of a given duration.
    """

    # File extensions (lower-case, leading dot) that can_redub accepts.
    supported_video_formats = [
        ".mp4", ".mkv", ".avi", ".mov", ".flv", ".wmv", ".webm", ".vob", ".m4v",
        ".3gp", ".3g2", ".m2ts", ".mts", ".ts", ".f4v", ".f4p", ".f4a", ".f4b",
        ".m2v", ".m1v", ".mpg", ".mpeg", ".mpv", ".mp2", ".mpe", ".m2p", ".m2t",
        ".mp2v", ".mpv2",
    ]
    # Directory under which extracted audio chunks are written.
    tmp = "redubber_tmp"
    # Extension of the extracted audio chunk files.
    audio_ext = ".mp3"
    # Not referenced by any method of this class.
    model="gpt-4o"
    # API key handed to the OpenAI client; defaults to the OPENAI_API_KEY
    # environment variable so the key never has to be written into the notebook.
    openai_token = os.environ.get("OPENAI_API_KEY", "")
    default_audio_chunk_duration = 20*60 # 20 minutes

    # Per-segment fields removed from the transcription result.
    _dropped_segment_keys = ('id', 'tokens', 'seek', 'temperature', 'avg_logprob', 'compression_ratio', 'no_speech_prob')

    def _client(self):
        """Build an OpenAI client from ``openai_token``.

        An empty token is passed as ``None`` so the SDK can apply its own
        key lookup instead of authenticating with an empty string.
        """
        return OpenAI(api_key=self.openai_token or None)

    def can_redub(self, source):
        """Return True when ``source`` has a supported video extension.

        The comparison is case-insensitive, so ``CLIP.MP4`` is accepted.
        """
        extension = os.path.splitext(source)[1].lower()
        return extension in self.supported_video_formats

    def get_media_duration(self, file_path) -> float:
        """Return the duration in seconds reported by ``ffmpeg.probe`` for ``file_path``."""
        return float(ffmpeg.probe(file_path)['format']['duration'])

    def seconds_to_hms(self, seconds):
        """Format a number of seconds as ``HH:MM:SS`` (fractions are truncated)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        whole_seconds = int(seconds % 60)
        return f"{hours:02}:{minutes:02}:{whole_seconds:02}"

    def extract_audio_chunks(self, root, file_path, chunk_duration = default_audio_chunk_duration) -> list[str]:
        """Split the audio of ``file_path`` into consecutive chunk files.

        Chunks are written to ``<tmp>/<file_path relative to root>/`` and named
        ``<video basename>_NNN<audio_ext>`` with ``NNN`` starting at 001.

        Args:
            root: Directory that ``file_path`` is made relative to.
            file_path: Path of the source video.
            chunk_duration: Length of each chunk in seconds.

        Returns:
            The chunk file paths, in playback order.
        """
        directory_from_source = os.path.relpath(file_path, root)
        print(f"Extracting audio from {directory_from_source}")
        target_rel_dir = os.path.join(self.tmp, directory_from_source)
        total_duration = self.get_media_duration(file_path)
        print(f"Video duration {self.seconds_to_hms(total_duration)}")
        num_chunks = math.ceil(total_duration / chunk_duration)
        print(f"Extracting {num_chunks} chunks of {self.seconds_to_hms(chunk_duration)} each")

        audio_file_template = os.path.splitext(os.path.basename(file_path))[0] + "_{:03d}" + self.audio_ext

        # Remove audio files left over from a previous run. The walk variable
        # has its own name so the ``root`` argument is not overwritten.
        for walk_root, _dirs, files in os.walk(target_rel_dir):
            for file in files:
                if file.endswith(self.audio_ext):
                    os.remove(os.path.join(walk_root, file))

        # The output directory is the same for every chunk; create it once.
        os.makedirs(target_rel_dir, exist_ok=True)

        total_audio_duration = 0
        result = []
        for i in range(num_chunks):
            start_time = i * chunk_duration
            output_audio_path = audio_file_template.format(i+1)  # Naming each chunk
            audio_path = os.path.join(target_rel_dir, output_audio_path)
            stream = ffmpeg.input(file_path, ss=start_time, t=chunk_duration)
            # vn=None is emitted as the bare "-vn" flag (drop the video stream).
            stream = ffmpeg.output(stream, audio_path, loglevel="quiet", vn=None)
            ffmpeg.run(stream)

            print(f'Extracted chunk {i+1}: {output_audio_path}')
            result.append(audio_path)
            total_audio_duration += self.get_media_duration(audio_path)

        print(f"Audio duration {self.seconds_to_hms(total_audio_duration)}")

        return result

    def transcribe_audio(self, file_path, time_offset=0):
        """Translate one audio file to English text with timed segments.

        Args:
            file_path: Audio file to upload.
            time_offset: Seconds added to every segment's ``start`` and ``end``,
                e.g. the position of this chunk inside the full recording.

        Returns:
            ``(text, segments)`` where ``segments`` is a list of dicts with the
            bookkeeping fields in ``_dropped_segment_keys`` removed.
        """
        client = self._client()
        # https://platform.openai.com/docs/api-reference/audio/verbose-json-object
        with open(file_path, "rb") as audio_file:
            # Translate the audio using the Whisper API
            transcript = client.audio.translations.create(model="whisper-1", file=audio_file, response_format='verbose_json')

        segments = []
        for raw_segment in transcript.segments or []:
            # NOTE(review): depending on the installed SDK release, segments
            # arrive either as plain dicts or as model objects; normalise to
            # dicts so the item access below works for both. Confirm against
            # the SDK version in use.
            segment = raw_segment if isinstance(raw_segment, dict) else raw_segment.model_dump()
            segment['start'] += time_offset
            segment['end'] += time_offset
            for key in self._dropped_segment_keys:
                segment.pop(key, None)
            segments.append(segment)

        return transcript.text, segments

    def time_to_srt_format(self, seconds):
        """Format a number of seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
        # Work in whole milliseconds so the fractional part survives; taking
        # int() of the seconds first would always yield ",000".
        total_milliseconds = int(round(seconds * 1000))
        total_seconds, milliseconds = divmod(total_milliseconds, 1000)
        total_minutes, whole_seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(total_minutes, 60)
        return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"

    def write_srt(self, segments, output_file):
        """Write ``segments`` to ``output_file`` as numbered SRT cues.

        Each segment must provide ``'start'``, ``'end'`` (seconds) and ``'text'``.
        """
        # Explicit UTF-8 so non-ASCII subtitle text does not depend on the
        # platform's default encoding.
        with open(output_file, 'w', encoding='utf-8') as srt_file:
            for i, segment in enumerate(segments):
                # Convert time to SRT time format
                start_time_str = self.time_to_srt_format(segment['start'])
                end_time_str = self.time_to_srt_format(segment['end'])
                text = segment['text'].strip()

                # Write to the file
                srt_file.write(f"{i + 1}\n")
                srt_file.write(f"{start_time_str} --> {end_time_str}\n")
                srt_file.write(f"{text}\n\n")

    def tts(self, text, output_file):
        """Synthesize ``text`` with the ``tts-1`` model and stream it to ``output_file``."""
        client = self._client()
        with client.audio.speech.with_streaming_response.create(
                    model="tts-1",
                    voice="nova",
                    input=text,
                ) as response:
            response.stream_to_file(output_file)

    def tts_segments(self, segments, output_dir):
        """Synthesize one clip per segment into ``output_dir``.

        Returns:
            Dict mapping each segment's ``start`` time to its clip file name
            (``NNN.en.mp3``, relative to ``output_dir``). Segments sharing the
            same start time overwrite each other's entry.
        """
        os.makedirs(output_dir, exist_ok=True)
        result = {}
        for i, segment in enumerate(segments):
            file_name = f"{i:03d}.en.mp3"
            self.tts(segment['text'], os.path.join(output_dir, file_name))
            result[segment['start']] = file_name
        return result

    def assemble_audio(self, audio_dict, dir, output_file, duration):
        """Mix the clips of ``audio_dict`` into one track of length ``duration``.

        Args:
            audio_dict: Mapping ``start time in seconds -> clip file name``.
            dir: Directory containing the clip files.
            output_file: Path of the MP3 to write (overwritten if present).
            duration: Length of the resulting track in seconds.

        Raises:
            ValueError: If ``audio_dict`` is empty, since there is nothing to mix.
        """
        if not audio_dict:
            raise ValueError("audio_dict is empty: no audio clips to assemble")

        input_streams = []
        for start_time, input_file in sorted(audio_dict.items()):
            delay_ms = int(start_time*1000)
            stream = ffmpeg.input(os.path.join(dir, input_file))
            # The same delay is given for two channels.
            stream = stream.filter('adelay', f'{delay_ms}|{delay_ms}')
            input_streams.append(stream)

        # Combine all input streams using amix
        combined = ffmpeg.filter(input_streams, 'amix', inputs=len(input_streams), normalize=1)

        # Boost by the number of inputs.
        # NOTE(review): presumably this compensates for amix's normalisation;
        # verify the resulting loudness and clipping on real material.
        combined = combined.filter('volume', len(input_streams))

        # Cut anything beyond `duration`, then pad with silence up to `duration`
        combined = combined.filter('atrim', end=duration).filter('apad', whole_dur=duration)

        out = ffmpeg.output(combined, output_file, acodec='libmp3lame', loglevel="quiet", audio_bitrate='320k', ar='44100')
        out = out.global_args('-y')  # Add the overwrite option

        ffmpeg.run(out)



In [None]:
# Testing with samples

# stream = ffmpeg.input("redubber_tmp/SECTION 06. Create an attractive character face/25. Retopology.mp4/25. Retopology_001.mp3", t=30)
# stream = ffmpeg.output(stream, "redubber_tmp/SECTION 06. Create an attractive character face/25. Retopology.mp4/25. Retopology_000_sample.mp3", loglevel="quiet", vn=None)
# ffmpeg.run(stream)

# sample = "redubber_tmp/SECTION 06. Create an attractive character face/25. Retopology.mp4/25. Retopology_000_sample.mp3"
# text,segments = redubber.transcribe_audio(sample)
# res = redubber.tts_segments(segments, "redubber_tmp/SECTION 06. Create an attractive character face/25. Retopology.mp4")
# print(res)
# res = {0.0: '000.en.mp3', 13.84000015258789: '001.en.mp3', 19.600000381469727: '002.en.mp3', 24.559999465942383: '003.en.mp3', 28.31999969482422: '004.en.mp3'}
# redubber.assemble_audio(res, "redubber_tmp/SECTION 06. Create an attractive character face/25. Retopology.mp4", "redubber_tmp/SECTION 06. Create an attractive character face/25. Retopology.mp4/25. Retopology_000_sample.en.mp3", 60)

In [None]:
# Redub videos found under `source`, writing subtitles and dubbed audio to `target`.
source = "src"   # directory tree scanned for video files
target = "dest"  # directory receiving the .en.srt and .en.mp3 outputs

redubber = Redubber()
# Chunk length used for extraction; also needed to place each chunk's
# segments on the timeline of the full recording.
chunk_duration = redubber.default_audio_chunk_duration

for root, _dirs, files in os.walk(source):
    for file in files:
        src_file = os.path.join(root, file)
        # The '25' substring filter and the `break` at the end restrict the run
        # to the first matching video per directory.
        if not (redubber.can_redub(src_file) and '25' in src_file):
            continue

        print(src_file)
        audio_files = redubber.extract_audio_chunks(source, src_file, chunk_duration)

        all_segments = []
        for chunk_index, audio_file in enumerate(audio_files):
            # Segment times come back relative to the start of the chunk, so
            # shift them by the chunk's position in the full recording.
            _text, segments = redubber.transcribe_audio(audio_file, time_offset=chunk_index * chunk_duration)
            all_segments.extend(segments)

        base_name = os.path.splitext(os.path.basename(src_file))[0]
        segments_dir = os.path.join(target, base_name)
        # Creates `target` as well, so the subtitle and audio writes below succeed.
        os.makedirs(segments_dir, exist_ok=True)

        redubber.write_srt(all_segments, os.path.join(target, base_name + ".en.srt"))
        # assemble_audio needs the {start time: clip file name} mapping that
        # tts_segments returns, not the raw segment list.
        audio_dict = redubber.tts_segments(all_segments, segments_dir)
        redubbed_audio_path = os.path.join(target, base_name + ".en.mp3")
        redubber.assemble_audio(audio_dict, segments_dir, redubbed_audio_path, redubber.get_media_duration(src_file))
        # mix audio with video and save to target
        # copy subs to target
        break