<a href="https://colab.research.google.com/github/jimchen2/util-scripts/blob/master/Download_Subtitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Set Up environment**

In [1]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [2]:
!apt install nvidia-cuda-toolkit
!pip install -U openai-whisper argostranslate yt-dlp ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
nvidia-cuda-toolkit is already the newest version (11.5.1-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.


In [3]:
import argostranslate.package
import argostranslate.translate
import datetime
import os
import threading
from queue import Queue
import subprocess
import whisper

## **Set Up Language**

In [4]:
from_code = "ru"
to_code = "en"

# Download and install Argos Translate package
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()
package_to_install = next(
    filter(
        lambda x: x.from_code == from_code and x.to_code == to_code, available_packages
    )
)
argostranslate.package.install_from_path(package_to_install.download())

## **Preparation Finished start Coding Below**

In [5]:
def download_video(url, output_path, num_threads='10000'):
    command = [
        "yt-dlp",
        "-N", num_threads,
        "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",  # Force .mp4 format
        url,
        "-o", output_path
    ]
    subprocess.run(command)
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    for line in process.stdout:
        print(line.decode().strip())

In [6]:
def transcribe_audio(input_file_path):
    # Define the command as a list of arguments
    command = [
        "whisper",
        "--language", "ru",
        "--output_format", "vtt",
        "--max_words_per_line", "8",
        "--word_timestamps", "True",
        input_file_path
    ]

    # Use subprocess.Popen to execute the command and capture stdout and stderr
    with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) as process:
        # Read and print stdout line by line
        for line in process.stdout:
            print(line.strip())
        # Check if there are errors and print them
        for line in process.stderr:
            print("Error:", line.strip())

In [7]:
def translate_word(word):
    try:
        translation = argostranslate.translate.translate(word, "ru", "en")
        return translation
    except Exception as e:
        print(f"Error during translation of '{word}': {e}")
        return word

def translate_line(line, index, output_queue):
    # Handle lines that do not require translation directly
    if '-->' in line or line.strip().isdigit() or not line.strip():
        output_queue.put((index, line))
    else:
        # Translate the line word by word and combine
        words = line.split()
        translated_words = [translate_word(word) for word in words]
        translated_line = ' '.join(translated_words)
        # Format combined line with original and translated text
        combined_line = line.strip() + '\n'+ translated_line + '\n'
        output_queue.put((index, combined_line))

def process_lines(lines):
    output_queue = Queue()
    threads = []

    for index, line in enumerate(lines):
        thread = threading.Thread(target=translate_line, args=(line, index, output_queue))
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    # Collect and sort the results based on the original line order
    translated_lines = [None] * len(lines)
    while not output_queue.empty():
        index, translation = output_queue.get()
        translated_lines[index] = translation

    return translated_lines

def process_vtt_in_chunks(filepath, newfilepath, chunk_size=50):
    base, ext = os.path.splitext(filepath)
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    with open(newfilepath, 'w', encoding='utf-8') as new_file:
        for i in range(0, len(lines), chunk_size):
            chunk = lines[i:i+chunk_size]
            processed_lines = process_lines(chunk)
            new_file.writelines(processed_lines)


In [8]:
def adjust_timing(lines):
    for i in range(len(lines)-1):
        if '-->' in lines[i] and '-->' in lines[i + 1]:
            current_line_timing, current_line_text = lines[i].split('\n', 1)
            start_time_next_line, _ = lines[i + 1].split('\n', 1)[0].split('-->', 1)
            new_current_line = current_line_timing.split('-->', 1)[0] + '--> ' + start_time_next_line + '\n' + current_line_text
            lines[i] = new_current_line
    return lines

def adjust_vtt_timing(input_filepath, output_filepath):
    with open(input_filepath, 'r', encoding='utf-8') as input_file:
        content = input_file.read()
    lines = content.split('\n\n')
    new_lines = adjust_timing(lines)
    new_content = '\n\n'.join(new_lines)
    with open(output_filepath, 'w', encoding='utf-8') as output_file:
        output_file.write(new_content)

# **Coding Finished Now run it**

In [13]:
def download(url):
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    mp4_output = f"{timestamp}.mp4"
    vtt_output = f"{timestamp}.vtt"
    toadjust_output = f"toadjust_{timestamp}.vtt"
    adjusted_output = f"new_{timestamp}.vtt"
    output_video_path = f"final_{timestamp}.mp4"

    ffmpeg_command = [
        "ffmpeg",
        "-i", mp4_output,
        "-i", adjusted_output,
        "-c", "copy",
        "-c:s", "mov_text",
        output_video_path
    ]
    download_video(url,mp4_output)
    transcribe_audio(mp4_output)
    process_vtt_in_chunks(vtt_output,toadjust_output)
    adjust_vtt_timing(toadjust_output,adjusted_output)
    subprocess.run(ffmpeg_command)
    !cp "$output_video_path" "/content/gdrive/MyDrive/Videos/"


In [14]:
download("https://v8-dtln.1internet.tv/video/multibitrate/video/2024/02/08/f07350ed-9645-4539-a884-86c39d2d0b30_HD-news-2024_02_08-16_37_16_,350,950,3800,8000,.mp4.urlset/master.m3u8")

[generic] Extracting URL: https://v8-dtln.1internet.tv/video/multibitrate/video/2024/02/08/f07350ed-9645-4539-a884-86c39d2d...4.urlset/master.m3u8
[generic] master: Downloading webpage
[generic] master: Downloading m3u8 information
[generic] master: Checking m3u8 live status
[info] master: Downloading 1 format(s): 8136
[download] 20240210092827.mp4 has already been downloaded
[download] 100% of  377.75MiB
[00:00.000 --> 00:03.520]  Анастасия Чернышева, Владислав Вильчик.
[00:04.940 --> 00:07.460]  Ну что же, вот и время лидеров пришло
[00:07.460 --> 00:10.140]  продемонстрировать свою произвольную программу
[00:10.140 --> 00:13.960]  на льду заключительный соревновательный номер этого турнира.
[00:14.420 --> 00:18.640]  Анастасия Чернышева и Владислав Вильчик со своей произвольной программой,
[00:18.720 --> 00:19.880]  как и в танцах на льду.
[00:20.500 --> 00:23.940]  Заключительное музыкальное произведение — это призрак оперы.
[00:30.000 --> 00:41.240]  ИНТРИГУЮЩАЯ МУЗЫКА
[00:41.240 