# **Set Up Environment**

In [1]:
# Uncomment and run once in a fresh environment to install the dependencies.
# !apt install -y nvidia-cuda-toolkit
# !pip install -U sentencepiece
# !pip install -U argostranslate yt-dlp ffmpeg
# !pip install openai-whisper

In [2]:
import argostranslate.package
import argostranslate.translate
import datetime
import os
import math
import threading
from queue import Queue
import subprocess
import whisper
from moviepy.editor import VideoFileClip, TextClip, concatenate_videoclips
from moviepy.video.io.ffmpeg_writer import FFMPEG_VideoWriter

## **Set Up Language**

In [3]:
# Source and target language codes for the translation package installed below.
from_code = "ru"
to_code = "en"

# Refresh the Argos Translate package index, then pick the package that
# translates from `from_code` to `to_code`.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()
package_to_install = next(
    (
        package
        for package in available_packages
        if package.from_code == from_code and package.to_code == to_code
    ),
    None,
)
if package_to_install is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise LookupError(
        f"No Argos Translate package found for {from_code!r} -> {to_code!r}"
    )
argostranslate.package.install_from_path(package_to_install.download())

## **Preparation Finished — Start Coding Below**

In [4]:
def download_video(url, output_path, num_threads='10000'):
    """Download ``url`` with yt-dlp and store the result at ``output_path``.

    The command is run as an argument list (no shell), so characters in the
    URL or in the format selector are never interpreted by a shell.

    Args:
        url: Address of the video page to download.
        output_path: Output file name handed to yt-dlp's ``-o`` option.
        num_threads: Accepted for backward compatibility only; it is not used.
            yt-dlp is always started with ``-N 10``.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
    """
    command = [
        "yt-dlp",
        "-N", "10",
        "-f", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        "-o", output_path,
        "--",  # everything after this is a URL, even if it starts with "-"
        url,
    ]
    with subprocess.Popen(
        command,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        encoding="utf-8",
        errors="replace",
    ) as process:
        # Echo the tool's output line by line so progress stays visible in the cell.
        for output_line in process.stdout:
            print(output_line, end="")
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)

In [5]:
def transcribe_audio(input_file_path, language="ru"):
    """Transcribe ``input_file_path`` to WebVTT with the ``whisper`` command-line tool.

    whisper is asked for VTT output with word timestamps and at most eight
    words per subtitle line. The command is run as an argument list (no
    shell), so paths containing spaces or shell metacharacters are passed
    through intact.

    NOTE(review): the caller in this notebook expects the transcript to be
    written next to the working directory as ``<input name without
    extension>.vtt`` — confirm against the installed whisper version.

    Args:
        input_file_path: Media file to transcribe.
        language: Spoken-language code passed to ``--language``
            (defaults to ``"ru"``, the value that used to be hard-coded).

    Raises:
        subprocess.CalledProcessError: If whisper exits with a non-zero status.
    """
    command = [
        "whisper",
        "--language", language,
        "--output_format", "vtt",
        "--max_words_per_line", "8",
        "--word_timestamps", "True",
        input_file_path,
    ]
    with subprocess.Popen(
        command,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        encoding="utf-8",
        errors="replace",
    ) as process:
        # Echo the tool's output line by line so progress stays visible in the cell.
        for output_line in process.stdout:
            print(output_line, end="")
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)

In [6]:
def translate_word(word):
    """Translate a single word with Argos Translate, falling back to the input.

    The language pair is taken from the notebook-level ``from_code`` and
    ``to_code`` settings, so it always matches the package installed above.

    Args:
        word: The text fragment to translate.

    Returns:
        The translated text, or ``word`` unchanged if translation raised.
    """
    try:
        return argostranslate.translate.translate(word, from_code, to_code)
    except Exception:
        # Deliberate best effort: one untranslatable word must not abort the
        # whole subtitle file, so the original word is kept instead.
        return word

def translate_line(line, index, output_queue):
    """Translate one subtitle-file line and put ``(index, result)`` on ``output_queue``.

    Structural lines are forwarded unchanged: blank lines, cue timings
    (anything containing ``-->``), purely numeric lines and the ``WEBVTT``
    file header. Every other line is translated word by word and emitted as
    the stripped original, a newline, the translation and a final newline.

    Args:
        line: Raw line as read from the subtitle file (may end with a newline).
        index: Position of the line within its chunk; used by the consumer to
            restore the original order.
        output_queue: Queue that receives the ``(index, text)`` tuple.
    """
    stripped = line.strip()
    is_structural = (
        not stripped
        or '-->' in line
        or stripped.isdigit()
        # The file header is markup, not speech; a leading BOM is tolerated.
        or stripped.lstrip('\ufeff').startswith('WEBVTT')
    )
    if is_structural:
        output_queue.put((index, line))
        return

    translated_line = ' '.join(translate_word(word) for word in line.split())
    output_queue.put((index, stripped + '\n' + translated_line + '\n'))

def process_lines(lines):
    """Run ``translate_line`` on every entry of ``lines``, one thread per line.

    Each worker reports ``(position, text)`` through a shared queue; once all
    workers have finished, the results are placed back at their original
    positions.

    Args:
        lines: The lines to process.

    Returns:
        A list as long as ``lines`` holding the processed text of each line in
        input order (an entry stays ``None`` if its worker reported nothing).
    """
    results = Queue()
    workers = []

    for position, text in enumerate(lines):
        worker = threading.Thread(
            target=translate_line, args=(text, position, results)
        )
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    # All workers are done, so the queue size is final; drain it into slots
    # keyed by the position each worker was given.
    ordered = [None] * len(lines)
    for _ in range(results.qsize()):
        position, processed = results.get()
        ordered[position] = processed

    return ordered

def process_vtt_in_chunks(filepath, newfilepath, chunk_size=50):
    """Translate the subtitle file at ``filepath`` and write it to ``newfilepath``.

    The file is read completely, then handed to ``process_lines`` in slices of
    ``chunk_size`` lines. Because ``process_lines`` starts one thread per
    line, ``chunk_size`` is also the maximum number of threads alive at once.

    Args:
        filepath: UTF-8 subtitle file to read.
        newfilepath: UTF-8 file to create or overwrite with the result.
        chunk_size: Number of lines processed per batch; must be at least 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (a negative value
            would otherwise produce an empty output file without any error).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    with open(newfilepath, 'w', encoding='utf-8') as new_file:
        for start in range(0, len(lines), chunk_size):
            new_file.writelines(process_lines(lines[start:start + chunk_size]))


In [7]:
def adjust_timing(lines):
    """Stretch each cue so that it ends exactly when the following cue starts.

    ``lines`` holds the blank-line-separated blocks of a VTT file. For every
    pair of neighbouring blocks whose first line is a cue timing
    (``start --> end``), the end time of the first block is replaced by the
    start time of the second. Blocks whose first line is not a timing (file
    header, text that merely contains ``-->``) are left untouched, and a cue
    without any text line is handled as well.

    Args:
        lines: List of VTT blocks; it is modified in place.

    Returns:
        The same list object, for convenience.
    """
    for i in range(len(lines) - 1):
        current_block, next_block = lines[i], lines[i + 1]
        if '-->' not in current_block or '-->' not in next_block:
            continue

        # partition() copes with a block that has no text after the timing line.
        current_timing, newline, current_text = current_block.partition('\n')
        next_timing = next_block.partition('\n')[0]
        if '-->' not in current_timing or '-->' not in next_timing:
            # The arrow is somewhere in the cue text, not in a timing line.
            continue

        start_time = current_timing.split('-->', 1)[0].strip()
        next_start_time = next_timing.split('-->', 1)[0].strip()
        lines[i] = f"{start_time} --> {next_start_time}{newline}{current_text}"
    return lines

def adjust_vtt_timing(input_filepath, output_filepath):
    """Re-time the cues of a VTT file and save the result under a new name.

    The input is split into blocks at blank lines (``'\\n\\n'``), the blocks
    are passed through ``adjust_timing``, and the returned blocks are joined
    with blank lines again before being written out.

    Args:
        input_filepath: UTF-8 VTT file to read.
        output_filepath: UTF-8 file to create or overwrite.
    """
    with open(input_filepath, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    retimed_blocks = adjust_timing(raw_text.split('\n\n'))

    with open(output_filepath, 'w', encoding='utf-8') as target:
        target.write('\n\n'.join(retimed_blocks))

# **Coding Finished — Now Specify the Inputs**

In [8]:
def download(url):
    """Run the whole subtitle pipeline for one video.

    Steps, in order: download the video, transcribe it to VTT, add the
    translation to every subtitle line, re-time the cues, burn the subtitles
    into a new video with ffmpeg, upload that video with ``dropbox-uploader``
    and finally delete the files this run created.

    All file names are derived from a timestamp taken at call time, and
    cleanup removes only files in the working directory whose name contains
    that timestamp. If any step fails, the exception propagates and the files
    produced so far are left in place for inspection.

    Args:
        url: Address of the video to process.

    Raises:
        subprocess.CalledProcessError: If ffmpeg or the upload command exits
            with a non-zero status.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    mp4_output = f"{timestamp}.mp4"
    vtt_output = f"{timestamp}.vtt"
    toadjust_output = f"toadjust_{timestamp}.vtt"
    adjusted_output = f"new_{timestamp}.vtt"
    output_video_path = f"final_{timestamp}.mp4"

    def run(command):
        """Run ``command`` without a shell, echo its output, raise on failure."""
        with subprocess.Popen(
            command,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding="utf-8",
            errors="replace",
        ) as process:
            for output_line in process.stdout:
                print(output_line, end="")
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, command)

    download_video(url, mp4_output)
    transcribe_audio(mp4_output)
    process_vtt_in_chunks(vtt_output, toadjust_output)
    adjust_vtt_timing(toadjust_output, adjusted_output)

    subtitle_filter = (
        f"subtitles={adjusted_output}"
        ":force_style='Fontname=Roboto,OutlineColour=&H40000000,BorderStyle=3'"
    )
    run([
        "/usr/local/bin/ffmpeg",
        "-i", mp4_output,
        "-vf", subtitle_filter,
        "-c:a", "copy",
        output_video_path,
    ])
    run(["dropbox-uploader", "upload", output_video_path, "/"])

    # Remove only this run's files; the former `rm *2024*` matched any file
    # with "2024" in its name and nothing at all in other years.
    for file_name in os.listdir("."):
        if timestamp in file_name and os.path.isfile(file_name):
            os.remove(file_name)

In [9]:
# Videos to process: two YouTube clips, followed by the 21:00 1tv.ru news
# issue for every day from 2023-01-01 through 2023-02-28 (31 + 28 = 59 days).
urls = [
    "https://www.youtube.com/watch?v=2B42OubTUd0",
    "https://www.youtube.com/watch?v=tsbg0eiKU1I",
] + [
    "https://www.1tv.ru/news/issue/"
    + (datetime.date(2023, 1, 1) + datetime.timedelta(days=day_offset)).isoformat()
    + "/21:00#1"
    for day_offset in range(59)
]


# **RUN IT**

In [None]:
# Run the full pipeline for every entry in `urls`, one video at a time and in list order.
# NOTE(review): an exception raised inside download() is not caught here, so it
# stops the loop and the remaining URLs are not processed.
for url in urls:
    download(url)

[youtube] Extracting URL: https://www.youtube.com/watch?v=2B42OubTUd0
[youtube] 2B42OubTUd0: Downloading webpage
[youtube] 2B42OubTUd0: Downloading ios player API JSON
[youtube] 2B42OubTUd0: Downloading android player API JSON
[youtube] 2B42OubTUd0: Downloading m3u8 information
[info] 2B42OubTUd0: Downloading 1 format(s): 616+140
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 39
[download] Destination: 20240211210810.f616.mp4
[download] 100% of  104.18MiB in 00:00:01 at 101.86MiB/s (frag 40/39)
[download] Destination: 20240211210810.f140.m4a
[download] 100% of    3.27MiB in 00:00:00 at 114.92MiB/s
[Merger] Merging formats into "20240211210810.mp4"
Deleting original file 20240211210810.f140.m4a (pass -k to keep)
Deleting original file 20240211210810.f616.mp4 (pass -k to keep)
[00:00.000 --> 00:29.980]  Семирная сеть
