<a href="https://colab.research.google.com/github/hansel67/extract_chinese_subtitles/blob/main/extract_chinese_subtitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@markdown # Requirements

print('[1;32mInstalling requirements...')
from IPython.utils import capture
with capture.capture_output() as cap:
    !pip install paddlepaddle-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple
    !pip install opencv-python-headless pytube tqdm paddleocr
#!apt install ffmpeg

from IPython.display import clear_output
from subprocess import getoutput
import ipywidgets as widgets
import os, logging, re, string, cv2
from google.colab import files
from pytube import YouTube
from tqdm import tqdm
from paddleocr import PaddleOCR

def inf(msg, style, wdth):
    """Display a disabled button that serves as a status badge.

    :param msg: Text used as the button description.
    :param style: Value passed as the button's ``button_style`` (e.g. 'success').
    :param wdth: Value passed as the layout's ``min_width`` (e.g. '50px').
    """
    badge_layout = widgets.Layout(min_width=wdth)
    badge = widgets.Button(
        description=msg,
        disabled=True,  # purely informational, never clickable
        button_style=style,
        layout=badge_layout,
    )
    display(badge)


inf('\u2714 Done','success', '50px')

#@markdown ---

[1;32mInstalling requirements...


Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…

In [24]:
#@markdown # Download Video with Hardcoded Chinese Subtitles from YouTube

# `inf`, `widgets` and `YouTube` are provided by the Requirements cell, which
# has to be run before this one.
Youtube_Link = "https://www.youtube.com/watch?v=EHM4x2XseCM" #@param {type:"string"}

yt = YouTube(Youtube_Link)

# Select the highest-resolution progressive MP4 stream of the video
stream = (
    yt.streams
    .filter(progressive=True, file_extension='mp4')
    .order_by('resolution')
    .desc()
    .first()
)
# `.first()` yields None when the query matches nothing; fail with a clear
# message instead of an AttributeError on the download call.
if stream is None:
    raise RuntimeError("No progressive MP4 stream is available for " + Youtube_Link)

print("Downloading "+yt.title)

# Download the video. The extension is part of the filename so that the file
# on disk is exactly "video.mp4", the path the extraction cell opens.
stream.download(filename='video.mp4')

inf('\u2714 Done','success', '50px')
#@markdown ---

Downloading 【悬疑/谍战】货轮上隐藏巨大阴谋 日本特工和中国地下党的轮番对抗！《劫中劫》第1集【CCTV电视剧】


Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…

In [25]:
#@markdown # Extract Hardcoded Subtitles and Write to SRT

def format_time(seconds):
    """Convert seconds to the SRT time format ``HH:MM:SS,mmm``.

    The input is rounded to the nearest millisecond once and then split with
    integer arithmetic, so binary floating-point error cannot drop a
    millisecond (e.g. 1.001 s renders as ``00:00:01,001``) and a value that
    rounds up carries cleanly into the seconds/minutes/hours fields.

    :param seconds: Time offset in seconds (int or float). Negative values are
        clamped to zero because SRT timestamps cannot be negative.
    :return: Timestamp string; the hours field is zero-padded to at least two
        digits and is not wrapped at 24.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

def normalize_spaces(text):
    """Replace punctuation and whitespace runs with single full-width spaces.

    Every character that is neither a word character nor whitespace becomes a
    full-width space; consecutive whitespace (full-width included) is then
    collapsed into one full-width space and the ends are stripped.

    :param text: Text to normalize. Anything that is not a ``str`` (including
        ``None``) yields an empty string.
    :return: The normalized string.
    """
    if not isinstance(text, str):
        return ""

    fullwidth_space = '　'

    # Pass 1: punctuation and other symbols -> full-width space.
    without_punctuation = re.sub(r'[^\w\s\u3000]', fullwidth_space, text, flags=re.UNICODE)

    # Pass 2: squeeze each whitespace run down to one full-width space.
    squeezed = re.sub(r'[\s\u3000]+', fullwidth_space, without_punctuation, flags=re.UNICODE)

    return squeezed.strip()

def format_srt(consolidated_subtitles, fps):
    """
    Render consolidated subtitles as the text of an SRT file.

    Cues are numbered from 1, frame numbers are converted to timestamps by
    dividing by ``fps``, and consecutive cues are separated by a blank line.

    :param consolidated_subtitles: Iterable of tuples (start_frame, end_frame, text).
    :param fps: Frame rate of the video.
    :return: String containing the formatted SRT content ("" when there are no cues).
    """
    def render_cue(number, cue):
        first_frame, last_frame, caption = cue
        begins_at = format_time(first_frame / fps)
        ends_at = format_time(last_frame / fps)
        return f"{number}\n{begins_at} --> {ends_at}\n{caption}\n"

    return "\n".join(
        render_cue(number, cue)
        for number, cue in enumerate(consolidated_subtitles, 1)
    )

def consolidate_subtitles(ocr_results, fps, consolidation_threshold=100):
    """
    Collapse per-frame OCR hits into subtitle cues.

    Consecutive hits are folded into the current cue while their text equals
    the cue's text and they lie no more than ``consolidation_threshold`` frames
    after the cue's *start* frame. Any other hit closes the current cue and
    opens a new one. A closed cue ends 0.25 seconds after its last hit, but
    never later than the frame before the next cue starts.

    :param ocr_results: List of tuples (text, frame_number) from OCR.
    :param fps: Frame rate of the video.
    :param consolidation_threshold: Maximum distance, in frames, from a cue's
        start frame at which an identical hit is still merged into it.
    :return: List of tuples (start_frame, end_frame, text) for subtitles.
    """
    if not ocr_results:
        return []

    tail_padding = int(0.25 * fps)  # 0.25 seconds expressed in frames
    merged = []
    run_text, run_start = ocr_results[0]
    last_seen = run_start  # frame number of the most recently visited hit

    for text, frame_number in ocr_results[1:]:
        extends_run = (
            text == run_text
            and frame_number - run_start <= consolidation_threshold
        )
        if not extends_run:
            # Close the running cue: pad its last hit, but stop before this hit.
            padded_end = last_seen + tail_padding
            merged.append((run_start, min(padded_end, frame_number - 1), run_text))
            run_text, run_start = text, frame_number
        last_seen = frame_number

    # The final cue has no successor, so its padded end is not capped.
    merged.append((run_start, last_seen + tail_padding, run_text))
    return merged

# Sample the video at a fixed interval, OCR the bottom quarter of each sampled
# frame, then merge the hits into cues and write them to "ch_subs.srt".

# Only let ERROR and above through on the 'ppocr' logger.
logger = logging.getLogger('ppocr')
logger.setLevel(logging.ERROR)

# Interval between extracted frames in seconds
interval = 0.25

# Certainty threshold
certainty_threshold = 0.75

reader = PaddleOCR(lang='ch')
print("Extracting hardcoded subtitles")
results = []
vidcap = cv2.VideoCapture("video.mp4")
try:
    # Without these checks a missing or unreadable file would silently yield
    # an empty SRT and still report success.
    if not vidcap.isOpened():
        raise RuntimeError('Could not open "video.mp4"; run the download cell first.')
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    if fps <= 0:
        raise RuntimeError('Could not read a valid frame rate from "video.mp4".')
    total_frames_to_read = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Number of frames between two OCR samples; at least 1 so the modulo below
    # cannot divide by zero when fps * interval is smaller than one frame.
    frame_step = max(1, int(fps * interval))
    frame_count = 0
    model_loaded = False
    for _ in tqdm(range(total_frames_to_read)):
        success, image = vidcap.read()
        if not success:
            break
        if frame_count % frame_step == 0:
            h, w = image.shape[:2]
            cropped_image = image[int(3 * h / 4):h, 0:w]  # Crop bottom fourth
            if not model_loaded:
                print('Loading model')
            ocr_result = reader.ocr(cropped_image)
            if not model_loaded:
                print('Model loaded')
                model_loaded = True
            # Only the first detection of the frame is used; its second item is
            # unpacked as (text, confidence).
            # NOTE(review): this indexing assumes the nested-list result layout
            # of the installed PaddleOCR version - confirm, and consider pinning.
            if ocr_result and ocr_result[0]:
                text, prob = ocr_result[0][0][1]
                if prob > certainty_threshold:
                    text = normalize_spaces(text)
                    # A caption made only of punctuation normalizes to "" and
                    # would produce a blank SRT cue, so skip it.
                    if text:
                        results.append((text, frame_count))
        frame_count += 1
finally:
    # Free the capture handle even if OCR or decoding raised.
    vidcap.release()

print('Subtitle extraction complete\nWriting Chinese SRT')

# Consolidate repeated subtitles
consolidated_subtitles = consolidate_subtitles(results,fps)

# Compile srt
srt_content = format_srt(consolidated_subtitles, fps)

# Write to SRT file
with open("ch_subs.srt", "w", encoding="utf-8") as file:
    file.write(srt_content)

print("Chinese SRT Complete")

inf('\u2714 Done','success', '50px')

#@markdown ---

Extracting hardcoded subtitles
68119


100%|██████████| 68119/68119 [15:19<00:00, 74.10it/s]

Subtitle extraction complete
Writing Chinese SRT
Chinese SRT Complete





Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…