# Load the dividing points of the videos
The dividing point is the point in a video (stored as a frame index in `cut_frame`) at which the news reporter disappears from the scene.

In [None]:
import pandas as pd

In [None]:
# Load the cut-point table and keep only the rows that are not flagged as
# problematic.
# NOTE: unpickling can execute arbitrary code, so cut_points.pkl must come
# from a trusted source.
cutpt_df = (
    pd.read_pickle(r"./Video_Slicing/cut_points.pkl")
    .loc[lambda frame: frame["has_problem"].eq(False)]
)

In [None]:
# Preview the filtered cut-point table (columns: video_name, cut_frame, has_problem).
print(cutpt_df)

                          video_name  cut_frame  has_problem
0       59654a72e6038331360802e0.mp4        235        False
1       59655ff4e603831f360802e1.mp4        151        False
2       596566e7c5e16c5133bc8476.mp4        153        False
3       59656decc5e16c4d33bc8475.mp4        299        False
4       59656ed0c5e16c5433bc8475.mp4        170        False
...                              ...        ...          ...
108045  678cc14d591bed663d6f86ac.mp4        231        False
108046  678ccd28591bed663d6fe2c2.mp4        669        False
108047  678cd118591bed663d6ffc9e.mp4        999        False
108048  678cd46e591bed663d7014d5.mp4        546        False
108049  678cd7a1591bed663d702db4.mp4        400        False

[108041 rows x 3 columns]


# Transcribe the videos to SRT files and export the audio streams as MP3 files

In [None]:
import os
import cv2
import subprocess
import pandas as pd
import whisper
from whisper.utils import get_writer
from tqdm import tqdm  


# Whisper checkpoint used for every transcription in this notebook.
model = whisper.load_model("large")

# Input folder holding the source videos.
video_folder = r"./TVB Reporter Videos"

# Output folders: SRT transcripts and the extracted MP3 audio clips.
output_dir = "./srts"
mp3_output_dir = r"./TVB Reporter MP3"

# Create both output folders up front; existing folders are left untouched.
for _out_dir in (output_dir, mp3_output_dir):
    os.makedirs(_out_dir, exist_ok=True)

def get_frame_rate(video_path):
    """
    Get the frame rate of a video using OpenCV.

    Args:
        video_path: Path of the video file to probe.

    Returns:
        The frames-per-second value reported by OpenCV, or None when the
        video cannot be opened or the reported rate is not a positive number.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return None
        frame_rate = cap.get(cv2.CAP_PROP_FPS)
    finally:
        # Release the capture handle on every path, including the early return.
        cap.release()

    # OpenCV reports 0 for a property the backend cannot provide. Callers
    # divide by this value, so treat 0, negative and NaN rates as "unknown"
    # (the comparison is False for NaN as well).
    if not frame_rate > 0:
        return None
    return frame_rate


# Build the SRT writer once: it only depends on the output folder, not on the
# video currently being processed.
writer = get_writer("srt", output_dir)

# For every usable video: cut the audio up to the dividing point into an MP3,
# then transcribe that MP3 with Whisper and save the result as an SRT file.
for index, row in tqdm(cutpt_df.iterrows(), total=len(cutpt_df), desc="Processing videos"):
    video_name = row["video_name"]
    cut_frame = row["cut_frame"]
    video_path = os.path.join(video_folder, video_name)
    video_stem = os.path.splitext(video_name)[0]

    # Defensive check: rows flagged as problematic are never processed.
    if row["has_problem"]:
        print(f"Skipping {video_name} due to problems.")
        continue

    # A missing, zero or NaN frame rate would make the division below fail
    # (ZeroDivisionError is not caught anywhere and would abort the whole run)
    # or produce a meaningless duration, so such videos are skipped.
    frame_rate = get_frame_rate(video_path)
    if frame_rate is None or not frame_rate > 0:
        print(f"Could not determine frame rate for {video_name}. Skipping.")
        continue

    # Convert the cut point from a frame index to seconds.
    duration_seconds = cut_frame / frame_rate

    mp3_file_path = os.path.join(mp3_output_dir, f"{video_stem}.mp3")

    # Extract only the first `duration_seconds` of the audio stream as MP3.
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # overwrite an existing output file without prompting
        "-i", video_path,
        "-t", str(duration_seconds),  # stop after the dividing point
        "-vn",  # drop the video stream
        "-c:a", "libmp3lame",
        mp3_file_path,
    ]

    try:
        subprocess.run(ffmpeg_cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error processing {video_name}: {e}")
        continue

    # Transcribe the extracted clip and store the transcript as an SRT file
    # named after the video. Failures are reported and the run continues.
    try:
        result = model.transcribe(mp3_file_path, task="transcribe", language="cantonese", fp16=False)
        writer(result, os.path.join(output_dir, f"{video_stem}.srt"))
    except Exception as e:
        print(f"Error transcribing {video_name}: {e}")
        continue

    
    


  checkpoint = torch.load(fp, map_location=device)
Processing videos:   4%|█▉                                                  | 3943/108041 [1:45:47<41:42:03,  1.44s/it]

Error processing 5a02fcf0e603830a7523c831.mp4: Command '['ffmpeg', '-y', '-i', './TVB Reporter Videos/5a02fcf0e603830a7523c831.mp4', '-t', '10.04', '-vn', '-c:a', 'libmp3lame', './TVB Reporter MP3/5a02fcf0e603830a7523c831.mp3']' returned non-zero exit status 4294967274.


Processing videos:   4%|██                                                  | 4358/108041 [1:56:59<42:58:35,  1.49s/it]

Error processing 5a10e273e60383e02d780a59.mp4: Command '['ffmpeg', '-y', '-i', './TVB Reporter Videos/5a10e273e60383e02d780a59.mp4', '-t', '13.3', '-vn', '-c:a', 'libmp3lame', './TVB Reporter MP3/5a10e273e60383e02d780a59.mp3']' returned non-zero exit status 4294967274.


Processing videos: 100%|████████████████████████████████████████████████████| 108041/108041 [50:01:53<00:00,  1.67s/it]


# Generate CSV file

In [None]:
import os
import csv
from tqdm import tqdm


# Folder that is scanned for the .srt transcripts collected into the CSV.
# NOTE(review): the transcription cell writes its SRT files to "./srts";
# confirm this path points at the same files for the current working directory.
directory = r'./TVB_SRT_Transcribing/srts'


def parse_srt(file_path):
    """
    Extract the subtitle texts from an SRT file.

    The file is read as UTF-8 and split into cues at blank lines (lines that
    are empty after stripping whitespace). For every cue with at least three
    lines, the first two lines are dropped and the remaining lines are joined
    with a single space. Shorter cues are ignored.

    Args:
        file_path: Path of the SRT file to read.

    Returns:
        A list with one text string per cue, in file order.
    """
    texts = []
    pending = []

    def _flush():
        # Close the cue collected so far; only cues that have text after
        # their first two lines contribute an entry.
        if len(pending) >= 3:
            texts.append(' '.join(pending[2:]))
        pending.clear()

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                pending.append(stripped)
            else:
                _flush()

    # The last cue may not be followed by a blank line.
    _flush()
    return texts


# Collect the SRT files. os.listdir returns entries in arbitrary order, so the
# names are sorted to make the row order of the CSV reproducible between runs.
srt_files = sorted(f for f in os.listdir(directory) if f.endswith('.srt'))


# One CSV row per subtitle: "<video id>_<subtitle index>" plus the subtitle text.
all_rows = []
for srt_file in tqdm(srt_files):
    video_id = os.path.splitext(srt_file)[0]
    file_path = os.path.join(directory, srt_file)
    subtitles = parse_srt(file_path)

    for index, content in enumerate(subtitles):
        row_id = f"{video_id}_{index}"
        all_rows.append([row_id, content])


# newline='' lets the csv module control line endings itself.
with open('output.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['video_id', 'content'])
    writer.writerows(all_rows)

100%|█████████████████████████████████████████████████████████████████████████| 108039/108039 [05:08<00:00, 350.54it/s]
