# Locating the end of the News Reporter's Speech

This Jupyter Notebook implements a video-processing pipeline that uses the CLIP (Contrastive Language-Image Pretraining) model to detect semantic transition points (cut points) in videos. It processes a directory of videos, identifies for each one the first frame whose semantic content differs substantially from that of the video's first frame, and saves the results to a pickle file.

In [None]:
import cv2
import torch
import clip
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import os
from tqdm import tqdm

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "ViT-L/14@336px" CLIP checkpoint onto the chosen device.
# `clip.load` hands back the model together with the image preprocessing
# callable that is applied to every frame before encoding.
model, preprocess = clip.load("ViT-L/14@336px", device=device)

def get_frame_similarity(frame1, frame2):
    """
    Return the cosine similarity between the CLIP image embeddings of two frames.

    Each frame is converted to a PIL image, run through the module-level
    `preprocess` callable, given a leading batch dimension, and moved to
    `device`. Both are then encoded with `model.encode_image` with gradient
    tracking disabled.

    Args:
        frame1: First frame as an array accepted by `Image.fromarray`
            (presumably an H x W x 3 uint8 array - confirm against callers).
        frame2: Second frame, same format as `frame1`.

    Returns:
        The cosine similarity of the two embeddings as a Python float.

    NOTE(review): `Image.fromarray` does not reorder channels, so the arrays
    are interpreted exactly as given. Callers holding OpenCV frames (BGR)
    should confirm the channel order they pass in.
    """
    # Build one single-image batch per frame, in argument order.
    batches = [
        preprocess(Image.fromarray(frame)).unsqueeze(0).to(device)
        for frame in (frame1, frame2)
    ]

    # Inference only: no autograd graph is needed for the embeddings.
    with torch.no_grad():
        embeddings = [model.encode_image(batch) for batch in batches]

    similarity = torch.cosine_similarity(embeddings[0], embeddings[1])
    return similarity.item()

def find_cut_point(video_path, similarity_threshold=0.8):
    """
    Binary-search a video for the first frame that is semantically different from frame 0.

    The first frame of the video is used as the reference. Frame indices are
    bisected: whenever the probed frame's similarity to the reference (as
    computed by `get_frame_similarity`) is below `similarity_threshold`, the
    probe is recorded as a candidate cut and the search continues in the
    earlier half; otherwise it continues in the later half. The bisection
    only finds the true first change if similarity stays below the threshold
    once it has dropped; that assumption is not checked here.

    Args:
        video_path: Path of the video file to open with OpenCV.
        similarity_threshold: Similarity below which a frame counts as
            different from the reference frame.

    Returns:
        A `(cut_frame, has_problem)` tuple. `cut_frame` is the smallest
        probed frame index whose similarity fell below the threshold.
        If no such frame was found, or the first frame could not be read,
        the result is `(0, True)`.
    """
    cap = cv2.VideoCapture(video_path)
    # try/finally guarantees the capture is released on every exit path,
    # including the early return below and any exception raised while
    # computing similarities.
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        ret, base_frame = cap.read()
        if not ret:
            print(f"Failed to read the video: {video_path}")
            # Report failure in the same tuple shape as every other outcome
            # so callers that unpack two values do not crash.
            return 0, True

        # OpenCV decodes frames as BGR, while PIL (used by
        # get_frame_similarity) interprets 3-channel arrays as RGB.
        base_frame = cv2.cvtColor(base_frame, cv2.COLOR_BGR2RGB)

        start_frame = 0
        end_frame = total_frames - 1
        cut_frame = None

        while start_frame <= end_frame:
            mid_frame = (start_frame + end_frame) // 2
            cap.set(cv2.CAP_PROP_POS_FRAMES, mid_frame)
            ret, mid_frame_img = cap.read()
            if not ret:
                # The reported frame count can exceed the number of frames
                # that can actually be decoded. Treat an unreadable frame as
                # lying past the end of the video and keep searching the
                # earlier half instead of abandoning the search.
                end_frame = mid_frame - 1
                continue

            mid_frame_img = cv2.cvtColor(mid_frame_img, cv2.COLOR_BGR2RGB)
            similarity = get_frame_similarity(base_frame, mid_frame_img)

            if similarity < similarity_threshold:
                # Candidate cut found; look for an earlier one.
                cut_frame = mid_frame
                end_frame = mid_frame - 1
            else:
                # Still similar to the reference; the cut must be later.
                start_frame = mid_frame + 1
    finally:
        cap.release()

    # No frame ever dropped below the threshold: flag the video for review.
    if cut_frame is None:
        return 0, True
    return cut_frame, False

def process_videos_in_directory(directory, similarity_threshold=0.8):
    """
    Find the cut point of every ".mp4" file in a directory and collect the results.

    Args:
        directory: Folder whose entries are scanned (non-recursively) for
            names ending in ".mp4".
        similarity_threshold: Threshold forwarded to `find_cut_point`.

    Returns:
        A pandas DataFrame with one row per video and the columns
        "video_name", "cut_frame" and "has_problem". The columns are present
        even when the directory contains no matching files.
    """
    result_columns = ["video_name", "cut_frame", "has_problem"]

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the output is reproducible between runs and machines.
    video_files = sorted(f for f in os.listdir(directory) if f.endswith(".mp4"))

    results = []
    for filename in tqdm(video_files, desc="Processing Videos"):
        video_path = os.path.join(directory, filename)

        outcome = find_cut_point(video_path, similarity_threshold)
        if outcome is None:
            # An unreadable video must not abort the whole batch: record it
            # with the same "no cut found" marker used for other failures.
            cut_frame, has_problem = 0, True
        else:
            cut_frame, has_problem = outcome

        results.append({
            "video_name": filename,
            "cut_frame": cut_frame,
            "has_problem": has_problem,
        })

    # Passing the column names explicitly keeps the schema stable for an
    # empty result list.
    return pd.DataFrame(results, columns=result_columns)


if __name__ == "__main__":
    # Input folder, output file and the similarity cut-off used for this run.
    video_directory = "./icable Reporter Videos"
    output_path = "./cut_points.pkl"
    similarity_threshold = 0.85

    # Compute one cut point per video, then persist the table as a pickle.
    df = process_videos_in_directory(
        directory=video_directory,
        similarity_threshold=similarity_threshold,
    )
    df.to_pickle(output_path)

    # Report where the results went and show the table.
    print(f"Processing completed. Results have been saved to: {output_path}")
    print(df)


Processing Videos: 100%|██████████| 28899/28899 [8:08:00<00:00,  1.01s/it]  

Processing completed. Results have been saved to: ./cut_points.pkl
      video_name  cut_frame  has_problem
0          0.mp4        197        False
1          1.mp4         38        False
2       1000.mp4        359        False
3      10001.mp4        347        False
4      10002.mp4        143        False
...          ...        ...          ...
28894   9991.mp4        199        False
28895   9992.mp4        326        False
28896   9994.mp4        158        False
28897   9996.mp4        132        False
28898   9997.mp4        296        False

[28899 rows x 3 columns]



