In [357]:
import torch
from transformers import CLIPProcessor, CLIPModel
import cv2
import os
from PIL import Image

In [358]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [359]:
video_path = "actual_site_footage.mp4"
output_folder = "clip_wall_frames"
output_folder_2 = "clip_wall_fails"
os.makedirs(output_folder, exist_ok=True)
os.makedirs(output_folder_2, exist_ok=True)

fps_target = 5          # downsample video
batch_size = 16         # frames per batch
threshold = 0.04          # similarity threshold for relevance
device = "cuda" if torch.cuda.is_available() else "cpu"

# Positive and negative prompts
positive_prompts = [
    "wall under construction",
    "construction site wall",
    "concrete blocks",
    "concrete bricks",
    "wall being built"
]

negative_prompts = [
    "closeby worker",
    "distant parking lot",
    "wheelbarrow",
    "wet concrete",
    "lots of orange fencing close by",
    "looking down over ledge"
]

In [360]:
model_name = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(model_name).to(device)
clip_processor = CLIPProcessor.from_pretrained(model_name)

Loading weights: 100%|██████████| 398/398 [00:00<00:00, 1530.00it/s, Materializing param=visual_projection.weight]                                
Loading weights: 100%|██████████| 398/398 [00:00<00:00, 1530.00it/s, Materializing param=visual_projection.weight]                                
[1mCLIPModel LOAD REPORT[0m from: openai/clip-vit-base-patch32
Key                                  | Status     |  | 
-------------------------------------+------------+--+-
text_model.embeddings.position_ids   | UNEXPECTED |  | 
vision_model.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
[1mCLIPModel LOAD REPORT[0m from: openai/clip-vit-base-patch32
Key                                  | Status     |  | 
-------------------------------------+------------+--+-
text_model.embeddings.position_ids   | UNEXPECTED |  | 
vision_model.embeddings.position_ids | UNEXPECTED |

In [361]:
def encode_prompts(prompts):
    inputs = clip_processor(text=prompts, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        outputs = clip_model.get_text_features(**inputs)
        if hasattr(outputs, "text_embeds"):
            features = outputs.text_embeds
        elif hasattr(outputs, "pooler_output"):
            features = outputs.pooler_output
        elif hasattr(outputs, "last_hidden_state"):
            features = outputs.last_hidden_state.mean(dim=1)
        else:
            raise ValueError("Cannot extract tensor from CLIP text output")
        features = features / features.norm(dim=-1, keepdim=True)
    return features

text_features_pos = encode_prompts(positive_prompts)
text_features_neg = encode_prompts(negative_prompts)

In [362]:
def compute_image_features(frames):
    pil_imgs = [Image.fromarray(cv2.cvtColor(f, cv2.COLOR_BGR2RGB)) for f in frames]
    inputs = clip_processor(images=pil_imgs, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = clip_model.get_image_features(**inputs)
        if hasattr(outputs, "image_embeds"):
            img_features = outputs.image_embeds
        elif hasattr(outputs, "pooler_output"):
            img_features = outputs.pooler_output
        elif hasattr(outputs, "last_hidden_state"):
            img_features = outputs.last_hidden_state.mean(dim=1)
        else:
            raise ValueError("Cannot extract tensor from CLIP image output")
        img_features = img_features / img_features.norm(dim=-1, keepdim=True)
    return img_features

In [363]:
def center_crop(frame, crop_frac=0.8):
    h, w, _ = frame.shape
    x1 = int(w*(1-crop_frac)/2)
    x2 = int(w*(1+crop_frac)/2)
    y1 = int(h*(1-crop_frac)/2)
    y2 = int(h*(1+crop_frac)/2)
    return frame[y1:y2, x1:x2]

In [364]:
cap = cv2.VideoCapture(video_path)
orig_fps = cap.get(cv2.CAP_PROP_FPS)
skip = 10

frame_count = 0
selected_count = 0
batch_frames = []
batch_frames_full = []
batch_indices = []
last_check = 1
thres = threshold

while True:
    ret, frame = cap.read()
    if not ret:
        break

    if frame_count % skip == 0:
        cropped = center_crop(frame)
        batch_frames.append(cropped)
        batch_frames_full.append(frame)
        batch_indices.append(frame_count)

    if len(batch_frames) == batch_size:
        img_features = compute_image_features(batch_frames)
        # Positive and negative similarity
        sims_pos = img_features @ text_features_pos.T  # (batch_size, #pos_prompts)
        sims_neg = img_features @ text_features_neg.T  # (batch_size, #neg_prompts)
        relevance_scores = sims_pos.max(dim=1).values - sims_neg.max(dim=1).values
        relevance_scores = relevance_scores.cpu().numpy()

        for i, score in enumerate(relevance_scores):
            #print(f"{sims_pos.max(dim=1).values} vs {sims_neg.max(dim=1).values} for {frame_count}")
            if last_check == 0:
                thres = threshold * 1.7
            else:
                thres = threshold
            if score >= threshold:
                out_path = os.path.join(output_folder, f"frame_{batch_indices[i]:06d}.png")
                cv2.imwrite(out_path, batch_frames_full[i])
                last_check = 1
                selected_count += 1
            else:
                last_check = 0
                out_path = os.path.join(output_folder_2, f"frame_{batch_indices[i]:06d}.png")
                cv2.imwrite(out_path, batch_frames_full[i])

        batch_frames, batch_frames_full, batch_indices = [], [], []

    frame_count += 1
    print(f"{selected_count} out of {frame_count//skip} frames added")

# Process remaining frames
if batch_frames:
    img_features = compute_image_features(batch_frames)
    sims_pos = img_features @ text_features_pos.T
    sims_neg = img_features @ text_features_neg.T
    relevance_scores = sims_pos.max(dim=1).values - sims_neg.max(dim=1).values
    relevance_scores = relevance_scores.cpu().numpy()

    for i, score in enumerate(relevance_scores):
        if score >= threshold:
            out_path = os.path.join(output_folder, f"frame_{batch_indices[i]:06d}.png")
            cv2.imwrite(out_path, batch_frames[i])
            selected_count += 1

cap.release()
print(f"Saved {selected_count} frames to '{output_folder}'")

0 out of 0 frames added
0 out of 0 frames added
0 out of 0 frames added
0 out of 0 frames added
0 out of 0 frames added
0 out of 0 frames added
0 out of 0 frames added
0 out of 0 frames added
0 out of 0 frames added
0 out of 1 frames added
0 out of 1 frames added
0 out of 1 frames added
0 out of 1 frames added
0 out of 1 frames added
0 out of 1 frames added
0 out of 1 frames added
0 out of 1 frames added
0 out of 1 frames added
0 out of 1 frames added
0 out of 2 frames added
0 out of 2 frames added
0 out of 2 frames added
0 out of 2 frames added
0 out of 2 frames added
0 out of 2 frames added
0 out of 2 frames added
0 out of 2 frames added
0 out of 2 frames added
0 out of 2 frames added
0 out of 3 frames added
0 out of 3 frames added
0 out of 3 frames added
0 out of 3 frames added
0 out of 3 frames added
0 out of 3 frames added
0 out of 3 frames added
0 out of 3 frames added
0 out of 3 frames added
0 out of 3 frames added
0 out of 4 frames added
0 out of 4 frames added
0 out of 4 frame