In [1]:
import subprocess
import os

def extract_audio(video_path, output_audio_path="output_audio.wav"):
    """
    Extract audio from a video file and convert it to 16kHz mono WAV.

    Parameters:
    - video_path (str): Path to the input video file (.webm or .mp4)
    - output_audio_path (str): Path to save the extracted WAV audio
    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")
    
    # ffmpeg command
    command = [
        "ffmpeg",
        "-i", video_path,
        "-vn",  # disable video
        "-acodec", "pcm_s16le",  # uncompressed audio format
        "-ar", "16000",  # sample rate 16kHz
        "-ac", "1",  # mono channel
        output_audio_path,
        "-y"  # overwrite output if exists
    ]

    try:
        subprocess.run(command, check=True)
        print(f"✅ Audio extracted and saved to: {output_audio_path}")
    except subprocess.CalledProcessError as e:
        print("❌ ffmpeg failed:", e)

In [2]:
video_file_path = "../notebook/input.webm"
extract_audio(video_file_path)

✅ Audio extracted and saved to: output_audio.wav


In [2]:
from faster_whisper import WhisperModel
import difflib

# Load the model (on CPU with float32 precision)
model = WhisperModel("distil-large-v3", compute_type="int8", device="cpu")  # or "float32" for better accuracy


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Transcribe audio with timestamp support
segments, info = model.transcribe("../notebook/output_audio.wav", beam_size=5, language="en")

# Store transcript and segment data
transcript_segments = []

for segment in segments:
    entry = {
        "start": segment.start,
        "end": segment.end,
        "text": segment.text.strip()
    }
    transcript_segments.append(entry)
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

[0.00s -> 3.00s]  The neat thing about working in machine learning is that every few years,
[3.00s -> 7.00s]  somebody invent something crazy that makes you totally reconsider what's possible,
[7.00s -> 12.00s]  like models that can play Go or generate hyper-realistic faces.
[12.00s -> 15.00s]  And today, the mind-blowing discovery that's rocking everyone's world
[15.00s -> 17.00s]  is a type of neural network called a transformer.
[17.00s -> 20.00s]  Transformers are models that can translate text,
[20.00s -> 23.00s]  write poems and op-eds, and even generate computer code.
[23.00s -> 26.00s]  These have been used in biology to solve the protein folding problem.
[26.00s -> 29.00s]  Transformers are like this magical machine-learning hammer
[29.00s -> 31.00s]  that seems to make every problem into a nail.
[31.00s -> 36.00s]  If you've heard of the trendy new ML models, BERT, or GPT3, or T5,
[36.00s -> 38.00s]  all of these models are based on transformers.
[38.00s -> 41.00s]  So if you

In [None]:
import torch
import clip  # pip install git+https://github.com/openai/CLIP.git
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)


# Prepare text embeddings (same as transcript_segments)
texts = [seg["text"] for seg in transcript_segments]
text_tokens = clip.tokenize(texts).to(device)

with torch.no_grad():
    text_embeddings = model.encode_text(text_tokens)
    text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)  # Normalize


def search_transcript_clip(query, top_k=3):
    with torch.no_grad():
        query_token = clip.tokenize([query]).to(device)
        query_embedding = model.encode_text(query_token)
        query_embedding = query_embedding / query_embedding.norm(dim=-1, keepdim=True)

        cosine_scores = (query_embedding @ text_embeddings.T).squeeze(0)

        top_results = torch.topk(cosine_scores, k=top_k)

        results = []
        for score, idx in zip(top_results.values, top_results.indices):
            match = transcript_segments[idx]
            results.append({
                "score": float(score),
                "start": match["start"],
                "end": match["end"],
                "text": match["text"]
            })
        return results


In [12]:
# Step 5: Try it out
query = "transformer paper"
matches = search_transcript_clip(query)

for match in matches:
    print(f"⏱ [{match['start']:.2f}s → {match['end']:.2f}s] (Score: {match['score']:.2f}) → {match['text']}")

⏱ [265.28s → 270.32s] (Score: 0.87) → the original transformer paper is attention is all you need. So, the agreement on
⏱ [43.00s → 45.00s] (Score: 0.86) → you have to know about the transformer.
⏱ [52.00s → 53.00s] (Score: 0.85) → So what is a transformer?


In [35]:
text_embeddings[0].shape

torch.Size([512])

In [36]:
transcript_segments

[{'start': 0.0,
  'end': 3.0,
  'text': 'The neat thing about working in machine learning is that every few years,'},
 {'start': 3.0,
  'end': 7.0,
  'text': "somebody invent something crazy that makes you totally reconsider what's possible,"},
 {'start': 7.0,
  'end': 12.0,
  'text': 'like models that can play Go or generate hyper-realistic faces.'},
 {'start': 12.0,
  'end': 15.0,
  'text': "And today, the mind-blowing discovery that's rocking everyone's world"},
 {'start': 15.0,
  'end': 17.0,
  'text': 'is a type of neural network called a transformer.'},
 {'start': 17.0,
  'end': 20.0,
  'text': 'Transformers are models that can translate text,'},
 {'start': 20.0,
  'end': 23.0,
  'text': 'write poems and op-eds, and even generate computer code.'},
 {'start': 23.0,
  'end': 26.0,
  'text': 'These have been used in biology to solve the protein folding problem.'},
 {'start': 26.0,
  'end': 29.0,
  'text': 'Transformers are like this magical machine-learning hammer'},
 {'start': 29.0

In [38]:
import pandas as pd

df = pd.DataFrame(transcript_segments)

In [40]:
df.to_csv("../notebook/output_frames/text_timestamp.csv", index=False)

In [42]:
df2 = pd.read_csv("../notebook/output_frames/frame_metadata.csv")

In [45]:
df.head()

Unnamed: 0,start,end,text
0,0.0,3.0,The neat thing about working in machine learni...
1,3.0,7.0,somebody invent something crazy that makes you...
2,7.0,12.0,like models that can play Go or generate hyper...
3,12.0,15.0,"And today, the mind-blowing discovery that's r..."
4,15.0,17.0,is a type of neural network called a transformer.


In [44]:
df2.head()

Unnamed: 0,scene,frame_count,frame_id,timestamp_sec,file_name
0,0,0,0,0.0,scene_000_frame_0000.jpg
1,0,1,47,1.96,scene_000_frame_0001.jpg
2,0,2,94,3.921,scene_000_frame_0002.jpg
3,0,3,141,5.881,scene_000_frame_0003.jpg
4,0,4,188,7.841,scene_000_frame_0004.jpg


In [47]:
import pandas as pd

# Load your data
df_images = pd.read_csv("../notebook/output_frames/frame_metadata.csv")  # has timestamp_sec
df_texts = pd.read_csv("../notebook/output_frames/text_timestamp.csv")    # has start, end, text

# Create an empty list to store matches
matched_rows = []

# Iterate over each image frame
for idx, img_row in df_images.iterrows():
    frame_time = img_row['timestamp_sec']
    
    # Find all text segments where start ≤ timestamp ≤ end
    matching_texts = df_texts[(df_texts['start'] <= frame_time) & (df_texts['end'] >= frame_time)]
    
    for tidx, txt_row in matching_texts.iterrows():
        matched_rows.append({
            'frame_id': img_row['frame_id'],
            'file_name': img_row['file_name'],
            'timestamp_sec': img_row['timestamp_sec'],
            'text_id': tidx,
            'start': txt_row['start'],
            'end': txt_row['end'],
            'text': txt_row['text']
        })

# Convert to DataFrame and save
df_matched = pd.DataFrame(matched_rows)
df_matched.to_csv("frame_text_crossmatch.csv", index=False)


In [55]:
df_matched

Unnamed: 0,frame_id,file_name,timestamp_sec,text_id,start,end,text
0,0,scene_000_frame_0000.jpg,0.000,0,0.00,3.00,The neat thing about working in machine learni...
1,47,scene_000_frame_0001.jpg,1.960,0,0.00,3.00,The neat thing about working in machine learni...
2,94,scene_000_frame_0002.jpg,3.921,1,3.00,7.00,somebody invent something crazy that makes you...
3,141,scene_000_frame_0003.jpg,5.881,1,3.00,7.00,somebody invent something crazy that makes you...
4,188,scene_000_frame_0004.jpg,7.841,2,7.00,12.00,like models that can play Go or generate hyper...
...,...,...,...,...,...,...,...
284,12623,scene_045_frame_0007.jpg,526.484,142,525.64,526.64,Hugging Face.
285,12670,scene_045_frame_0008.jpg,528.445,143,526.64,530.04,That's one of the community's favorite ways to...
286,12704,scene_046_frame_0000.jpg,529.863,143,526.64,530.04,That's one of the community's favorite ways to...
287,12751,scene_046_frame_0001.jpg,531.823,145,531.60,534.48,my blog post linked below and thanks for watch...


In [28]:
import pandas as pd
import torch
import clip
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

# Load CSV
df = pd.read_csv("frame_text_crossmatch.csv")

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Cache embeddings to avoid recomputation
image_cache = {}
text_cache = {}

# Store combined embeddings
image_embeddings = []
text_embeddings = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    img_path = row['file_name']
    text = row['text']
    
    # Get image embedding
    if img_path not in image_cache:
        img_path = "../notebook/output_frames/" + img_path
        image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = model.encode_image(image).squeeze()
            image_embedding = image_embedding / image_embedding.norm()
        image_cache[img_path] = image_embedding
    else:
        image_embedding = image_cache[img_path]
    
    # Get text embedding
    if text not in text_cache:
        with torch.no_grad():
            text_token = clip.tokenize([text]).to(device)
            text_embedding = model.encode_text(text_token).squeeze()
            text_embedding = text_embedding / text_embedding.norm()
        text_cache[text] = text_embedding
    else:
        text_embedding = text_cache[text]

    # Max pooling fusion
    image_embeddings.append(image_embedding.cpu().numpy())
    text_embeddings.append(text_embedding.cpu().numpy())

# Convert list to numpy matrix for cosine similarity
# Store for later indexing
# df["embedding_index"] = range(len(df))


100%|██████████| 289/289 [00:25<00:00, 11.15it/s]


In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossAttentionFusion(nn.Module):
    def __init__(self, dim=512, num_heads=8):
        super().__init__()
        self.cross_attn = nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim)
        )

    def forward(self, text_emb, image_emb):
        # Inputs: [B, dim] → reshape to [B, 1, dim]
        text = text_emb.unsqueeze(1)
        image = image_emb.unsqueeze(1)

        # text attends to image
        fused, _ = self.cross_attn(query=text, key=image, value=image)
        fused = self.norm(fused + text)  # residual + norm
        return self.mlp(fused.squeeze(1))  # [B, dim]


In [26]:
def contrastive_loss(image_feats, text_feats, temperature=0.07):
    image_feats = F.normalize(image_feats, dim=-1)
    text_feats = F.normalize(text_feats, dim=-1)
    logits = image_feats @ text_feats.T / temperature
    labels = torch.arange(len(image_feats)).to(image_feats.device)
    loss_i2t = F.cross_entropy(logits, labels)
    loss_t2i = F.cross_entropy(logits.T, labels)
    return (loss_i2t + loss_t2i) / 2


In [57]:
def train_cross_attention(image_embeddings, text_embeddings, epochs=100, lr=1e-4, batch_size=64):
    model = CrossAttentionFusion(dim=512).to("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    dataset = torch.utils.data.TensorDataset(
        torch.tensor(image_embeddings).float(),
        torch.tensor(text_embeddings).float()
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for img_emb, txt_emb in loader:
            img_emb, txt_emb = img_emb.to(model.mlp[0].weight.device), txt_emb.to(model.mlp[0].weight.device)
            fused_i = model(txt_emb, img_emb)
            fused_t = model(txt_emb, img_emb)  # same for both

            loss = contrastive_loss(fused_i, fused_t)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

    return model


In [30]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_fused_embeddings(model, image_embeddings, text_embeddings):
    model.eval()
    with torch.no_grad():
        img_tensor = torch.tensor(image_embeddings).float().to(model.mlp[0].weight.device)
        txt_tensor = torch.tensor(text_embeddings).float().to(model.mlp[0].weight.device)
        fused = model(txt_tensor, img_tensor)
    return fused.cpu().numpy()

def search(query_embedding, fused_embeddings, meta_data, top_k=5):
    query_embedding = query_embedding / np.linalg.norm(query_embedding)
    sims = cosine_similarity([query_embedding], fused_embeddings)[0]
    top_idx = sims.argsort()[::-1][:top_k]
    return [meta_data[i] for i in top_idx], sims[top_idx]


In [34]:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)


import pandas as pd

# Load your data
df = pd.read_csv("frame_text_crossmatch.csv")  # ← put correct path here

# Combine image paths and texts per row
image_paths = df['file_name'].tolist()
texts = df['text'].tolist()

# Optional metadata (frame_id, time etc.)
metadata = df[['file_name', 'text', 'timestamp_sec']].to_dict(orient='records')



In [51]:
def get_text_embedding(clip_model, query_text, device):
    with torch.no_grad():
        tokens = clip.tokenize([query_text]).to(device)
        text_emb = clip_model.encode_text(tokens).squeeze()
    return text_emb / text_emb.norm()

def get_image_embedding(clip_model, image_path, preprocess, device):
    from PIL import Image
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    with torch.no_grad():
        img_emb = clip_model.encode_image(image).squeeze()
    return img_emb / img_emb.norm()

def search_by_text(query_text, model, clip_model, fused_db, metadata, device):
    tokens = clip.tokenize([query_text]).to(device)
    with torch.no_grad():
        text_emb = clip_model.encode_text(tokens).squeeze(0)
        dummy_img = torch.zeros_like(text_emb)
        fused_query = model(text_emb.unsqueeze(0), dummy_img.unsqueeze(0)).squeeze(0)
    return search(fused_query.detach().cpu().numpy(), fused_db, metadata)


def search_by_image(image_path, model, clip_model, preprocess, fused_db, meta_data, device):
    img_emb = get_image_embedding(clip_model, image_path, preprocess, device)
    dummy_txt = torch.zeros_like(img_emb)
    fused_query = model(dummy_txt.unsqueeze(0), img_emb.unsqueeze(0)).squeeze().cpu().numpy()
    return search(fused_query, fused_db, meta_data)


In [62]:
# 1. Train model
model = train_cross_attention(image_embeddings, text_embeddings, epochs=10)

# 2. Compute fused embedding database
fused_db = compute_fused_embeddings(model, image_embeddings, text_embeddings)


Epoch 1/10, Loss: 5.0783
Epoch 2/10, Loss: 0.8889
Epoch 3/10, Loss: 0.6298
Epoch 4/10, Loss: 0.5946
Epoch 5/10, Loss: 0.4154
Epoch 6/10, Loss: 0.4787
Epoch 7/10, Loss: 0.5167
Epoch 8/10, Loss: 0.4321
Epoch 9/10, Loss: 0.3344
Epoch 10/10, Loss: 0.3326


In [61]:
# 3. Search
results, scores = search_by_text("cat ", model, clip_model, fused_db, metadata, device)
for item, score in zip(results, scores):
    print(item, "→ Score:", round(score, 3))

{'file_name': 'scene_005_frame_0001.jpg', 'text': 'like identifying objects and photos.', 'timestamp_sec': 80.622} → Score: 0.248
{'file_name': 'scene_045_frame_0000.jpg', 'text': 'No problem.', 'timestamp_sec': 512.762} → Score: 0.229
{'file_name': 'scene_009_frame_0003.jpg', 'text': 'trouble went looking for Jane.', 'timestamp_sec': 125.417} → Score: 0.227
{'file_name': 'scene_035_frame_0005.jpg', 'text': "The word server here means two very different things, and I know that because I'm looking at the", 'timestamp_sec': 421.463} → Score: 0.221
{'file_name': 'scene_035_frame_0003.jpg', 'text': 'Can I have the check versus looks like I just crashed the server?', 'timestamp_sec': 417.542} → Score: 0.209
