In [None]:
!pip install torch-geometric
!pip install ultralytics
!!pip install git+https://github.com/openai/CLIP.git
!pip install detectron2
!pip install yolov5

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("steveandreasimmanuel/msvd-video-caption")

print("Path to dataset files:", path)

In [None]:
import torch

print("Number of GPUs available:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")


In [None]:
import logging
logging.getLogger("ultralytics").setLevel(logging.CRITICAL)

In [None]:
!rm /kaggle/working/testing_all_video_features.pkl

# CLIP with Faster R-CNN

In [None]:
import torch
import numpy as np
import os






































from PIL import Image
from torchvision import models, transforms
import pickle
import clip
from ultralytics import YOLO
from tqdm import tqdm  # Import tqdm for progress bar
from torchvision.models.detection import fasterrcnn_resnet50_fpn

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device)

# Load YOLO model
yolo_model = YOLO("yolov8s.pt")  

# Load ResNet for object feature extraction
resnet = models.resnet50(pretrained=True)
resnet.eval()
fasterrcnn_model = fasterrcnn_resnet50_fpn(pretrained=True)
fasterrcnn_model.eval()
# Transformation for ResNet
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to extract frame-level features using CLIP
def extract_frame_features_from_frame(frame):
    """Encode one frame with CLIP and return the L2-normalised embedding.

    Args:
        frame: Image accepted by the CLIP `preprocess` transform.

    Returns:
        NumPy array holding the normalised image embedding, with a leading
        batch axis of size 1.
    """
    batch = preprocess(frame).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = clip_model.encode_image(batch)
        # Scale to unit length along the feature axis.
        embedding = embedding / embedding.norm(dim=-1, keepdim=True)
    return embedding.cpu().numpy()
def extract_object_features_from_frame(frame, score_threshold=0.3):
    """Detect objects with Faster R-CNN and describe each crop with ResNet.

    Args:
        frame: RGB PIL image of a single video frame.
        score_threshold: Detections whose score is not strictly above this
            value are discarded. Defaults to the previously hard-coded 0.3.

    Returns:
        Tuple `(object_features, bounding_boxes)` of parallel lists: the
        flattened ResNet output (NumPy array) of every kept detection and its
        `[x1, y1, x2, y2]` box in integer pixel coordinates.
    """
    object_features = []
    bounding_boxes = []

    # Convert the image to a batched tensor on the active device.
    frame_tensor = transforms.ToTensor()(frame).unsqueeze(0).to(device)

    # Make sure both networks live on the same device as the input.
    fasterrcnn_model.to(device)
    resnet.to(device)

    with torch.no_grad():
        detections = fasterrcnn_model(frame_tensor)[0]

    scores = detections['scores'].cpu().numpy()
    boxes = detections['boxes'].cpu().numpy()

    for score, box in zip(scores.tolist(), boxes):
        if not score > score_threshold:
            continue

        x1, y1, x2, y2 = map(int, box)  # Truncate to whole pixels
        # Truncation can collapse a thin box to zero width or height; such a
        # crop contains no pixels to describe, so it is skipped.
        if x2 <= x1 or y2 <= y1:
            continue

        cropped_obj = frame.crop((x1, y1, x2, y2))
        obj_tensor = transform(cropped_obj).unsqueeze(0).to(device)

        with torch.no_grad():
            # Back to CPU so the feature can be stored as a NumPy array.
            obj_feature = resnet(obj_tensor).view(-1).cpu().numpy()

        object_features.append(obj_feature)
        bounding_boxes.append([x1, y1, x2, y2])

    # Debugging: report how many detections were kept and all raw scores.
    print(f"Detected {len(object_features)} objects with scores: {scores}")

    return object_features, bounding_boxes


# Function to process each video and extract features
def process_video(video_folder):
    """Extract frame-level and object-level features for one video folder.

    Every entry of `video_folder` is opened as an image, in sorted name order.

    Args:
        video_folder: Directory containing the frames of a single video.

    Returns:
        Dict with three per-frame lists under the keys "frame_features",
        "object_features" and "bounding_boxes".
    """
    clip_per_frame = []
    objects_per_frame = []
    boxes_per_frame = []

    # All frames are processed; no frame limit is applied here.
    for frame_name in sorted(os.listdir(video_folder)):
        frame = Image.open(os.path.join(video_folder, frame_name)).convert("RGB")

        clip_embedding = extract_frame_features_from_frame(frame)
        crop_features, crop_boxes = extract_object_features_from_frame(frame)

        clip_per_frame.append(clip_embedding)
        objects_per_frame.append(crop_features)
        boxes_per_frame.append(crop_boxes)

    return {
        "frame_features": clip_per_frame,
        "object_features": objects_per_frame,
        "bounding_boxes": boxes_per_frame
    }

# Load existing features
def load_existing_features(pkl_file):
    """Return the object pickled at `pkl_file`, or an empty dict if the file is absent.

    NOTE: unpickling executes code embedded in the file; only load files this
    notebook wrote itself.
    """
    if not os.path.exists(pkl_file):
        return {}
    with open(pkl_file, 'rb') as handle:
        return pickle.load(handle)

# Save features
def save_features_to_pkl(all_video_features, pkl_file):
    """Pickle `all_video_features` to `pkl_file`, creating parent folders as needed.

    Args:
        all_video_features: Picklable object to persist.
        pkl_file: Destination path; may be a bare filename in the current
            directory.
    """
    parent_dir = os.path.dirname(pkl_file)
    # os.makedirs('') raises, so only create a folder when the path names one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write to a sibling temp file and swap it in, so an interrupted run never
    # leaves a half-written checkpoint behind.
    tmp_file = f"{pkl_file}.tmp"
    with open(tmp_file, 'wb') as f:
        pickle.dump(all_video_features, f)
    os.replace(tmp_file, pkl_file)
# Process multiple videos with a progress bar
def process_videos(videos_folder, output_file):
    """Extract features for every video folder, resuming from `output_file`.

    Videos whose name is already a key in the loaded checkpoint are skipped.
    The checkpoint is rewritten after each newly processed video.

    Args:
        videos_folder: Directory whose sub-directories each hold one video's frames.
        output_file: Pickle path used both as resume source and as output.

    Returns:
        Dict mapping video name to the features returned by `process_video`.
    """
    features_by_video = load_existing_features(output_file)
    candidates = sorted(os.listdir(videos_folder))

    with tqdm(total=len(candidates), desc="Processing Videos", unit="video") as pbar:
        for video_name in candidates:
            video_path = os.path.join(videos_folder, video_name)
            # Only unseen directories need work; plain files are ignored.
            needs_work = video_name not in features_by_video and os.path.isdir(video_path)
            if needs_work:
                features_by_video[video_name] = process_video(video_path)
                save_features_to_pkl(features_by_video, output_file)
            pbar.update(1)

    return features_by_video

# Example usage
videos_folder = '/kaggle/input/msvd-video-caption/validation'
output_file = '/kaggle/working/validation_all_video_features.pkl'

all_video_features = process_videos(videos_folder, output_file)

In [None]:
# NOTE(review): this deletes the same pickle path that the following cell opens,
# so a top-to-bottom run would find no file to load — confirm this cell is
# meant to be run manually only.
rm /kaggle/working/validation_all_video_features.pkl

In [1]:
import pickle

# Path to the saved pickle file
pkl_file = "/kaggle/working/validation_all_video_features.pkl"

# Load the features
with open(pkl_file, "rb") as f:
    video_graph_features = pickle.load(f)

In [3]:
len(video_graph_features['bQJQGoJF7_k_162_169']['bounding_boxes'])

210

In [None]:
import numpy as np

def compute_iou(box1, box2):
    """Calculate Intersection over Union (IoU) between two bounding boxes.

    Args:
        box1: Sequence `(x1, y1, x2, y2)` of corner coordinates.
        box2: Sequence `(x1, y1, x2, y2)` of corner coordinates.

    Returns:
        IoU as a float; 0.0 when the union has no area (e.g. both boxes are
        degenerate), instead of dividing by zero.
    """
    x1, y1, x2, y2 = box1
    x1_p, y1_p, x2_p, y2_p = box2

    # Overlap rectangle; width/height are clamped at zero for disjoint boxes.
    inter_w = max(0, min(x2, x2_p) - max(x1, x1_p))
    inter_h = max(0, min(y2, y2_p) - max(y1, y1_p))
    inter_area = inter_w * inter_h

    box1_area = (x2 - x1) * (y2 - y1)
    box2_area = (x2_p - x1_p) * (y2_p - y1_p)
    union_area = box1_area + box2_area - inter_area

    if union_area <= 0:
        return 0.0
    return inter_area / float(union_area)

def track_objects_across_frames(video_features, iou_threshold=0.5):
    """Assign a track id to every detected box by matching consecutive frames.

    A box inherits the id of the previous-frame box with the highest IoU,
    provided that IoU is strictly above `iou_threshold`; otherwise it starts a
    new track. Ids start at 1.

    Args:
        video_features: Dict with per-frame lists under "object_features" and
            "bounding_boxes".
        iou_threshold: Minimum overlap (exclusive) required to continue a track.

    Returns:
        Dict mapping `(frame_idx, obj_idx)` to an integer track id.
    """
    track_ids = {}
    next_track_id = 1
    previous_boxes = []  # Boxes of the frame handled just before

    per_frame = zip(video_features["object_features"], video_features["bounding_boxes"])
    for frame_no, (_features, current_boxes) in enumerate(per_frame):
        for det_no, current_box in enumerate(current_boxes):
            matched_prev = None
            best_overlap = iou_threshold

            for prev_no, prev_box in enumerate(previous_boxes):
                overlap = compute_iou(current_box, prev_box)
                if overlap > best_overlap:
                    matched_prev, best_overlap = prev_no, overlap

            if matched_prev is None:
                assigned = next_track_id
                next_track_id += 1
            else:
                assigned = track_ids[(frame_no - 1, matched_prev)]

            track_ids[(frame_no, det_no)] = assigned

        previous_boxes = list(current_boxes)

    return track_ids
    
def process_all_videos(all_video_features):
    """Run object tracking on every video and collect the results.

    Args:
        all_video_features: Dict mapping video id to its extracted features.

    Returns:
        Dict mapping video id to the tracking dict produced by
        `track_objects_across_frames`.
    """
    tracks_by_video = {}

    for video_id, features in all_video_features.items():
        print(f"Processing video: {video_id}")
        tracks_by_video[video_id] = track_objects_across_frames(features)
        print("added")

    return tracks_by_video
all_video_tracking_info = process_all_videos(video_graph_features)

In [None]:
len(all_video_tracking_info['bQJQGoJF7_k_162_169'])

In [None]:
import numpy as np
from scipy.spatial.distance import cdist

def build_graph_features(video_features, tracking_info, spatial_threshold=100):
    """Build an object graph (adjacency matrix + node features) for one video.

    Nodes are the detected objects of all frames, numbered frame by frame.
    Two kinds of undirected edges are added:
      * spatial: objects of the same frame whose box centres are closer than
        `spatial_threshold`;
      * temporal: objects in consecutive frames that share a track id.

    Args:
        video_features: Dict with per-frame lists under "frame_features",
            "object_features" and "bounding_boxes".
        tracking_info: Dict mapping `(frame_idx, obj_idx)` to a track id.
        spatial_threshold: Centre distance (exclusive) below which two objects
            of a frame are connected. Defaults to the former constant 100.

    Returns:
        Tuple `(adjacency_matrix, feature_tensor)`: a square NumPy 0/1 matrix
        over all objects and the NumPy array of their stacked features.
    """
    frame_features_list = video_features["frame_features"]
    object_features_list = video_features["object_features"]
    bounding_boxes = video_features["bounding_boxes"]

    num_frames = len(frame_features_list)
    num_objects = sum(len(objects) for objects in object_features_list)

    adjacency_matrix = np.zeros((num_objects, num_objects))
    feature_tensor = []

    object_index_map = {}  # Maps (frame_idx, obj_idx) -> global object index
    for frame_idx, objects in enumerate(object_features_list):
        for obj_idx, obj_feature in enumerate(objects):
            object_index_map[(frame_idx, obj_idx)] = len(feature_tensor)
            feature_tensor.append(obj_feature)

    feature_tensor = np.array(feature_tensor)

    # ---- SPATIAL EDGES (within the same frame) ----
    for frame_idx in range(num_frames):
        frame_boxes = bounding_boxes[frame_idx]
        if not frame_boxes:
            continue  # Skip empty frames

        bbox_centers = np.array(
            [((box[0] + box[2]) / 2, (box[1] + box[3]) / 2) for box in frame_boxes]
        )
        dist_matrix = cdist(bbox_centers, bbox_centers)

        # Upper triangle only: each unordered pair is visited once.
        rows, cols = np.nonzero(np.triu(dist_matrix < spatial_threshold, k=1))
        for i, j in zip(rows.tolist(), cols.tolist()):
            obj1 = object_index_map[(frame_idx, i)]
            obj2 = object_index_map[(frame_idx, j)]
            adjacency_matrix[obj1, obj2] = 1
            adjacency_matrix[obj2, obj1] = 1

    # ---- TEMPORAL EDGES (linking objects across frames) ----
    # Group objects by (frame, track id) so the successor frame is found with
    # one lookup instead of scanning every pair of tracked objects.
    track_members = {}
    for (frame_idx, obj_idx), track_id in tracking_info.items():
        track_members.setdefault((frame_idx, track_id), []).append(obj_idx)

    for (frame_idx, track_id), prev_objs in track_members.items():
        next_objs = track_members.get((frame_idx + 1, track_id))
        if not next_objs:
            continue
        for prev_obj in prev_objs:
            obj1 = object_index_map[(frame_idx, prev_obj)]
            for curr_obj in next_objs:
                obj2 = object_index_map[(frame_idx + 1, curr_obj)]
                adjacency_matrix[obj1, obj2] = 1
                adjacency_matrix[obj2, obj1] = 1

    return adjacency_matrix, feature_tensor



def process_all_videos(video_object_tracking, video_object_feature):
    """Build the object graph of every tracked video.

    Args:
        video_object_tracking: Dict mapping video id to its tracking info.
        video_object_feature: Dict mapping video id to its extracted features.

    Returns:
        Dict mapping video id to a dict with keys "adjacency_matrix" and
        "feature_matrix".
    """
    all_video_graphs = {}  # Store graphs for each video

    for video_id in video_object_tracking.keys():
        tracking_info = video_object_tracking[video_id]  # Object tracking data
        video_features = video_object_feature[video_id]  # Extracted features

        adjacency_matrix, feature_tensor = build_graph_features(video_features, tracking_info)

        all_video_graphs[video_id] = {
            "adjacency_matrix": adjacency_matrix,
            "feature_matrix": feature_tensor
        }

    # Without this return the function handed back None and the graphs were lost.
    return all_video_graphs

In [None]:
# import networkx as nx
# import matplotlib.pyplot as plt

# def visualize_graph(adjacency_matrix, feature_tensor):
#     # Create a graph from the adjacency matrix
#     G = nx.from_numpy_array(adjacency_matrix)

#     # Set up node labels based on feature indices (you can modify this to use more meaningful labels)
#     labels = {i: f'Obj_{i}' for i in range(len(feature_tensor))}

#     # Visualize the graph
#     plt.figure(figsize=(10, 10))
#     pos = nx.spring_layout(G)  # Layout for the nodes (spring layout)
#     nx.draw(G, pos, with_labels=True, labels=labels, node_size=500, node_color='skyblue', font_size=10, font_weight='bold', alpha=0.6)
    
#     plt.title("Video Graph Visualization")
#     plt.show()
# # Example usage: visualize the graph for a specific video
# video_id = "bQJQGoJF7_k_162_169"  # Replace with a specific video ID
# adjacency_matrix = all_video_graphs[video_id]["adjacency_matrix"]
# feature_tensor = all_video_graphs[video_id]["feature_matrix"]

# visualize_graph(adjacency_matrix, feature_tensor)
all_video_graphs

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Load the Q&A CSV file
qa_df = pd.read_csv('/kaggle/input/msvd-question-and-answer/MSVD_QandA.csv')  # Adjust the path to your CSV file

# Prepare the Q&A pairs (video_id, question, answer)
qa_pairs = []
for _, row in qa_df.iterrows():
    qa_pairs.append((row['video_id'], row['question'], row['answer']))

# Assuming 'video_features' is a dictionary with video features as provided
# Example structure:
# video_features = {
#     'bQJQGoJF7_k_162_169': {
#         'adjacency_matrix': np.array(...),
#         'feature_matrix': np.array(...),
#     },
#     ...
# }

# Dataset class for loading the video features and Q&A pairs
class VideoQADataset(Dataset):
    """Pairs each question/answer entry with the graph data of its video.

    Args:
        video_features: Dict mapping video id to a dict with "feature_matrix"
            and "adjacency_matrix" entries.
        qa_pairs: Sequence of `(video_id, question, answer)` entries.
        tokenizer: Callable applied to the question and to the answer text.
    """

    def __init__(self, video_features, qa_pairs, tokenizer):
        self.video_features = video_features
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of question/answer entries."""
        return len(self.qa_pairs)

    def _tokenize(self, text):
        # Same tokenizer options for questions and answers.
        return self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    def __getitem__(self, idx):
        """Return `(video_feature, adjacency_matrix, question_enc, answer_enc)`.

        The two matrices are converted to float32 tensors; the encodings are
        whatever the tokenizer returns.
        """
        entry = self.qa_pairs[idx]
        video_id, question, answer = entry[0], entry[1], entry[2]

        graph = self.video_features[video_id]
        node_features = torch.tensor(graph['feature_matrix'], dtype=torch.float32)
        adjacency = torch.tensor(graph['adjacency_matrix'], dtype=torch.float32)

        question_enc = self._tokenize(question)
        answer_enc = self._tokenize(answer)

        return node_features, adjacency, question_enc, answer_enc


# Model definition
class VideoQAModel(nn.Module):
    """Fuses pooled video-graph data with a BERT question embedding and maps it to one value.

    NOTE(review): `fc` expects `feature_dim` inputs, but `forward` feeds it the
    concatenation of three tensors (pooled features, pooled adjacency, question
    embedding), whose last dimension is the sum of the three parts — these sizes
    look inconsistent; confirm the intended input width.
    """

    def __init__(self, feature_dim, hidden_dim):
        """Create the BERT encoder, the hidden projection and the single-output head.

        Args:
            feature_dim: Input width of the `fc` layer.
            hidden_dim: Output width of `fc` and input width of `decoder`.
        """
        super(VideoQAModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')  # Pre-trained BERT model
        self.fc = nn.Linear(feature_dim, hidden_dim)  # Fully connected layer to map features to hidden state
        self.decoder = nn.Linear(hidden_dim, 1)  # Change to appropriate size for the answer prediction
    
    def forward(self, video_features, adjacency_matrix, question_ids):
        """Pool the graph inputs, embed the question, concatenate, and project.

        Args:
            video_features: Tensor averaged over dim 1 — presumably
                (batch, objects, feature); TODO confirm.
            adjacency_matrix: Tensor averaged over dim 1 — presumably
                (batch, objects, objects); TODO confirm.
            question_ids: Mapping unpacked as keyword arguments into BERT.

        Returns:
            Output of the final linear layer (last dimension 1).
        """
        # Process video features (simplified example, can modify as needed)
        feature_embeds = torch.mean(video_features, dim=1)  # Example aggregation
        adj_embeds = torch.mean(adjacency_matrix, dim=1)  # Example processing for adjacency matrix
        
        # Question embedding from BERT
        question_outputs = self.bert(**question_ids)
        question_embed = question_outputs.last_hidden_state[:, 0, :]  # [CLS] token
        
        # Combine video and question embeddings
        # NOTE(review): the width of adj_embeds follows the adjacency matrix size,
        # which presumably varies per video — verify against the fixed-size `fc`.
        combined_features = torch.cat((feature_embeds, adj_embeds, question_embed), dim=-1)
        
        # Pass through fully connected layers
        hidden = torch.relu(self.fc(combined_features))
        output = self.decoder(hidden)
        
        return output

# Initialize tokenizer for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create the dataset and dataloader
# NOTE(review): no top-level `video_features` is assigned in the cells visible
# above this line (only the training loop further down assigns one) — confirm
# which dict is meant; presumably the per-video graphs with 'feature_matrix' and
# 'adjacency_matrix' keys.
dataset = VideoQADataset(video_features, qa_pairs, tokenizer)
# NOTE(review): each item carries per-video matrices whose sizes presumably
# differ between videos — verify the default batching can stack 32 of them.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model initialization
feature_dim = 2048  # Adjust based on the size of your video features
hidden_dim = 512
model = VideoQAModel(feature_dim=feature_dim, hidden_dim=hidden_dim)

# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# NOTE(review): the training loop compares the model output with the answer's
# token ids under this MSE loss — confirm that is the intended objective.
loss_fn = nn.MSELoss()  # Use appropriate loss function for your task

# Training loop
epochs = 10  # Adjust based on your need
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        video_features, adjacency_matrix, question_ids, answer_ids = batch
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(video_features, adjacency_matrix, question_ids)
        
        # Compute the loss
        loss = loss_fn(outputs, answer_ids['input_ids'])  # Adjust according to your output format
        total_loss += loss.item()
        
        # Backpropagation
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(dataloader)}")

# Save the model (optional)
torch.save(model.state_dict(), 'video_qa_model.pth')


In [None]:
def process_all_videos(video_object_tracking, video_object_feature):
    """Build the object graph of every tracked video.

    Args:
        video_object_tracking: Dict mapping video id to its tracking info.
        video_object_feature: Dict mapping video id to its extracted features.

    Returns:
        Dict mapping video id to a dict with keys "adjacency_matrix" and
        "feature_matrix".
    """
    def _graph_for(video_id):
        # One video: look up its inputs and package the resulting graph.
        tracks = video_object_tracking[video_id]
        features = video_object_feature[video_id]
        adjacency, node_features = build_graph_features(features, tracks)
        return {
            "adjacency_matrix": adjacency,
            "feature_matrix": node_features
        }

    return {video_id: _graph_for(video_id) for video_id in video_object_tracking.keys()}
all_video_graphs = process_all_videos(all_video_tracking_info, video_graph_features)

In [None]:
len(all_video_graphs['bQJQGoJF7_k_162_169']['adjacency_matrix'])

In [None]:
# Dictionary to store tensorized data
video_graph_tensors = {}

# Iterate through all video graph data
for video_id, video_data in all_video_graphs.items():
    adjacency_matrix = video_data['adjacency_matrix']
    feature_tensor = video_data['feature_matrix']
    
    # Convert adjacency matrix and feature tensor to PyTorch tensors
    adj_matrix_tensor = torch.tensor(adjacency_matrix, dtype=torch.float32)
    feature_tensor = torch.tensor(feature_tensor, dtype=torch.float32)
    
    # Store the tensors back in the dictionary
    video_graph_tensors[video_id] = {
        'adjacency_matrix': adj_matrix_tensor,
        'feature_tensor': feature_tensor
    }


In [None]:
video_graph_tensors['bruzcOyIGeg_4_12']['feature_tensor'].shape

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GCNLayer(nn.Module):
    """Single graph-convolution step computing `A @ (X @ W)`.

    Args:
        in_features: Width of the incoming node features.
        out_features: Width of the produced node features.
    """

    def __init__(self, in_features, out_features):
        super(GCNLayer, self).__init__()
        self.weights = nn.Parameter(torch.randn(in_features, out_features))

    def forward(self, adjacency_matrix, feature_tensor):
        """Propagate node features over the graph.

        Args:
            adjacency_matrix: Square adjacency matrix as a tensor, NumPy array
                or nested list.
            feature_tensor: Node feature tensor whose last dimension equals
                `in_features`.

        Returns:
            Tensor of propagated node features with last dimension
            `out_features`.
        """
        # as_tensor accepts arrays and lists but reuses an existing tensor
        # instead of copying it (torch.tensor() on a tensor warns and copies);
        # the matrix is also placed on the same device as the features.
        adjacency_matrix = torch.as_tensor(
            adjacency_matrix, dtype=torch.float32, device=feature_tensor.device
        )

        support = torch.matmul(feature_tensor, self.weights)  # X * W
        return torch.matmul(adjacency_matrix, support)  # A * (X * W)

class VideoGraphNetwork(nn.Module):
    """Two stacked GCN layers with a ReLU in between.

    Args:
        in_features: Width of the input node features.
        hidden_features: Width after the first layer.
        out_features: Width after the second layer.
    """

    def __init__(self, in_features, hidden_features, out_features):
        super(VideoGraphNetwork, self).__init__()
        self.gcn1 = GCNLayer(in_features, hidden_features)
        self.gcn2 = GCNLayer(hidden_features, out_features)

    def forward(self, adjacency_matrix, feature_tensor):
        """Apply both graph convolutions over the same adjacency matrix."""
        hidden = F.relu(self.gcn1(adjacency_matrix, feature_tensor))
        return self.gcn2(adjacency_matrix, hidden)



# Define the model
# The input width is read from the stored node features rather than hard-coded,
# so the first layer's weight matrix matches the feature tensor's last dimension.
sample_graph = video_graph_tensors['bb6V0Grtub4_174_185']
model = VideoGraphNetwork(
    in_features=sample_graph['feature_tensor'].shape[-1],
    hidden_features=5,
    out_features=2,
)

# Forward pass
output = model(sample_graph['adjacency_matrix'], sample_graph['feature_tensor'])
print(output)


# CLIP with yolo

In [None]:
import torch
import numpy as np
import os
from PIL import Image
from torchvision import models, transforms
import pickle
import clip
from ultralytics import YOLO
from tqdm import tqdm  # Import tqdm for progress bar

# Suppress YOLO logs
import logging
logging.getLogger("ultralytics").setLevel(logging.CRITICAL)

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device)

# Load YOLO model
yolo_model = YOLO("yolov8s.pt")  

# Load ResNet for object feature extraction
resnet = models.resnet50(pretrained=True)
resnet.eval()

# Transformation for ResNet
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to extract frame-level features using CLIP
def extract_frame_features_from_frame(frame):
    """Return the unit-length CLIP image embedding of `frame` as a NumPy array.

    The frame is run through the CLIP `preprocess` transform, batched, and
    encoded without gradient tracking.
    """
    clip_input = preprocess(frame).unsqueeze(0).to(device)
    with torch.no_grad():
        raw_features = clip_model.encode_image(clip_input)
        lengths = raw_features.norm(dim=-1, keepdim=True)
        unit_features = raw_features / lengths
    return unit_features.cpu().numpy()

# Function to extract object-level features using YOLO
def extract_object_features_from_frame(frame):
    """Detect objects with YOLO and describe each crop with ResNet.

    Detections whose score is not above 0.2 are dropped.

    Args:
        frame: PIL image of a single video frame.

    Returns:
        Tuple `(object_features, bounding_boxes)` of parallel lists: the
        flattened ResNet output of each kept crop and its `[x1, y1, x2, y2]`
        box as produced by the detector.
    """
    kept_features, kept_boxes = [], []

    for result in yolo_model(frame):
        detections = result.boxes.data.cpu().numpy()
        # Each row unpacks into four box corners, a score and a class id.
        for x1, y1, x2, y2, score, class_id in detections:
            if not score > 0.2:
                continue  # Filter low-confidence detections

            crop = frame.crop((x1, y1, x2, y2))
            crop_batch = transform(crop).unsqueeze(0)

            with torch.no_grad():
                embedding = resnet(crop_batch).view(-1).numpy()

            kept_features.append(embedding)
            kept_boxes.append([x1, y1, x2, y2])

    return kept_features, kept_boxes

# Function to process each video and extract features
def process_video(video_folder, max_frames=3):
    """Extract frame-level and object-level features for one video folder.

    Args:
        video_folder: Directory containing the frames of a single video.
        max_frames: Number of frames (in sorted name order) to process.
            Defaults to 3, the previously hard-coded limit; pass None to
            process every frame.

    Returns:
        Dict with three per-frame lists under the keys "frame_features",
        "object_features" and "bounding_boxes".
    """
    frame_features_list = []
    object_features_list = []
    bounding_boxes_list = []

    frame_files = sorted(os.listdir(video_folder))
    # Slicing with None keeps the whole list.
    for frame_name in frame_files[:max_frames]:
        frame_path = os.path.join(video_folder, frame_name)
        frame = Image.open(frame_path).convert("RGB")

        # Frame-level (CLIP) features
        frame_features = extract_frame_features_from_frame(frame)

        # Object-level features and their boxes
        obj_features, boxes = extract_object_features_from_frame(frame)

        object_features_list.append(obj_features)
        bounding_boxes_list.append(boxes)
        frame_features_list.append(frame_features)

    return {
        "frame_features": frame_features_list,
        "object_features": object_features_list,
        "bounding_boxes": bounding_boxes_list
    }

# Load existing features
def load_existing_features(pkl_file):
    """Load previously saved features, falling back to an empty dict.

    Args:
        pkl_file: Path of the pickle checkpoint.

    Returns:
        The unpickled object when the file exists, otherwise `{}`.
    """
    cached = {}
    if os.path.exists(pkl_file):
        # Only unpickle files written by this notebook; pickle can run code.
        with open(pkl_file, 'rb') as checkpoint:
            cached = pickle.load(checkpoint)
    return cached

# Save features
def save_features_to_pkl(all_video_features, pkl_file):
    """Pickle `all_video_features` to `pkl_file`, creating parent folders as needed.

    Args:
        all_video_features: Picklable object to persist.
        pkl_file: Destination path; may be a bare filename in the current
            directory.
    """
    parent_dir = os.path.dirname(pkl_file)
    # os.makedirs('') raises, so only create a folder when the path names one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write to a sibling temp file and swap it in, so an interrupted run never
    # leaves a half-written checkpoint behind.
    tmp_file = f"{pkl_file}.tmp"
    with open(tmp_file, 'wb') as f:
        pickle.dump(all_video_features, f)
    os.replace(tmp_file, pkl_file)

# Process multiple videos with a progress bar
def process_videos(videos_folder, output_file):
    """Extract features for all video folders, checkpointing after each one.

    Entries already present in the checkpoint loaded from `output_file` are
    skipped; entries that are not directories are ignored.

    Args:
        videos_folder: Directory with one sub-directory of frames per video.
        output_file: Pickle path used for resuming and for saving.

    Returns:
        Dict mapping video name to its extracted features.
    """
    results = load_existing_features(output_file)
    names = sorted(os.listdir(videos_folder))

    with tqdm(total=len(names), desc="Processing Videos", unit="video", leave=True) as pbar:
        for video_name in names:
            already_done = video_name in results
            if not already_done:
                video_path = os.path.join(videos_folder, video_name)
                if os.path.isdir(video_path):
                    results[video_name] = process_video(video_path)
                    # Persist right away so an interrupted run can resume.
                    save_features_to_pkl(results, output_file)
            pbar.update(1)

    return results

# Example usage
videos_folder = '/kaggle/input/msvd-video-caption/train'
output_file = '/kaggle/working/train_all_video_features.pkl'

all_video_features = process_videos(videos_folder, output_file)


In [None]:
# Example labels (you can expand this list)
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car", "a photo of a person"]
text_inputs = clip.tokenize(labels).to(device)

In [None]:
video_graph_features['eyhzdC936uk_15_27']['object_features'][0][0].shape

# Feature Extraction with CLIP

In [None]:
import clip
import torch
from PIL import Image

# Load the CLIP model and the preprocessing transform
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# Example labels (you can expand this list)
labels = ["a photo of a cat","a boy playing with dog", "a photo of a dog", "a photo of a car", "a photo of a person"]
text_inputs = clip.tokenize(labels).to(device)

# Assuming 'video_graph_features' contains the features already extracted from the video frame
frame_features = video_graph_features['eyhzdC936uk_15_27']['frame_features'][2]  # Extracted CLIP features

# Normalize the extracted frame features
frame_features = torch.tensor(frame_features).to(device)  # Convert to tensor
frame_features /= frame_features.norm(dim=-1, keepdim=True)

# Get the text features
with torch.no_grad():
    text_features = model.encode_text(text_inputs)

# Normalize the text features
text_features /= text_features.norm(dim=-1, keepdim=True)

# Calculate similarity between frame features and text features
similarity = (frame_features @ text_features.T)

# Get the predicted label (the index with the highest similarity score)
values, indices = similarity.topk(1)
predicted_label = labels[indices[0]]
print(f"Predicted label: {predicted_label}")


# Feature Extraction with Resnet

In [None]:
import torch
import torch.nn.functional as F
from torchvision import models, transforms
from PIL import Image

# Load pre-trained ResNet model
device = "cuda" if torch.cuda.is_available() else "cpu"
resnet = models.resnet50(pretrained=True).to(device)
resnet.eval()  # Set to evaluation mode

# Preprocessing for ResNet
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Example labels (you can expand this list with more labels)
labels = ["a photo of a cat", "a boy playing with dog","a photo of a dog", "a photo of a car", "a photo of a person"]

# Example function to get features from ResNet
def extract_resnet_features(frame):
    """Run one image through ResNet and return the raw output tensor.

    The image is transformed, given a batch axis and moved to `device`; the
    forward pass runs without gradient tracking.
    """
    batch = transform(frame).unsqueeze(0).to(device)
    with torch.no_grad():
        return resnet(batch)

# Example: Load the image ('image.jpg') and extract features using ResNet
image_path = '/kaggle/input/msvd-video-caption/testing/eyhzdC936uk_15_27/003.jpg'  # Make sure this is the correct path to your image
frame_image = Image.open(image_path).convert("RGB")  # Open the image

# Extract ResNet features for the image
frame_resnet_features = extract_resnet_features(frame_image)  # Get ResNet features

# Normalize the ResNet features
frame_resnet_features /= frame_resnet_features.norm(dim=-1, keepdim=True)

# Example: Compare extracted features with predefined text features
# Here we calculate cosine similarity between the ResNet features and the text features.

def cosine_similarity(a, b):
    """Cosine similarity of `a` and `b` along their last dimension."""
    return F.cosine_similarity(x1=a, x2=b, dim=-1)

# Create a dummy function for text encoding (in real scenario, this should be a pre-trained text model)
def encode_text(label):
    """Placeholder text encoder: ignores `label` and returns random features.

    Returns:
        A freshly sampled random tensor of shape (1, 1000) moved to `device`.
        A real text model should replace this stand-in.
    """
    random_vector = torch.randn(1, 1000)
    return random_vector.to(device)

# Get features for each label (text description)
text_features = [encode_text(label) for label in labels]  # Use a real model to encode text

# Join the (1, D) rows into one (num_labels, D) matrix. Stacking them instead
# produced an extra axis of size 1, so top-k ran over that axis and the first
# label was always selected.
text_features = torch.cat(text_features, dim=0).to(device)

# Normalize text features
text_features /= text_features.norm(dim=-1, keepdim=True)

# One similarity score per label
similarity = cosine_similarity(frame_resnet_features, text_features)

# Find the label with the highest similarity
values, indices = similarity.topk(1)
predicted_label = labels[indices[0].item()]  # Get the predicted label

# Output the predicted label
print(f"Predicted label for the image: {predicted_label}")


# Object Detection

In [None]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import numpy as np

# Load YOLOv8
from ultralytics import YOLO

# Load Faster R-CNN
from torchvision.models.detection import fasterrcnn_resnet50_fpn

# Load Models
yolo_model = YOLO("yolov8n.pt")  # YOLOv8 Nano (fast and lightweight)
fasterrcnn_model = fasterrcnn_resnet50_fpn(pretrained=True)
fasterrcnn_model.eval()

# Load Image
image_path = "/kaggle/input/msvd-video-caption/testing/eyhzdC936uk_15_27/060.jpg"
image = Image.open(image_path).convert("RGB")

# Preprocessing for Faster R-CNN
transform = transforms.Compose([
    transforms.ToTensor()
])
image_tensor = transform(image).unsqueeze(0)

# Perform Detection with Faster R-CNN
with torch.no_grad():
    fasterrcnn_preds = fasterrcnn_model(image_tensor)[0]

# Perform Detection with YOLOv8
yolo_preds = yolo_model(image_path)

# Count objects detected, applying the same 0.5 confidence cut-off to both
# detectors so the two numbers are comparable and match the boxes that
# draw_boxes renders.
num_fasterrcnn_objects = sum(1 for score in fasterrcnn_preds["scores"] if score > 0.5)
num_yolo_objects = sum(1 for score in yolo_preds[0].boxes.conf if score > 0.5)

# Print Comparison
print(f"Faster R-CNN detected {num_fasterrcnn_objects} objects")
print(f"YOLOv8 detected {num_yolo_objects} objects")

# Visualization Function
def draw_boxes(image, boxes, scores, color, model_name):
    """Return an RGB array copy of `image` with confident detections drawn on it.

    Args:
        image: Input image, convertible with `np.array`.
        boxes: Indexable collection of `(x1, y1, x2, y2)` boxes.
        scores: Indexable collection of scores, aligned with `boxes`.
        color: Colour tuple passed to OpenCV for the rectangle and the label.
        model_name: Text prefixed to each score label.

    Returns:
        Annotated image as an array in RGB channel order. Only boxes whose
        score is above 0.5 are drawn.
    """
    canvas = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    for idx in range(len(boxes)):
        if not scores[idx] > 0.5:  # Confidence threshold
            continue
        left, top, right, bottom = map(int, boxes[idx])
        cv2.rectangle(canvas, (left, top), (right, bottom), color, 2)
        caption = f"{model_name} {scores[idx]:.2f}"
        cv2.putText(canvas, caption, (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)

# Draw YOLO Bounding Boxes
yolo_img = draw_boxes(image, yolo_preds[0].boxes.xyxy, yolo_preds[0].boxes.conf, (0, 255, 0), "YOLO")

# Draw Faster R-CNN Bounding Boxes
faster_img = draw_boxes(image, fasterrcnn_preds["boxes"], fasterrcnn_preds["scores"], (255, 0, 0), "Faster R-CNN")

# Show Results
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.imshow(yolo_img)
plt.title("YOLO Detection")

plt.subplot(1, 2, 2)
plt.imshow(faster_img)
plt.title("Faster R-CNN Detection")

plt.show()

# Context aware

In [None]:

import pickle

# Path to the saved pickle file
pkl_file = "/kaggle/working/train_all_video_features.pkl"

# Load the features
with open(pkl_file, "rb") as f:
    video_graph_features = pickle.load(f)

In [None]:
video_graph_features['-7KMZQEsJW4_205_208']['object_features'][2]