<a href="https://colab.research.google.com/github/helinatefera/demo/blob/Scripts/Graph_Based_feat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the MSVD video-caption dataset through kagglehub and keep the value it returns
# (presumably the local dataset path — a later cell prints it as "Path to dataset files").
steveandreasimmanuel_msvd_video_caption_path = kagglehub.dataset_download('steveandreasimmanuel/msvd-video-caption')

print('Data source import complete.')


In [None]:
!pip install torch-geometric
!pip install ultralytics
!!pip install git+https://github.com/openai/CLIP.git
!pip install detectron2
!pip install yolov5

In [None]:
import kagglehub

# Download latest version
# (same dataset handle as the kagglehub call in the first cell of the notebook)
path = kagglehub.dataset_download("steveandreasimmanuel/msvd-video-caption")

# Show the value returned by kagglehub for the downloaded dataset.
print("Path to dataset files:", path)

In [None]:
import torch

# Report how many CUDA devices torch can see, then name each one.
gpu_count = torch.cuda.device_count()
print("Number of GPUs available:", gpu_count)
for i in range(gpu_count):
    gpu_name = torch.cuda.get_device_name(i)
    print(f"GPU {i}: {gpu_name}")


In [None]:
import logging
# Only CRITICAL records from the "ultralytics" logger get through from here on.
ultralytics_logger = logging.getLogger("ultralytics")
ultralytics_logger.setLevel(logging.CRITICAL)

In [None]:
!rm /kaggle/working/testing_all_video_features.pkl

# CLIP with YOLO

In [None]:
import torch
import numpy as np
import os
from PIL import Image
from torchvision import models, transforms
import pickle
import clip
from ultralytics import YOLO
from tqdm import tqdm  # Import tqdm for progress bar

# Load CLIP model
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device)

# Load YOLO model
yolo_model = YOLO("yolov8s.pt")

# Load ResNet for object feature extraction
# NOTE: unlike CLIP, the ResNet is not moved to `device` here.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version.
resnet = models.resnet50(pretrained=True)
resnet.eval()

# Transformation for ResNet: fixed 224x224 resize, tensor conversion, then
# per-channel normalization with the mean/std values listed below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to extract frame-level features using CLIP
def extract_frame_features_from_frame(frame):
    """Encode one frame with CLIP and return its normalized embedding.

    The frame goes through the CLIP ``preprocess`` transform, is encoded by
    ``clip_model.encode_image`` without gradient tracking, and is divided by
    its norm along the last dimension.

    Returns:
        NumPy array with the normalized embedding (leading batch dimension
        of size 1).
    """
    batch = preprocess(frame).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = clip_model.encode_image(batch)
        embedding_norm = embedding.norm(dim=-1, keepdim=True)
        unit_embedding = embedding / embedding_norm
    return unit_embedding.cpu().numpy()

# Function to extract object-level features using YOLO
def extract_object_features_from_frame(frame, score_threshold=0.6):
    """Detect objects in a frame with YOLO and describe each crop with ResNet.

    Args:
        frame: PIL image of the frame (it is cropped with ``frame.crop``).
        score_threshold: Detections whose confidence is not strictly above
            this value are dropped. Defaults to 0.6, the value this function
            previously hard-coded.

    Returns:
        A ``(object_features, bounding_boxes)`` pair of parallel lists: one
        flattened ResNet output vector (NumPy array) per kept detection and
        the matching ``[x1, y1, x2, y2]`` box.
    """
    object_features = []
    bounding_boxes = []

    # Run the ResNet on whatever device it currently lives on, so the function
    # keeps working if the model is moved to the GPU.
    resnet_device = next(resnet.parameters()).device

    # Run YOLO object detection (verbose=False keeps the console quiet)
    results = yolo_model(frame, verbose=False)

    # Process YOLO detections
    for result in results:
        for box in result.boxes.data.cpu().numpy():
            x1, y1, x2, y2, score, _class_id = box
            if not score > score_threshold:  # Filter low-confidence detections
                continue

            cropped_obj = frame.crop((x1, y1, x2, y2))
            # A box that collapses to zero width or height cannot be resized.
            if cropped_obj.width == 0 or cropped_obj.height == 0:
                continue

            obj_tensor = transform(cropped_obj).unsqueeze(0).to(resnet_device)
            with torch.no_grad():
                obj_feature = resnet(obj_tensor).view(-1).cpu().numpy()

            object_features.append(obj_feature)
            bounding_boxes.append([x1, y1, x2, y2])

    return object_features, bounding_boxes

# Function to process each video and extract features
def process_video(video_folder, max_frames=5):
    """Extract CLIP frame features and YOLO/ResNet object features for one video.

    Args:
        video_folder: Directory holding the video's frame images. Entries are
            visited in sorted filename order.
        max_frames: Number of leading frames to process. Defaults to 5, the
            value the slice previously hard-coded; ``None`` processes every
            frame.

    Returns:
        Dict with three parallel per-frame lists under the keys
        ``"frame_features"``, ``"object_features"`` and ``"bounding_boxes"``.
    """
    frame_features_list = []
    object_features_list = []
    bounding_boxes_list = []

    frame_files = sorted(os.listdir(video_folder))
    for frame_name in frame_files[:max_frames]:  # Only the first max_frames frames
        frame_path = os.path.join(video_folder, frame_name)
        # Close the file handle as soon as the RGB copy has been made.
        with Image.open(frame_path) as raw_frame:
            frame = raw_frame.convert("RGB")

        # Extract frame-level features
        frame_features = extract_frame_features_from_frame(frame)

        # Extract object-level features
        obj_features, boxes = extract_object_features_from_frame(frame)

        object_features_list.append(obj_features)
        bounding_boxes_list.append(boxes)
        frame_features_list.append(frame_features)

    return {
        "frame_features": frame_features_list,
        "object_features": object_features_list,
        "bounding_boxes": bounding_boxes_list
    }

# Load existing features
def load_existing_features(pkl_file):
    """Return the object pickled in ``pkl_file``, or ``{}`` if the path does not exist.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files written by this notebook.
    """
    if not os.path.exists(pkl_file):
        return {}

    with open(pkl_file, 'rb') as handle:
        cached_features = pickle.load(handle)
    return cached_features

# Save features
def save_features_to_pkl(all_video_features, pkl_file):
    """Pickle the feature dictionary to ``pkl_file``, creating its folder if needed.

    The data is first written to a temporary sibling file and then moved over
    ``pkl_file`` with ``os.replace``, so an interrupted run does not leave a
    truncated checkpoint behind.

    Args:
        all_video_features: Object to serialize (the per-video feature dict).
        pkl_file: Destination path. A bare filename is written to the current
            working directory.
    """
    parent_dir = os.path.dirname(pkl_file)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    tmp_file = f"{pkl_file}.tmp"
    with open(tmp_file, 'wb') as f:
        pickle.dump(all_video_features, f)
    os.replace(tmp_file, pkl_file)

# Process multiple videos with a progress bar
def process_videos(videos_folder, output_file):
    """Extract features for every video folder, resuming from a saved pickle.

    Previously saved results are loaded from ``output_file``; entries whose
    name is already a key there are skipped. Each newly processed video is
    added to the dictionary, which is written back to ``output_file``
    straight away.

    Args:
        videos_folder: Directory whose sub-directories are individual videos.
        output_file: Pickle path used both to resume and to save progress.

    Returns:
        Dict mapping video name to the dict produced by ``process_video``.
    """
    features_by_video = load_existing_features(output_file)
    entries = sorted(os.listdir(videos_folder))

    with tqdm(total=len(entries), desc="Processing Videos", unit="video") as progress:
        for entry in entries:
            # Only entries that are not cached yet can trigger any work.
            if entry not in features_by_video:
                entry_path = os.path.join(videos_folder, entry)
                # Non-directory entries advance the bar but are otherwise ignored.
                if os.path.isdir(entry_path):
                    features_by_video[entry] = process_video(entry_path)
                    save_features_to_pkl(features_by_video, output_file)
            progress.update(1)

    return features_by_video

# Example usage
# Extract features for the validation split; process_videos both resumes from
# and saves to output_file.
videos_folder = '/kaggle/input/msvd-video-caption/validation'
output_file = '/kaggle/working/validation_all_video_features.pkl'

all_video_features = process_videos(videos_folder, output_file)

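# CLIP with fasterrcnn (note: the cell below still detects objects with YOLOv8; compared with the previous cell it uses the testing split, a 0.2 score threshold and the first 3 frames per video)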

In [None]:
import torch
import numpy as np
import os
from PIL import Image
from torchvision import models, transforms
import pickle
import clip
from ultralytics import YOLO
from tqdm import tqdm  # Import tqdm for progress bar

# Suppress YOLO logs
# (only CRITICAL records from the "ultralytics" logger are let through)
import logging
logging.getLogger("ultralytics").setLevel(logging.CRITICAL)

# Load CLIP model
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device)

# Load YOLO model
yolo_model = YOLO("yolov8s.pt")

# Load ResNet for object feature extraction
# NOTE: unlike CLIP, the ResNet is not moved to `device` here.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version.
resnet = models.resnet50(pretrained=True)
resnet.eval()

# Transformation for ResNet: fixed 224x224 resize, tensor conversion, then
# per-channel normalization with the mean/std values listed below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to extract frame-level features using CLIP
def extract_frame_features_from_frame(frame):
    """Encode one frame with CLIP and return its normalized embedding.

    The frame goes through the CLIP ``preprocess`` transform, is encoded by
    ``clip_model.encode_image`` without gradient tracking, and is divided by
    its norm along the last dimension.

    Returns:
        NumPy array with the normalized embedding (leading batch dimension
        of size 1).
    """
    batch = preprocess(frame).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = clip_model.encode_image(batch)
        embedding_norm = embedding.norm(dim=-1, keepdim=True)
        unit_embedding = embedding / embedding_norm
    return unit_embedding.cpu().numpy()

# Function to extract object-level features using YOLO
def extract_object_features_from_frame(frame, score_threshold=0.2):
    """Detect objects in a frame with YOLO and describe each crop with ResNet.

    Args:
        frame: PIL image of the frame (it is cropped with ``frame.crop``).
        score_threshold: Detections whose confidence is not strictly above
            this value are dropped. Defaults to 0.2, the value this function
            previously hard-coded.

    Returns:
        A ``(object_features, bounding_boxes)`` pair of parallel lists: one
        flattened ResNet output vector (NumPy array) per kept detection and
        the matching ``[x1, y1, x2, y2]`` box.
    """
    object_features = []
    bounding_boxes = []

    # Run the ResNet on whatever device it currently lives on, so the function
    # keeps working if the model is moved to the GPU.
    resnet_device = next(resnet.parameters()).device

    # Run YOLO object detection (disable console output)
    results = yolo_model(frame, verbose=False)

    # Process YOLO detections
    for result in results:
        for box in result.boxes.data.cpu().numpy():
            x1, y1, x2, y2, score, _class_id = box
            if not score > score_threshold:  # Filter low-confidence detections
                continue

            cropped_obj = frame.crop((x1, y1, x2, y2))
            # A box that collapses to zero width or height cannot be resized.
            if cropped_obj.width == 0 or cropped_obj.height == 0:
                continue

            obj_tensor = transform(cropped_obj).unsqueeze(0).to(resnet_device)
            with torch.no_grad():
                obj_feature = resnet(obj_tensor).view(-1).cpu().numpy()

            object_features.append(obj_feature)
            bounding_boxes.append([x1, y1, x2, y2])

    return object_features, bounding_boxes

# Function to process each video and extract features
def process_video(video_folder, max_frames=3):
    """Extract CLIP frame features and YOLO/ResNet object features for one video.

    Args:
        video_folder: Directory holding the video's frame images. Entries are
            visited in sorted filename order.
        max_frames: Number of leading frames to process. Defaults to 3, the
            value the slice previously hard-coded; ``None`` processes every
            frame.

    Returns:
        Dict with three parallel per-frame lists under the keys
        ``"frame_features"``, ``"object_features"`` and ``"bounding_boxes"``.
    """
    frame_features_list = []
    object_features_list = []
    bounding_boxes_list = []

    frame_files = sorted(os.listdir(video_folder))
    for frame_name in frame_files[:max_frames]:  # Only the first max_frames frames
        frame_path = os.path.join(video_folder, frame_name)
        # Close the file handle as soon as the RGB copy has been made.
        with Image.open(frame_path) as raw_frame:
            frame = raw_frame.convert("RGB")

        # Extract frame-level features
        frame_features = extract_frame_features_from_frame(frame)

        # Extract object-level features
        obj_features, boxes = extract_object_features_from_frame(frame)

        object_features_list.append(obj_features)
        bounding_boxes_list.append(boxes)
        frame_features_list.append(frame_features)

    return {
        "frame_features": frame_features_list,
        "object_features": object_features_list,
        "bounding_boxes": bounding_boxes_list
    }

# Load existing features
def load_existing_features(pkl_file):
    """Return the object pickled in ``pkl_file``, or ``{}`` if the path does not exist.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files written by this notebook.
    """
    if not os.path.exists(pkl_file):
        return {}

    with open(pkl_file, 'rb') as handle:
        cached_features = pickle.load(handle)
    return cached_features

# Save features
def save_features_to_pkl(all_video_features, pkl_file):
    """Pickle the feature dictionary to ``pkl_file``, creating its folder if needed.

    The data is first written to a temporary sibling file and then moved over
    ``pkl_file`` with ``os.replace``, so an interrupted run does not leave a
    truncated checkpoint behind.

    Args:
        all_video_features: Object to serialize (the per-video feature dict).
        pkl_file: Destination path. A bare filename is written to the current
            working directory.
    """
    parent_dir = os.path.dirname(pkl_file)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    tmp_file = f"{pkl_file}.tmp"
    with open(tmp_file, 'wb') as f:
        pickle.dump(all_video_features, f)
    os.replace(tmp_file, pkl_file)

# Process multiple videos with a progress bar
def process_videos(videos_folder, output_file):
    """Extract features for every video folder, resuming from a saved pickle.

    Previously saved results are loaded from ``output_file``; entries whose
    name is already a key there are skipped. Each newly processed video is
    added to the dictionary, which is written back to ``output_file``
    straight away.

    Args:
        videos_folder: Directory whose sub-directories are individual videos.
        output_file: Pickle path used both to resume and to save progress.

    Returns:
        Dict mapping video name to the dict produced by ``process_video``.
    """
    features_by_video = load_existing_features(output_file)
    entries = sorted(os.listdir(videos_folder))

    with tqdm(total=len(entries), desc="Processing Videos", unit="video", leave=True) as progress:
        for entry in entries:
            # Only entries that are not cached yet can trigger any work.
            if entry not in features_by_video:
                entry_path = os.path.join(videos_folder, entry)
                # Non-directory entries advance the bar but are otherwise ignored.
                if os.path.isdir(entry_path):
                    features_by_video[entry] = process_video(entry_path)
                    save_features_to_pkl(features_by_video, output_file)
            progress.update(1)

    return features_by_video

# Example usage
# Extract features for the testing split; process_videos both resumes from
# and saves to output_file.
videos_folder = '/kaggle/input/msvd-video-caption/testing'
output_file = '/kaggle/working/testing_all_video_features.pkl'

all_video_features = process_videos(videos_folder, output_file)


In [None]:

import pickle

# Path to the saved pickle file (the file the testing-split run writes to)
pkl_file = "/kaggle/working/testing_all_video_features.pkl"

# Load the features
# NOTE: unpickling can execute arbitrary code — only load files this notebook wrote.
with open(pkl_file, "rb") as f:
    video_graph_features = pickle.load(f)

In [None]:
# Shape of the first object's feature vector in the first processed frame of this video.
# Raises IndexError if that frame has no kept detections.
video_graph_features['eyhzdC936uk_15_27']['object_features'][0][0].shape

(1000,)

In [None]:
# Example labels (you can expand this list)
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car", "a photo of a person"]
# Tokenize the label prompts with clip.tokenize and move the tokens to `device`.
text_inputs = clip.tokenize(labels).to(device)

# Feature Extraction with CLIP

In [None]:
import clip
import torch
from PIL import Image

# Load the CLIP model and the preprocessing transform
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# Example labels (you can expand this list)
labels = ["a photo of a cat","a boy playing with dog", "a photo of a dog", "a photo of a car", "a photo of a person"]
text_inputs = clip.tokenize(labels).to(device)

# Get the text features
with torch.no_grad():
    text_features = model.encode_text(text_inputs)

# Normalize the text features
text_features /= text_features.norm(dim=-1, keepdim=True)

# Assuming 'video_graph_features' contains the features already extracted from the video frame
# (index 2 is the third processed frame of this video)
frame_features = video_graph_features['eyhzdC936uk_15_27']['frame_features'][2]  # Extracted CLIP features

# Convert to a tensor on the same device AND dtype as the text features: the
# pickle may come from a session whose CLIP model ran at a different precision,
# and the matrix product below fails on mismatched dtypes.
frame_features = torch.tensor(frame_features).to(device=device, dtype=text_features.dtype)

# Normalize the extracted frame features
frame_features = frame_features / frame_features.norm(dim=-1, keepdim=True)

# Calculate similarity between frame features and text features
similarity = (frame_features @ text_features.T)

# Get the predicted label (the index with the highest similarity score).
# The index is converted to a plain int before it is used on the Python list.
values, indices = similarity.topk(1)
predicted_label = labels[indices.flatten()[0].item()]
print(f"Predicted label: {predicted_label}")


# Feature Extraction with ResNet

In [None]:
import torch
import torch.nn.functional as F
from torchvision import models, transforms
from PIL import Image

# Load pre-trained ResNet model
# In this cell the ResNet is moved to `device` (GPU when available).
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version.
resnet = models.resnet50(pretrained=True).to(device)
resnet.eval()  # Set to evaluation mode

# Preprocessing for ResNet: fixed 224x224 resize, tensor conversion, then
# per-channel normalization with the mean/std values listed below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Example labels (you can expand this list with more labels)
labels = ["a photo of a cat", "a boy playing with dog","a photo of a dog", "a photo of a car", "a photo of a person"]

# Example function to get features from ResNet
def extract_resnet_features(frame):
    """Run a frame through the ResNet and return the raw output tensor.

    The frame is passed through ``transform``, given a batch dimension of
    size 1 and moved to ``device`` before the forward pass, which runs
    without gradient tracking.
    """
    batch = transform(frame)
    batch = batch.unsqueeze(0).to(device)
    with torch.no_grad():
        return resnet(batch)

# Example: load one frame image from the testing split and extract features using ResNet
image_path = '/kaggle/input/msvd-video-caption/testing/eyhzdC936uk_15_27/003.jpg'  # Make sure this is the correct path to your image
frame_image = Image.open(image_path).convert("RGB")  # Open the image as RGB

# Extract ResNet features for the image
frame_resnet_features = extract_resnet_features(frame_image)  # Get ResNet features

# Normalize the ResNet features in place by their norm along the last dimension
frame_resnet_features /= frame_resnet_features.norm(dim=-1, keepdim=True)

# Example: Compare extracted features with predefined text features
# Here we calculate cosine similarity between the ResNet features and the text features.

def cosine_similarity(a, b):
    """Return the cosine similarity of *a* and *b* along their last dimension."""
    score = F.cosine_similarity(a, b, dim=-1)
    return score

# Create a dummy function for text encoding (in real scenario, this should be a pre-trained text model)
def encode_text(label):
    """Return a random stand-in for a text embedding.

    ``label`` is ignored: the result is a fresh ``torch.randn(1, 1000)``
    tensor moved to ``device``, so it differs on every call. Replace this
    with a real text encoder before relying on the similarity scores.
    """
    # Placeholder only — a pre-trained text model (e.g. BERT) would go here.
    placeholder = torch.randn(1, 1000)
    return placeholder.to(device)

# Get features for each label (text description)
# NOTE: encode_text is a random placeholder, so the prediction below is not
# meaningful until it is replaced by a real text encoder.
text_features = []
for label in labels:
    text_feature = encode_text(label)  # Use a real model to encode text
    text_features.append(text_feature)

# Concatenate the (1, D) rows into one (num_labels, D) matrix. torch.stack would
# give (num_labels, 1, D); the similarity then has shape (num_labels, 1) and
# topk(1) over its last dimension always returns index 0, i.e. the first label.
text_features = torch.cat(text_features, dim=0).to(device)

# Normalize text features
text_features /= text_features.norm(dim=-1, keepdim=True)

# Calculate similarity between image features and text features:
# (1, D) against (num_labels, D) broadcasts to one score per label.
similarity = cosine_similarity(frame_resnet_features, text_features)

# Find the label with the highest similarity
values, indices = similarity.topk(1)
predicted_label = labels[indices[0].item()]  # Get the predicted label

# Output the predicted label
print(f"Predicted label for the image: {predicted_label}")


# Object Detection

In [None]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import numpy as np

# Load YOLOv8
from ultralytics import YOLO

# Load Faster R-CNN
from torchvision.models.detection import fasterrcnn_resnet50_fpn

# Load Models
# NOTE: this rebinds the notebook-level names `yolo_model` and `transform`
# that the feature-extraction functions in earlier cells read.
yolo_model = YOLO("yolov8n.pt")  # YOLOv8 Nano (fast and lightweight)
fasterrcnn_model = fasterrcnn_resnet50_fpn(pretrained=True)
fasterrcnn_model.eval()

# Load Image
image_path = "/kaggle/input/msvd-video-caption/testing/eyhzdC936uk_15_27/060.jpg"
image = Image.open(image_path).convert("RGB")

# Preprocessing for Faster R-CNN
transform = transforms.Compose([
    transforms.ToTensor()
])
image_tensor = transform(image).unsqueeze(0)

# Perform Detection with Faster R-CNN (first element = predictions for the single image)
with torch.no_grad():
    fasterrcnn_preds = fasterrcnn_model(image_tensor)[0]

# Perform Detection with YOLOv8
yolo_preds = yolo_model(image_path)

# Count objects detected, applying the same 0.5 confidence cut-off to both
# models so the two numbers are comparable (and match what draw_boxes shows).
num_fasterrcnn_objects = int((fasterrcnn_preds["scores"] > 0.5).sum())
num_yolo_objects = int((yolo_preds[0].boxes.conf > 0.5).sum())

# Print Comparison
print(f"Faster R-CNN detected {num_fasterrcnn_objects} objects")
print(f"YOLOv8 detected {num_yolo_objects} objects")

# Visualization Function
def draw_boxes(image, boxes, scores, color, model_name):
    """Return an RGB array copy of ``image`` with confident boxes drawn on it.

    Only boxes whose score is above 0.5 are drawn; each gets a rectangle and
    a ``"<model_name> <score>"`` caption 10 pixels above its top-left corner.
    Drawing happens on a BGR copy that is converted back to RGB at the end,
    so the first and last components of ``color`` swap in the returned image.
    """
    canvas = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    for idx, box in enumerate(boxes):
        score = scores[idx]
        if not score > 0.5:  # Confidence threshold
            continue
        left, top, right, bottom = (int(coord) for coord in box)
        cv2.rectangle(canvas, (left, top), (right, bottom), color, 2)
        caption = f"{model_name} {score:.2f}"
        cv2.putText(canvas, caption, (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)

# Draw YOLO Bounding Boxes
yolo_img = draw_boxes(image, yolo_preds[0].boxes.xyxy, yolo_preds[0].boxes.conf, (0, 255, 0), "YOLO")

# Draw Faster R-CNN Bounding Boxes
# NOTE: draw_boxes paints on a BGR copy and converts back to RGB, so (255, 0, 0)
# ends up as blue in the displayed image, not red.
faster_img = draw_boxes(image, fasterrcnn_preds["boxes"], fasterrcnn_preds["scores"], (255, 0, 0), "Faster R-CNN")

# Show Results: the two annotated images side by side
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.imshow(yolo_img)
plt.title("YOLO Detection")

plt.subplot(1, 2, 2)
plt.imshow(faster_img)
plt.title("Faster R-CNN Detection")

plt.show()

# Context aware