In [None]:
from ultralytics import YOLO
import cv2
import json
import time
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision import models
from collections import defaultdict
from deep_sort_realtime.deepsort_tracker import DeepSort

# Load the trained YOLO detector from disk and keep it on the CPU.
# process_video() below treats class id 0 as "jacket" and class id 1 as "suitcase".
model = YOLO("/home/mcw/Karthick/shopable-ad2.0/yolo_model/best_shop_mdl.pt")
model.to("cpu")

# Load classification model (ResNet) for suitcase classification
# NOTE(review): `pretrained=` is reported as deprecated in newer torchvision
# releases in favour of `weights=None` -- confirm against the installed version.
classification_model = models.resnet50(pretrained=False)
# Replace the final fully-connected layer so the network emits 4 logits,
# one per "suitcaseN" entry in the products table.
classification_model.fc = torch.nn.Linear(classification_model.fc.in_features, 4)  # 4 suitcase classes
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
classification_model.load_state_dict(torch.load("/home/mcw/Karthick/shopable-ad2.0/classification_model/suitcase_classify.pth", map_location=torch.device("cpu")))
classification_model.to("cpu")
# Inference mode: switches off dropout / batch-norm statistic updates.
classification_model.eval()

# Define product links and metadata
# Lookup table keyed by product identifier. Each entry carries the "link",
# "name" and "price" strings that process_video() copies into the JSON output
# and draws on the annotated frames.
#   - "jacket" is used directly for detector class 0.
#   - "suitcase1".."suitcase4" are selected by classify_object(), which builds
#     the key as f"suitcase{predicted_index + 1}".
products = {
    "jacket": {
        "link": "https://www.ajio.com/brooks-brothers-stretch-tennis-zip-front-bomber-jacket/p/410309671001",
        "name": "Brooks Brothers Bomber Jacket",
        "price": "$129.99"
    },
    "suitcase1": {
        "link": "https://www.indiamart.com/proddetail/american-tourister-player-trolly-bag-2853815356891.html",
        # NOTE(review): this name looks truncated/garbled -- confirm the intended product name.
        "name": "Am  lly",
        "price": "$89.99"
    },
    "suitcase2": {
        "link": "https://www.americantourister.in/play4blue-blue-sb4061002",
        "name": "American Tourister Play4Blue",
        "price": "$95.99"
    },
    "suitcase3": {
        # NOTE(review): plain-http link to a behance gallery, unlike the other
        # entries -- confirm this is the intended product page.
        "link": "http://behance.net/gallery/84337561/American-Tourister-Bags?locale=es_ES",
        "name": "American Tourister Signature",
        "price": "$109.99"
    },
    "suitcase4": {
        "link": "https://www.flipkart.com/american-tourister-trafford-spinner-78cm-navy-check-in-suitcase-31-inch/p/itm0af067fc544c3",
        "name": "American Tourister Trafford Spinner",
        "price": "$119.99"
    }
}

def classify_object(cropped_img):
    """Classify a cropped suitcase image and return its product info.

    Args:
        cropped_img: Image crop in OpenCV BGR channel order (a NumPy array
            sliced out of a video frame).

    Returns:
        The entry of ``products`` selected by the classifier (a dict with
        "link", "name" and "price"), or ``None`` when the crop is missing or
        empty, or when the predicted index has no matching product entry.
    """
    # A degenerate detection box (zero width/height) produces an empty crop,
    # which cv2.cvtColor rejects with an error. The caller already skips
    # objects whose product info is None, so report "no product" instead.
    if cropped_img is None or getattr(cropped_img, "size", 0) == 0:
        return None

    # Convert OpenCV BGR to RGB
    rgb_img = cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB)

    # Create a PIL image
    pil_img = transforms.functional.to_pil_image(rgb_img)

    # Resize to the 224x224 input, convert to a tensor and normalise with the
    # per-channel mean/std constants given below.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Add the leading batch dimension the model expects.
    img_tensor = transform(pil_img).unsqueeze(0)

    with torch.no_grad():
        outputs = classification_model(img_tensor)
        _, predicted = torch.max(outputs, 1)

    # Classifier indices are 0-based; product keys are "suitcase1".."suitcase4".
    product_key = f"suitcase{predicted.item() + 1}"
    return products.get(product_key, None)

def process_video(video_path, output_path, json_path):
    """Detect, track and annotate products in a video, and export tracking data.

    Every frame is run through the YOLO detector, detections are fed to a
    DeepSort tracker, and each confirmed track is paired with the detection it
    overlaps most (by IoU). Jackets (class 0) map directly to a product entry;
    suitcases (class 1) are further classified with ``classify_object``.
    Annotated frames are written to ``output_path`` and per-frame / per-object
    data is written to ``json_path``.

    Args:
        video_path: Path of the input video.
        output_path: Path of the annotated output video.
        json_path: Path of the JSON file receiving the tracking data.

    Returns:
        dict with the keys "metadata", "frames" (frame index -> list of
        tracked objects), "objects" (track id -> persistent details) and
        "stats".

    Raises:
        IOError: If the input video cannot be opened.
    """
    # Open video file
    cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)
    if not cap.isOpened():
        # Fail loudly instead of silently producing an empty video and JSON.
        cap.release()
        raise IOError(f"Could not open video file '{video_path}'")

    out = None
    try:
        # Get video properties
        # NOTE: int() truncates fractional frame rates (e.g. 29.97 becomes 29).
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        print(f"Video properties: {width}x{height}, {fps} FPS, {total_frames} frames")

        # Define video writer
        fourcc = cv2.VideoWriter_fourcc(*'avc1')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height), isColor=True)
        if not out.isOpened():
            # Keep going so the JSON is still produced, but tell the user why
            # the annotated video will be missing.
            print(f"Warning: could not open video writer for '{output_path}'; "
                  "annotated frames will not be saved")

        # Initialize tracker
        tracker = DeepSort(
            max_age=1,               # Misses tolerated before a track is dropped
            n_init=5,                # Frames a track stays tentative before confirmation
            nms_max_overlap=0.5,     # NMS overlap threshold
            max_cosine_distance=0.3, # Appearance (cosine) distance gate for matching
            nn_budget=100,           # Cap on stored appearance descriptors
            embedder="mobilenet",    # Using mobilenet embedder
            embedder_gpu=False       # Ensure CPU usage
        )

        # Store data for JSON output
        frame_data = {}
        object_first_seen = {}  # Track first frame an object was seen

        metadata = {
            "fps": fps,
            "width": width,
            "height": height,
            "total_frames": total_frames,
            "date_processed": time.strftime("%Y-%m-%d %H:%M:%S")
        }

        frame_id = 0
        processing_times = []

        # Track object class and product info
        object_details = {}

        # Process each frame
        while cap.isOpened():
            start_time = time.time()
            ret, frame = cap.read()

            if not ret:
                break

            # Run YOLOv8 object detection
            results = model(frame, conf=0.45, iou=0.4, device="cpu")

            # Prepare detections for tracker
            detections = []
            detection_classes = []
            detection_crops = []
            detection_confidences = []

            for result in results:
                for box, conf, cls in zip(result.boxes.xyxy, result.boxes.conf, result.boxes.cls):
                    obj_class = int(cls.item())
                    x1, y1, x2, y2 = map(int, box.tolist())
                    conf_val = conf.item()

                    # Add detection for tracking
                    detections.append([x1, y1, x2, y2, conf_val])
                    detection_classes.append(obj_class)
                    # Copy the crop now: the frame is drawn on further below.
                    detection_crops.append(frame[y1:y2, x1:x2].copy())
                    detection_confidences.append(conf_val)

            frame_data[frame_id] = []

            # Update tracker
            # NOTE(review): the tracker is not updated on frames without
            # detections, so existing tracks do not age on those frames --
            # confirm this is intended.
            if len(detections) > 0:
                # Convert detections to DeepSort format
                deep_sort_detections = []
                for det, cls, conf in zip(detections, detection_classes, detection_confidences):
                    x1, y1, x2, y2, _ = det
                    deep_sort_detections.append(
                        ((x1, y1, x2-x1, y2-y1), conf, cls)  # Convert to (x, y, w, h) format
                    )

                # Track objects
                tracks = tracker.update_tracks(deep_sort_detections, frame=frame)

                for track in tracks:
                    if not track.is_confirmed():
                        continue

                    track_id = track.track_id
                    ltrb = track.to_ltrb()
                    x1, y1, x2, y2 = map(int, ltrb)

                    # Match with original detection: pick the detection whose
                    # box overlaps the track box the most (IoU must be > 0).
                    best_match_idx = None
                    best_match_iou = 0
                    for i, det in enumerate(detections):
                        det_x1, det_y1, det_x2, det_y2 = det[:4]
                        iou = calculate_iou((x1, y1, x2, y2), (det_x1, det_y1, det_x2, det_y2))
                        if iou > best_match_iou:
                            best_match_iou = iou
                            best_match_idx = i

                    if best_match_idx is not None:
                        obj_class = detection_classes[best_match_idx]
                        cropped_img = detection_crops[best_match_idx]

                        # Get product info based on class
                        product_info = None
                        if obj_class == 0:  # Jacket
                            product_info = products["jacket"]
                        elif obj_class == 1:  # Suitcase
                            product_info = classify_object(cropped_img)

                        if product_info:
                            # Track first frame an object was seen
                            if track_id not in object_first_seen:
                                object_first_seen[track_id] = frame_id

                            # Store frame data
                            frame_data[frame_id].append({
                                "id": track_id,
                                "bbox": [x1, y1, x2, y2],
                                "class": obj_class,
                                "link": product_info["link"],
                                "name": product_info["name"],
                                "price": product_info["price"]
                            })

                            # Store persistent object details (overwritten on
                            # every sighting so "last_frame" stays current)
                            object_details[track_id] = {
                                "class": obj_class,
                                "link": product_info["link"],
                                "name": product_info["name"],
                                "price": product_info["price"],
                                "first_frame": object_first_seen[track_id],
                                "last_frame": frame_id
                            }

                            # Draw bounding boxes for visualization
                            color = get_color_for_id(track_id)
                            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                            label = f"ID:{track_id} {product_info['name']}"
                            cv2.putText(frame, label, (x1, y1 - 10),
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

            # Write frame with visualization
            out.write(frame)

            # Calculate processing time
            end_time = time.time()
            processing_times.append(end_time - start_time)

            # Print progress every 10 frames
            if frame_id % 10 == 0:
                print(f"Processed frame {frame_id}/{total_frames}, time: {end_time - start_time:.3f}s")

            frame_id += 1
    finally:
        # Always release the capture and finalize the output file, even when
        # detection or tracking raises part-way through the video.
        cap.release()
        if out is not None:
            out.release()

    # Calculate processing statistics. Guard the zero-frame case: np.mean([])
    # is NaN, which json.dump would emit as the non-standard token NaN.
    avg_processing_time = float(np.mean(processing_times)) if processing_times else 0.0
    effective_fps = 1.0 / avg_processing_time if avg_processing_time > 0 else 0

    # Prepare final JSON output
    output_data = {
        "metadata": metadata,
        "frames": frame_data,
        "objects": object_details,
        "stats": {
            "avg_processing_time": avg_processing_time,
            "effective_fps": effective_fps,
            "frames_processed": frame_id
        }
    }

    # Write output to JSON file
    with open(json_path, "w") as json_file:
        json.dump(output_data, json_file, indent=2)

    # Clean up any OpenCV windows
    cv2.destroyAllWindows()

    print(f"✅ Object detection and tracking complete. Output saved as '{output_path}'")
    print(f"✅ Tracking data saved as '{json_path}'")
    print(f"Processing stats: {effective_fps:.2f} FPS, {frame_id} frames processed")

    return output_data

def calculate_iou(box1, box2):
    """Return the Intersection over Union of two (x1, y1, x2, y2) boxes.

    The result is intersection area divided by union area, or 0 when the
    union area is not positive.
    """
    left_a, top_a, right_a, bottom_a = box1
    left_b, top_b, right_b, bottom_b = box2

    # Overlap extent along each axis, clamped at zero when the boxes are disjoint
    overlap_w = max(0, min(right_a, right_b) - max(left_a, left_b))
    overlap_h = max(0, min(bottom_a, bottom_b) - max(top_a, top_b))
    intersection = overlap_w * overlap_h

    # Union = sum of both areas minus the part counted twice
    area_a = (right_a - left_a) * (bottom_a - top_a)
    area_b = (right_b - left_b) * (bottom_b - top_b)
    union = area_a + area_b - intersection

    if union > 0:
        return intersection / union
    return 0

def get_color_for_id(track_id):
    """Map a track id to a reproducible 3-component colour tuple.

    Numeric ids and numeric strings (e.g. ``3`` and ``"3"``) yield the same
    colour. Any other id is reduced to a number with CRC-32 of its string
    form, so the colour is identical across interpreter runs (the built-in
    ``hash()`` of a string is salted per process and is not).

    Args:
        track_id: Track identifier; an int, a numeric string, or any object
            with a string representation.

    Returns:
        Tuple ``(r, g, b)`` of ints, each in the range 0..254.
    """
    import zlib  # local import keeps this block self-contained

    try:
        # int() accepts real numbers as well as numeric strings such as "7"
        track_numeric = int(track_id)
    except (ValueError, TypeError, OverflowError):
        # Non-numeric id: fall back to a checksum that is stable across runs
        track_numeric = zlib.crc32(str(track_id).encode("utf-8"))

    # Ensure positive value
    track_numeric = abs(track_numeric)

    # Generate color components
    r = (track_numeric * 50) % 255
    g = (track_numeric * 100) % 255
    b = (track_numeric * 150) % 255

    return (r, g, b)

# Entry point: run the full detection/tracking pipeline on the sample ad video.
# In a notebook kernel __name__ is "__main__", so this executes when the cell runs.
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific paths -- consider a configurable base directory.
    input_video = "/home/mcw/Karthick/shopable-ad2.0/ad-video/ad.mp4"
    # Relative path: the annotated video lands in the current working directory.
    output_video = "processed_output.mp4"
    json_output = "/home/mcw/Karthick/shopable-ad2.0/data/tracked_id.json"
    
    # The returned output dict is discarded here; results are read from the files.
    process_video(input_video, output_video, json_output)

Video properties: 1920x1080, 25 FPS, 500 frames

0: 384x640 1 jacket, 1 suitcase, 34.0ms
Speed: 1.3ms preprocess, 34.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
Processed frame 0/500, time: 0.144s

0: 384x640 1 jacket, 1 suitcase, 33.9ms
Speed: 1.5ms preprocess, 33.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 jacket, 1 suitcase, 33.0ms
Speed: 1.5ms preprocess, 33.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 jacket, 1 suitcase, 33.0ms
Speed: 2.1ms preprocess, 33.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 jacket, 1 suitcase, 33.3ms
Speed: 2.5ms preprocess, 33.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 jacket, 1 suitcase, 32.7ms
Speed: 2.9ms preprocess, 32.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 jacket, 1 suitcase, 32.6ms
Speed: 1.5ms preprocess, 32.6ms inference, 1.3m

In [14]:
from ultralytics import YOLO

# Load your trained YOLOv8 model
# NOTE: this rebinds the notebook-global `model` used by process_video();
# unlike the first cell, the model is not explicitly moved to the CPU here.
model = YOLO("/home/mcw/Karthick/shopable-ad2.0/yolo_model/best_shop_mdl.pt")  # Change to your model path

# Print model architecture (the full module tree; output is long)
print(model)

# Print class names and number of classes
print("\nModel Classes:")

# model.names maps class id -> class name
for class_id, class_name in model.names.items():
    print(f"Class ID: {class_id}, Class Name: {class_name}")

# Print additional model details
print("\nModel Summary:")
# Left as the last expression so the notebook displays its return value.
model.info()


YOLO(
  (model): SegmentationModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_runnin

(151, 3264006, 0, 12.110387199999998)