In [1]:
import torch
print(torch.cuda.is_available())  # Should return True if CUDA is available
if torch.cuda.is_available():
    print(torch.cuda.current_device())  # Should return the current device index
    print(torch.cuda.get_device_name(0))  # Should return the name of the GPU

True
0
NVIDIA GeForce RTX 3050 Laptop GPU


In [2]:
import cv2
import yaml
import torch
from ultralytics import YOLO
import numpy as np
from IPython.display import display, clear_output
from tkinter import Tk, filedialog

In [12]:
import cv2
import yaml
import torch
from ultralytics import YOLO
import numpy as np
from IPython.display import display, clear_output
from tkinter import Tk, filedialog

def preprocess_frame(frame, target_size=(640, 640)):
    h, w, _ = frame.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w, new_h = int(w * scale), int(h * scale)
    resized_frame = cv2.resize(frame, (new_w, new_h))
    pad_w = (target_size[1] - new_w) // 2
    pad_h = (target_size[0] - new_h) // 2
    padded_frame = cv2.copyMakeBorder(resized_frame, pad_h, pad_h, pad_w, pad_w, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    rgb_frame = cv2.cvtColor(padded_frame, cv2.COLOR_BGR2RGB)
    normalized_frame = rgb_frame / 255.0
    return normalized_frame

def extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("CUDA Available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("Device:", torch.cuda.current_device())
        print("GPU Name:", torch.cuda.get_device_name(0))

    detection_model = YOLO(detection_model_path).to(device)
    segmentation_model = YOLO(segmentation_model_path).to(device)
    pose_model = YOLO(pose_model_path).to(device)

    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    video_data = {
        'video_metadata': {
            'path': video_path,
            'fps': cap.get(cv2.CAP_PROP_FPS),
            'frame_count': int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        },
        'frames': []
    }

    frame_skip = 2

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % frame_skip != 0:
            frame_count += 1
            continue

        preprocessed_frame = preprocess_frame(frame)
        preprocessed_frame_tensor = torch.tensor(preprocessed_frame, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0).to(device)

        detection_results = detection_model(preprocessed_frame_tensor)
        segmentation_results = segmentation_model(preprocessed_frame_tensor)
        pose_results = pose_model(preprocessed_frame_tensor)

        frame_data = {
            'frame_number': frame_count,
            'timestamp': frame_count / video_data['video_metadata']['fps'],
            'features': {
                'num_humans': 0,
                'num_weapons': 0,
                'avg_confidence': 0,
                'bounding_boxes': [],
                'keypoints': [],
                'segmented_areas': [],
                'segmentation_confidence': []
            },
            'label': label
        }

        total_confidence = 0
        for result in detection_results:
            if hasattr(result, 'names'):
                for box in result.boxes:
                    try:
                        cls = result.names[int(box.cls[0])]
                        conf = box.conf[0]
                        x1, y1, x2, y2 = box.xyxy[0]

                        if cls == 'person':
                            frame_data['features']['num_humans'] += 1
                        elif cls == 'weapon':  
                            frame_data['features']['num_weapons'] += 1

                        frame_data['features']['bounding_boxes'].append([float(x1), float(y1), float(x2), float(y2)])
                        total_confidence += float(conf)

                        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 2)
                        cv2.putText(frame, f'{cls} {conf:.2f}', (int(x1), int(y1) - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
                    except KeyError as e:
                        print(f"KeyError: {e} - Class index {box.cls[0]} not found in result.names")
            else:
                print("Result does not have 'names' attribute.")

        if len(detection_results) > 0:
            frame_data['features']['avg_confidence'] = total_confidence / len(detection_results)

        for result in segmentation_results:
            if result.masks is not None:
                for mask in result.masks:
                    # Ensure mask is a NumPy array
                    if isinstance(mask, torch.Tensor):
                        mask = mask.cpu().numpy()

                    # Check if mask is a single-channel image
                    if len(mask.shape) == 3 and mask.shape[2] == 1:
                        mask = mask[:, :, 0]

                    # Debugging information
                    print("Mask type:", type(mask))
                    print("Mask shape:", mask.shape)

                    try:
                        area = cv2.countNonZero(mask)
                        frame_data['features']['segmented_areas'].append(area)
                        frame_data['features']['segmentation_confidence'].append(mask.confidence)
                    except Exception as e:
                        print(f"Error processing mask: {e}")
            else:
                print("No masks found for this frame.")

        for result in pose_results:
            if result.keypoints is not None and len(result.keypoints.xy) > 0:
                for keypoint in result.keypoints.xy:
                    if len(keypoint) >= 3:
                        x, y = keypoint[0], keypoint[1]
                        confidence = keypoint[2]
                    else:
                        x, y, confidence = None, None, None
                    frame_data['features']['keypoints'].append([x, y, confidence])

        video_data['frames'].append(frame_data)

        clear_output(wait=True)
        display(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        frame_count += 1

    cap.release()

    with open(output_yaml, 'w') as file:
        yaml.dump(video_data, file, default_flow_style=False)

if __name__ == "__main__":
    root = Tk()
    root.withdraw()
    video_path = filedialog.askopenfilename(title="Select Video File", filetypes=[("MP4 Files", "*.mp4"), ("AVI Files", "*.avi")])

    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    output_yaml = r'C:\Users\harme\Desktop\video-detect-gpu\video_features.yaml'
    label = 1

    if video_path:
        extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label)
    else:
        print("No video file selected.")

array([[[ 39,  40,  52],
        [ 39,  40,  52],
        [ 39,  40,  54],
        ...,
        [188, 207, 217],
        [188, 207, 217],
        [189, 208, 218]],

       [[ 39,  40,  52],
        [ 39,  40,  52],
        [ 39,  40,  54],
        ...,
        [188, 207, 217],
        [188, 207, 217],
        [189, 208, 218]],

       [[ 39,  40,  52],
        [ 39,  40,  52],
        [ 39,  40,  54],
        ...,
        [188, 207, 217],
        [188, 207, 217],
        [189, 208, 218]],

       ...,

       [[175, 139, 108],
        [175, 139, 108],
        [176, 140, 109],
        ...,
        [161, 176, 181],
        [162, 177, 182],
        [162, 177, 182]],

       [[171, 138, 106],
        [171, 138, 106],
        [172, 139, 107],
        ...,
        [161, 176, 181],
        [161, 176, 181],
        [161, 176, 181]],

       [[170, 137, 105],
        [170, 137, 105],
        [171, 138, 106],
        ...,
        [161, 176, 181],
        [161, 176, 181],
        [161, 176, 181]]

In [18]:
import cv2
import yaml
import torch
from ultralytics import YOLO
import numpy as np
from IPython.display import display, clear_output
from tkinter import Tk, filedialog

def preprocess_frame(frame, target_size=(640, 640)):
    """
    Preprocesses the input frame by resizing, padding, and normalizing.
    """
    h, w, _ = frame.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w, new_h = int(w * scale), int(h * scale)
    resized_frame = cv2.resize(frame, (new_w, new_h))
    pad_w = (target_size[1] - new_w) // 2
    pad_h = (target_size[0] - new_h) // 2
    padded_frame = cv2.copyMakeBorder(
        resized_frame, pad_h, pad_h, pad_w, pad_w, cv2.BORDER_CONSTANT, value=(0, 0, 0)
    )
    rgb_frame = cv2.cvtColor(padded_frame, cv2.COLOR_BGR2RGB)
    normalized_frame = rgb_frame / 255.0
    return normalized_frame

def extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label):
    """
    Extracts features from a video using YOLO models for detection, segmentation, and pose estimation.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("CUDA Available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("Device:", torch.cuda.current_device())
        print("GPU Name:", torch.cuda.get_device_name(0))

    # Load YOLO models
    detection_model = YOLO(detection_model_path).to(device)
    segmentation_model = YOLO(segmentation_model_path).to(device)
    pose_model = YOLO(pose_model_path).to(device)

    # Open video file
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    video_data = {
        'video_metadata': {
            'path': video_path,
            'fps': cap.get(cv2.CAP_PROP_FPS),
            'frame_count': int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        },
        'frames': []
    }

    frame_skip = 2  # Process every 2nd frame

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % frame_skip != 0:
            frame_count += 1
            continue

        preprocessed_frame = preprocess_frame(frame)
        preprocessed_frame_tensor = torch.tensor(preprocessed_frame, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0).to(device)

        # Perform inference with YOLO models
        detection_results = detection_model(preprocessed_frame_tensor, verbose=False)
        segmentation_results = segmentation_model(preprocessed_frame_tensor, verbose=False)
        pose_results = pose_model(preprocessed_frame_tensor, verbose=False)

        frame_data = {
            'frame_number': frame_count,
            'timestamp': frame_count / video_data['video_metadata']['fps'],
            'features': {
                'num_humans': 0,
                'num_weapons': 0,
                'avg_confidence': 0,
                'bounding_boxes': [],
                'keypoints': [],
                'segmented_areas': [],
                'shape_descriptors': [],
                'interaction_scores': [],
                'color_histograms': [],
                'movement_vectors': [],
                'segmentation_confidence': []
            },
            'label': label
        }

        total_confidence = 0
        detection_confidences = []

        # Process Detection Results
        for result in detection_results:
            if hasattr(result, 'names'):
                for box in result.boxes:
                    try:
                        cls = result.names[int(box.cls[0])]
                        conf = float(box.conf[0])
                        x1, y1, x2, y2 = map(float, box.xyxy[0])

                        if cls == 'person':
                            frame_data['features']['num_humans'] += 1
                        elif cls == 'weapon':
                            frame_data['features']['num_weapons'] += 1

                        frame_data['features']['bounding_boxes'].append([x1, y1, x2, y2])
                        total_confidence += conf
                        detection_confidences.append(conf)

                        # Draw bounding box
                        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 2)
                        cv2.putText(frame, f'{cls} {conf:.2f}', (int(x1), int(y1) - 5),
                                  cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
                    except KeyError as e:
                        print(f"KeyError: {e} - Class index {box.cls[0]} not found in result.names")

        if detection_confidences:
            frame_data['features']['avg_confidence'] = sum(detection_confidences) / len(detection_confidences)

        # Process Segmentation Results
        for result in segmentation_results:
            if result.masks is not None and hasattr(result.masks, 'data') and result.masks.data is not None:
                masks = result.masks.data
                for idx, mask_tensor in enumerate(masks):
                    mask_np = mask_tensor.cpu().numpy()
                    if mask_np.ndim == 3 and mask_np.shape[0] == 1:
                        mask_np = mask_np[0]
                    mask_binary = (mask_np > 0.5).astype(np.uint8)
                    try:
                        area = cv2.countNonZero(mask_binary)
                        frame_data['features']['segmented_areas'].append(int(area))
                        if idx < len(detection_confidences):
                            frame_data['features']['segmentation_confidence'].append(detection_confidences[idx])
                        else:
                            frame_data['features']['segmentation_confidence'].append(None)
                    except Exception as e:
                        print(f"Error processing segmentation: {e}")
                        frame_data['features']['segmented_areas'].append(0)
                        frame_data['features']['segmentation_confidence'].append(None)

        # Process Pose Results
        for result in pose_results:
            if result.keypoints is not None and hasattr(result.keypoints, 'xy') and len(result.keypoints.xy) > 0:
                for keypoint_set in result.keypoints.xy:
                    keypoints_list = []
                    for i in range(len(keypoint_set)):
                        try:
                            keypoint = keypoint_set[i]
                            if torch.is_tensor(keypoint):
                                x = keypoint[0].item() if keypoint.shape[0] > 0 else None
                                y = keypoint[1].item() if keypoint.shape[0] > 1 else None
                                conf = keypoint[2].item() if keypoint.shape[0] > 2 else None
                            else:
                                x = float(keypoint[0]) if len(keypoint) > 0 else None
                                y = float(keypoint[1]) if len(keypoint) > 1 else None
                                conf = float(keypoint[2]) if len(keypoint) > 2 else None
                            keypoints_list.append([x, y, conf])
                        except (IndexError, AttributeError) as e:
                            keypoints_list.append([None, None, None])
                    frame_data['features']['keypoints'].append(keypoints_list)

        # Append frame data and display
        video_data['frames'].append(frame_data)
        cv2.imshow('Detection', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        frame_count += 1

    # Cleanup
    cap.release()
    cv2.destroyAllWindows()

    # Save features to YAML
    with open(output_yaml, 'w') as file:
        yaml.dump(video_data, file, default_flow_style=False)

if __name__ == "__main__":
    # Initialize Tkinter and hide root window
    root = Tk()
    root.withdraw()

    # Select video file
    video_path = filedialog.askopenfilename(
        title="Select Video File",
        filetypes=[("MP4 Files", "*.mp4"), ("AVI Files", "*.avi")]
    )

    # Define paths
    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    output_yaml = r'C:\Users\harme\Desktop\video-detect-gpu\video_features.yaml'
    label = 1

    if video_path:
        extract_features_from_video(
            video_path,
            detection_model_path,
            segmentation_model_path,
            pose_model_path,
            output_yaml,
            label
        )
    else:
        print("No video file selected.")

CUDA Available: True
Device: 0
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [20]:
import cv2
import yaml
import torch
from ultralytics import YOLO
import numpy as np
from IPython.display import display, clear_output
from tkinter import Tk, filedialog

# [Previous preprocess_frame function remains the same]
def preprocess_frame(frame, target_size=(640, 640)):
    """
    Preprocesses the input frame by resizing, padding, and normalizing.
    """
    h, w, _ = frame.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w, new_h = int(w * scale), int(h * scale)
    resized_frame = cv2.resize(frame, (new_w, new_h))
    pad_w = (target_size[1] - new_w) // 2
    pad_h = (target_size[0] - new_h) // 2
    padded_frame = cv2.copyMakeBorder(
        resized_frame, pad_h, pad_h, pad_w, pad_w, cv2.BORDER_CONSTANT, value=(0, 0, 0)
    )
    rgb_frame = cv2.cvtColor(padded_frame, cv2.COLOR_BGR2RGB)
    normalized_frame = rgb_frame / 255.0
    return normalized_frame

def convert_tensor_to_serializable(obj):
    """
    Converts tensor objects to serializable Python types.
    """
    if torch.is_tensor(obj):
        return obj.cpu().numpy().tolist()
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_tensor_to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_tensor_to_serializable(item) for item in obj]
    return obj

def extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("CUDA Available:", torch.cuda.is_available())
    
    # Load YOLO models
    detection_model = YOLO(detection_model_path).to(device)
    segmentation_model = YOLO(segmentation_model_path).to(device)
    pose_model = YOLO(pose_model_path).to(device)

    # Open video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video file")
        return

    frame_count = 0
    video_data = {
        'video_metadata': {
            'path': str(video_path),
            'fps': float(cap.get(cv2.CAP_PROP_FPS)),
            'frame_count': int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        },
        'frames': []
    }

    frame_skip = 2

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_skip != 0:
                frame_count += 1
                continue

            print(f"Processing frame {frame_count}")  # Debug print

            preprocessed_frame = preprocess_frame(frame)
            preprocessed_frame_tensor = torch.tensor(preprocessed_frame, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0).to(device)

            # Perform inference
            detection_results = detection_model(preprocessed_frame_tensor, verbose=False)
            segmentation_results = segmentation_model(preprocessed_frame_tensor, verbose=False)
            pose_results = pose_model(preprocessed_frame_tensor, verbose=False)

            frame_data = {
                'frame_number': int(frame_count),
                'timestamp': float(frame_count / video_data['video_metadata']['fps']),
                'features': {
                    'num_humans': 0,
                    'num_weapons': 0,
                    'avg_confidence': 0.0,
                    'bounding_boxes': [],
                    'keypoints': [],
                    'segmented_areas': [],
                    'segmentation_confidence': []
                },
                'label': int(label)
            }

            # Process detections
            detection_confidences = []
            for result in detection_results:
                if hasattr(result, 'boxes'):
                    for box in result.boxes:
                        try:
                            cls = int(box.cls[0])
                            conf = float(box.conf[0])
                            xyxy = box.xyxy[0].cpu().numpy().tolist()

                            if cls == 0:  # Assuming 0 is person class
                                frame_data['features']['num_humans'] += 1
                            
                            frame_data['features']['bounding_boxes'].append({
                                'class': cls,
                                'confidence': conf,
                                'coordinates': xyxy
                            })
                            detection_confidences.append(conf)

                        except Exception as e:
                            print(f"Error processing detection: {e}")

            # Calculate average confidence
            if detection_confidences:
                frame_data['features']['avg_confidence'] = float(sum(detection_confidences) / len(detection_confidences))

            # Process segmentation
            for result in segmentation_results:
                if result.masks is not None:
                    for mask in result.masks.data:
                        try:
                            mask_np = mask.cpu().numpy()
                            area = float(np.sum(mask_np > 0.5))
                            frame_data['features']['segmented_areas'].append(area)
                        except Exception as e:
                            print(f"Error processing segmentation: {e}")

            # Process pose
            for result in pose_results:
                if result.keypoints is not None:
                    try:
                        keypoints = result.keypoints.xy
                        keypoints_data = convert_tensor_to_serializable(keypoints)
                        frame_data['features']['keypoints'].extend(keypoints_data)
                    except Exception as e:
                        print(f"Error processing pose: {e}")

            # Append frame data
            video_data['frames'].append(frame_data)

            # Display progress
            if frame_count % 10 == 0:
                print(f"Processed {frame_count} frames")

            # Display frame
            cv2.imshow('Detection', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            frame_count += 1

    except Exception as e:
        print(f"Error during processing: {e}")

    finally:
        # Cleanup
        cap.release()
        cv2.destroyAllWindows()

        # Convert all data to serializable format
        video_data = convert_tensor_to_serializable(video_data)

        # Save to YAML file
        try:
            print(f"Saving data to {output_yaml}")
            with open(output_yaml, 'w') as file:
                yaml.dump(video_data, file, default_flow_style=False)
            print("Data saved successfully")
        except Exception as e:
            print(f"Error saving YAML file: {e}")

if __name__ == "__main__":
    # Initialize Tkinter
    root = Tk()
    root.withdraw()

    # Get video file
    video_path = filedialog.askopenfilename(
        title="Select Video File",
        filetypes=[("Video Files", "*.mp4;*.avi")]
    )

    if not video_path:
        print("No video file selected")
        exit()

    # Define paths
    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    output_yaml = r'C:\Users\harme\Desktop\video-detect-gpu\video_features.yaml'
    label = 1

    # Run feature extraction
    extract_features_from_video(
        video_path,
        detection_model_path,
        segmentation_model_path,
        pose_model_path,
        output_yaml,
        label
    )

CUDA Available: True
Processing frame 0
Processed 0 frames
Processing frame 2
Processing frame 4
Processing frame 6
Processing frame 8
Processing frame 10
Processed 10 frames
Processing frame 12
Processing frame 14
Processing frame 16
Processing frame 18
Processing frame 20
Processed 20 frames
Processing frame 22
Processing frame 24
Processing frame 26
Processing frame 28
Processing frame 30
Processed 30 frames
Processing frame 32
Processing frame 34
Processing frame 36
Processing frame 38
Processing frame 40
Processed 40 frames
Processing frame 42
Processing frame 44
Processing frame 46
Processing frame 48
Processing frame 50
Processed 50 frames
Processing frame 52
Processing frame 54
Processing frame 56
Processing frame 58
Processing frame 60
Processed 60 frames
Processing frame 62
Processing frame 64
Saving data to C:\Users\harme\Desktop\video-detect-gpu\video_features.yaml
Data saved successfully


In [23]:
import cv2
import yaml
import torch
from ultralytics import YOLO
import numpy as np
from IPython.display import display, clear_output
from tkinter import Tk, filedialog

def preprocess_frame(frame, target_size=(640, 640)):
    """
    Preprocesses the input frame by resizing, padding, and normalizing.
    """
    h, w, _ = frame.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w, new_h = int(w * scale), int(h * scale)
    resized_frame = cv2.resize(frame, (new_w, new_h))
    pad_w = (target_size[1] - new_w) // 2
    pad_h = (target_size[0] - new_h) // 2
    padded_frame = cv2.copyMakeBorder(
        resized_frame, pad_h, pad_h, pad_w, pad_w, cv2.BORDER_CONSTANT, value=(0, 0, 0)
    )
    rgb_frame = cv2.cvtColor(padded_frame, cv2.COLOR_BGR2RGB)
    normalized_frame = rgb_frame / 255.0
    return normalized_frame

def draw_detections(frame, detections, color=(0, 255, 0)):
    """
    Draw bounding boxes and labels on the frame
    """
    annotated_frame = frame.copy()
    for det in detections:
        if hasattr(det, 'boxes'):
            for box in det.boxes:
                # Get box coordinates
                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                
                # Get confidence and class
                conf = float(box.conf[0])
                cls = int(box.cls[0])
                
                # Draw rectangle
                cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), color, 2)
                
                # Add label
                label = f'Class {cls}: {conf:.2f}'
                cv2.putText(annotated_frame, label, (x1, y1 - 10),
                          cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    
    return annotated_frame

def draw_poses(frame, poses, color=(255, 0, 0)):
    """
    Draw pose keypoints and connections on the frame
    """
    annotated_frame = frame.copy()
    if poses is not None:
        for pose in poses:
            if pose.keypoints is not None:
                keypoints = pose.keypoints[0].cpu().numpy()
                for kp in keypoints:
                    x, y = map(int, kp[:2])
                    cv2.circle(annotated_frame, (x, y), 4, color, -1)
                
                # Draw connections (you can customize these based on your needs)
                # Example: connecting shoulders to hips
                if len(keypoints) >= 6:  # Assuming basic body keypoints are present
                    # Connect shoulders
                    cv2.line(annotated_frame, 
                            tuple(map(int, keypoints[5][:2])),
                            tuple(map(int, keypoints[6][:2])),
                            color, 2)
    
    return annotated_frame

def extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load models
    detection_model = YOLO(detection_model_path)
    segmentation_model = YOLO(segmentation_model_path)
    pose_model = YOLO(pose_model_path)

    # Open video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video file")
        return

    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Create video writer
    output_video_path = video_path.rsplit('.', 1)[0] + '_annotated.mp4'
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    video_data = {
        'video_metadata': {
            'path': str(video_path),
            'fps': float(fps),
            'frame_count': int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        },
        'frames': []
    }

    frame_count = 0
    
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Create a copy for annotation
            display_frame = frame.copy()

            # Process frame data
            frame_data = {
                'frame_number': frame_count,
                'timestamp': frame_count / fps,
                'detections': [],
                'poses': [],
                'segments': []
            }

            # Run detections
            detections = detection_model(frame)
            poses = pose_model(frame)
            segments = segmentation_model(frame)

            # Process detections
            for detection in detections:
                if hasattr(detection, 'boxes'):
                    for box in detection.boxes:
                        det_data = {
                            'bbox': box.xyxy[0].cpu().numpy().tolist(),
                            'confidence': float(box.conf[0]),
                            'class': int(box.cls[0])
                        }
                        frame_data['detections'].append(det_data)

                        # Draw detection
                        x1, y1, x2, y2 = map(int, det_data['bbox'])
                        cv2.rectangle(display_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                        label = f"Class {det_data['class']}: {det_data['confidence']:.2f}"
                        cv2.putText(display_frame, label, (x1, y1-10), 
                                  cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Process poses
            if poses:
                for pose in poses:
                    if pose.keypoints is not None:
                        kpts = pose.keypoints.data[0].cpu().numpy()
                        pose_data = {
                            'keypoints': kpts.tolist()
                        }
                        frame_data['poses'].append(pose_data)

                        # Draw pose keypoints
                        for kpt in kpts:
                            x, y = map(int, kpt[:2])
                            cv2.circle(display_frame, (x, y), 4, (255, 0, 0), -1)

            # Process segmentation
            if segments:
                for segment in segments:
                    if segment.masks is not None:
                        masks = segment.masks.data.cpu().numpy()
                        for mask in masks:
                            mask = (mask > 0.5).astype(np.uint8) * 255
                            colored_mask = cv2.applyColorMap(mask, cv2.COLORMAP_JET)
                            display_frame = cv2.addWeighted(display_frame, 0.7, colored_mask, 0.3, 0)

            # Add frame data to video data
            video_data['frames'].append(frame_data)

            # Display and save frame
            cv2.imshow('Detections', display_frame)
            out.write(display_frame)

            # Process key events
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
            elif key == ord('p'):
                cv2.waitKey(0)

            frame_count += 1
            if frame_count % 30 == 0:
                print(f"Processed {frame_count} frames")

    except Exception as e:
        print(f"Error during processing: {e}")
        raise  # This will show the full error traceback

    finally:
        # Cleanup
        cap.release()
        out.release()
        cv2.destroyAllWindows()

        # Save features to YAML
        try:
            with open(output_yaml, 'w') as file:
                yaml.dump(video_data, file, default_flow_style=False)
            print(f"Features saved to {output_yaml}")
            print(f"Annotated video saved to {output_video_path}")
        except Exception as e:
            print(f"Error saving data: {e}")
            
    # Define paths
    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    output_yaml = r'C:\Users\harme\Desktop\video-detect-gpu\video_features.yaml'
    label = 1

    # Run feature extraction
    extract_features_from_video(
        video_path,
        detection_model_path,
        segmentation_model_path,
        pose_model_path,
        output_yaml,
        label
    )

In [10]:
import cv2
import yaml
import torch
from ultralytics import YOLO
import numpy as np
from IPython.display import display, clear_output
from tkinter import Tk, filedialog


def preprocess_frame(frame, target_size=(640, 640)):
    h, w, _ = frame.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w, new_h = int(w * scale), int(h * scale)
    resized_frame = cv2.resize(frame, (new_w, new_h))
    pad_w = (target_size[1] - new_w) // 2
    pad_h = (target_size[0] - new_h) // 2
    padded_frame = cv2.copyMakeBorder(resized_frame, pad_h, pad_h, pad_w, pad_w, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    rgb_frame = cv2.cvtColor(padded_frame, cv2.COLOR_BGR2RGB)
    normalized_frame = rgb_frame / 255.0
    return normalized_frame


def extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("CUDA Available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("Device:", torch.cuda.current_device())
        print("GPU Name:", torch.cuda.get_device_name(0))

    detection_model = YOLO(detection_model_path).to(device)
    segmentation_model = YOLO(segmentation_model_path).to(device)
    pose_model = YOLO(pose_model_path).to(device)

    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    video_data = {
        'video_metadata': {
            'path': video_path,
            'fps': cap.get(cv2.CAP_PROP_FPS),
            'frame_count': int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        },
        'frames': []
    }

    frame_skip = 2

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % frame_skip != 0:
            frame_count += 1
            continue

        preprocessed_frame = preprocess_frame(frame)
        preprocessed_frame_tensor = torch.tensor(preprocessed_frame, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0).to(device)

        detection_results = detection_model(preprocessed_frame_tensor)
        segmentation_results = segmentation_model(preprocessed_frame_tensor)
        pose_results = pose_model(preprocessed_frame_tensor)

        frame_data = {
            'frame_number': frame_count,
            'timestamp': frame_count / video_data['video_metadata']['fps'],
            'features': {
                'num_humans': 0,
                'num_weapons': 0,
                'avg_confidence': 0,
                'bounding_boxes': [],
                'keypoints': [],
                'segmented_areas': [],
                'segmentation_confidence': []
            },
            'label': label
        }

        total_confidence = 0
        for result in detection_results:
            for box in result.boxes:
                cls = result.names.get(int(box.cls[0].item()), "Unknown")
                conf = box.conf[0]
                x1, y1, x2, y2 = box.xyxy[0]

                if cls == 'person':
                    frame_data['features']['num_humans'] += 1
                elif cls == 'weapon':  
                    frame_data['features']['num_weapons'] += 1

                frame_data['features']['bounding_boxes'].append([float(x1), float(y1), float(x2), float(y2)])
                total_confidence += float(conf)

                cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 2)
                cv2.putText(frame, f'{cls} {conf:.2f}', (int(x1), int(y1) - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        if len(detection_results) > 0:
            frame_data['features']['avg_confidence'] = total_confidence / len(detection_results)

        for result in segmentation_results:
            if result.masks is not None:
                for mask in result.masks:
                    area = cv2.countNonZero(mask)
                    frame_data['features']['segmented_areas'].append(area)
                    frame_data['features']['segmentation_confidence'].append(mask.confidence)
            else:
                print("No masks found for this frame.")

        for result in pose_results:
            if result.keypoints is not None and len(result.keypoints.xy) > 0:
                for keypoint in result.keypoints.xy:
                    if len(keypoint) >= 3:
                        x, y = keypoint[0], keypoint[1]
                        confidence = keypoint[2]
                    else:
                        x, y, confidence = None, None, None
                    frame_data['features']['keypoints'].append([x, y, confidence])

        video_data['frames'].append(frame_data)

        clear_output(wait=True)
        display(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        frame_count += 1

    cap.release()

    with open(output_yaml, 'w') as file:
        yaml.dump(video_data, file, default_flow_style=False)

if __name__ == "__main__":
    root = Tk()
    root.withdraw()
    video_path = filedialog.askopenfilename(title="Select Video File", filetypes=[("MP4 Files", "*.mp4"), ("AVI Files", "*.avi")])

    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    output_yaml = r'C:\Users\harme\Desktop\violence detection\video_features.yaml'
    label = 1

    if video_path:
        extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label)
    else:
        print("No video file selected.")


CUDA Available: True
Device: 0
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU

0: 640x640 2 persons, 4 benchs, 3 backpacks, 1 chair, 1 dining table, 27.5ms
Speed: 0.0ms preprocess, 27.5ms inference, 4.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 4 benchs, 3 backpacks, 1 chair, 1 dining table, 32.2ms
Speed: 0.0ms preprocess, 32.2ms inference, 8.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 31.1ms
Speed: 0.0ms preprocess, 31.1ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 640)


error: OpenCV(4.11.0) :-1: error: (-5:Bad argument) in function 'countNonZero'
> Overload resolution failed:
>  - src is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'src'


In [None]:


# Function to preprocess video frame
def preprocess_frame(frame, target_size=(640, 640)):
    h, w, _ = frame.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w, new_h = int(w * scale), int(h * scale)
    resized_frame = cv2.resize(frame, (new_w, new_h))

    pad_w = (target_size[1] - new_w) // 2
    pad_h = (target_size[0] - new_h) // 2
    padded_frame = cv2.copyMakeBorder(resized_frame, pad_h, pad_h, pad_w, pad_w, cv2.BORDER_CONSTANT, value=(0, 0, 0))

    rgb_frame = cv2.cvtColor(padded_frame, cv2.COLOR_BGR2RGB)
    normalized_frame = rgb_frame / 255.0

    return normalized_frame

# Main function for feature extraction
def extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label):
    detection_model = YOLO(detection_model_path).to('cuda')
    segmentation_model = YOLO(segmentation_model_path).to('cuda')
    pose_model = YOLO(pose_model_path).to('cuda')

    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    video_data = {
        'video_metadata': {
            'path': video_path,
            'fps': cap.get(cv2.CAP_PROP_FPS),
            'frame_count': int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        },
        'frames': []
    }

    frame_skip = 2  # Process every 2nd frame

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % frame_skip != 0:
            frame_count += 1
            continue

        preprocessed_frame = preprocess_frame(frame)
        preprocessed_frame_tensor = torch.tensor(preprocessed_frame).to('cuda')

        detection_results = detection_model(preprocessed_frame_tensor)
        segmentation_results = segmentation_model(preprocessed_frame_tensor)
        pose_results = pose_model(preprocessed_frame_tensor)

        frame_data = {
            'frame_number': frame_count,
            'timestamp': frame_count / video_data['video_metadata']['fps'],
            'features': {
                'num_humans': 0,
                'num_weapons': 0,
                'avg_confidence': 0,
                'bounding_boxes': [],
                'keypoints': [],
                'segmented_areas': [],
                'segmentation_confidence': []
            },
            'label': label
        }

        total_confidence = 0
        for result in detection_results:
            for box in result.boxes:
                cls = result.names[box.cls[0]]
                conf = box.conf[0]
                x1, y1, x2, y2 = box.xyxy[0]

                if cls == 'person':
                    frame_data['features']['num_humans'] += 1
                elif cls == 'weapon':
                    frame_data['features']['num_weapons'] += 1

                frame_data['features']['bounding_boxes'].append([float(x1), float(y1), float(x2), float(y2)])
                total_confidence += float(conf)

                cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 2)
                cv2.putText(frame, f'{cls} {conf:.2f}', (int(x1), int(y1) - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        if len(detection_results) > 0:
            frame_data['features']['avg_confidence'] = total_confidence / len(detection_results)

        for result in segmentation_results:
            if result.masks is not None:
                for mask in result.masks:
                    area = cv2.countNonZero(mask)
                    frame_data['features']['segmented_areas'].append(area)
                    frame_data['features']['segmentation_confidence'].append(mask.confidence)

        for result in pose_results:
            if result.keypoints is not None and len(result.keypoints.xy) > 0:
                for keypoint in result.keypoints.xy:
                    if len(keypoint) >= 3:
                        x, y = keypoint[0], keypoint[1]
                        confidence = keypoint[2]
                    else:
                        x, y, confidence = None, None, None
                    frame_data['features']['keypoints'].append([x, y, confidence])

        video_data['frames'].append(frame_data)
        cv2.imshow('Detections', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        frame_count += 1

    cap.release()
    cv2.destroyAllWindows()

    with open(output_yaml, 'w') as file:
        yaml.dump(video_data, file, default_flow_style=False)

video_path = r'C:\Users\harme\Desktop\violence detection\data\fi21_xvid.avi'
detection_model_path = r'C:\Users\harme\Desktop\video-detect-gpu\yolo11m.pt'
segmentation_model_path = r'C:\Users\harme\Desktop\vide0-detect-gpu\yolo11m-seg.pt'
pose_model_path = r'C:\Users\harme\Desktop\violence-detect-gpu\yolo11m-pose.pt'
output_yaml = r'C:\Users\harme\Desktop\video-detect-gpu\video_features.yaml'
label = 1

extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label)

In [24]:
import cv2
import yaml
import torch
from ultralytics import YOLO
import numpy as np
from IPython.display import display, clear_output
from tkinter import Tk, filedialog

def preprocess_frame(frame, target_size=(640, 640)):
    """
    Preprocesses the input frame by resizing, padding, and normalizing.
    """
    h, w, _ = frame.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w, new_h = int(w * scale), int(h * scale)
    resized_frame = cv2.resize(frame, (new_w, new_h))
    pad_w = (target_size[1] - new_w) // 2
    pad_h = (target_size[0] - new_h) // 2
    padded_frame = cv2.copyMakeBorder(
        resized_frame, pad_h, pad_h, pad_w, pad_w, cv2.BORDER_CONSTANT, value=(0, 0, 0)
    )
    rgb_frame = cv2.cvtColor(padded_frame, cv2.COLOR_BGR2RGB)
    normalized_frame = rgb_frame / 255.0
    return normalized_frame

def draw_detections(frame, boxes, scores, class_ids, class_names):
    """
    Draw bounding boxes and labels on the frame
    """
    for box, score, class_id in zip(boxes, scores, class_ids):
        x1, y1, x2, y2 = map(int, box)
        class_name = class_names.get(class_id, f"Class {class_id}")
        
        # Draw rectangle
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        
        # Draw label
        label = f'{class_name} {score:.2f}'
        (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
        cv2.rectangle(frame, (x1, y1 - label_height - 10), (x1 + label_width, y1), (0, 255, 0), -1)
        cv2.putText(frame, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
    
    return frame

def draw_keypoints(frame, keypoints, confidence_threshold=0.5):
    """
    Draw pose keypoints and connections on the frame
    """
    if keypoints is None:
        return frame

    # Define keypoint connections (you can customize this based on your needs)
    connections = [
        (5, 7), (7, 9),    # Right arm
        (6, 8), (8, 10),   # Left arm
        (11, 13), (13, 15), # Right leg
        (12, 14), (14, 16), # Left leg
        (5, 6),            # Shoulders
        (11, 12),          # Hips
        (5, 11), (6, 12)   # Spine
    ]

    # Draw keypoints
    for kpt in keypoints:
        for i, (x, y, conf) in enumerate(kpt):
            if conf > confidence_threshold:
                cv2.circle(frame, (int(x), int(y)), 4, (255, 0, 0), -1)

    # Draw connections
    for kpt in keypoints:
        for connection in connections:
            if (kpt[connection[0]][2] > confidence_threshold and 
                kpt[connection[1]][2] > confidence_threshold):
                pt1 = (int(kpt[connection[0]][0]), int(kpt[connection[0]][1]))
                pt2 = (int(kpt[connection[1]][0]), int(kpt[connection[1]][1]))
                cv2.line(frame, pt1, pt2, (0, 255, 255), 2)

    return frame

def apply_segmentation_mask(frame, masks, alpha=0.5):
    """
    Apply segmentation masks to the frame
    """
    if masks is None:
        return frame

    overlay = frame.copy()
    
    for mask in masks:
        color = np.random.randint(0, 255, 3).tolist()
        mask_np = mask.cpu().numpy() if torch.is_tensor(mask) else mask
        mask_np = (mask_np > 0.5).astype(np.uint8)
        
        colored_mask = np.zeros_like(frame)
        colored_mask[mask_np > 0] = color
        
        cv2.addWeighted(colored_mask, alpha, overlay, 1 - alpha, 0, overlay)
    
    return overlay

def process_frame(frame, detection_model, segmentation_model, pose_model):
    """
    Process a single frame through all models
    """
    # Run models
    det_results = detection_model(frame, verbose=False)
    seg_results = segmentation_model(frame, verbose=False)
    pose_results = pose_model(frame, verbose=False)

    frame_data = {
        'detections': [],
        'segmentations': [],
        'poses': []
    }

    # Process detections
    if det_results:
        for result in det_results:
            if hasattr(result, 'boxes'):
                boxes = result.boxes
                for box in boxes:
                    det_data = {
                        'bbox': box.xyxy[0].cpu().numpy().tolist(),
                        'confidence': float(box.conf[0]),
                        'class_id': int(box.cls[0])
                    }
                    frame_data['detections'].append(det_data)

    # Process segmentations
    if seg_results:
        for result in seg_results:
            if hasattr(result, 'masks') and result.masks is not None:
                for mask in result.masks.data:
                    seg_data = {
                        'mask': mask.cpu().numpy().tolist()
                    }
                    frame_data['segmentations'].append(seg_data)

    # Process poses
    if pose_results:
        for result in pose_results:
            if hasattr(result, 'keypoints') and result.keypoints is not None:
                for keypoint in result.keypoints:
                    pose_data = {
                        'keypoints': keypoint.data[0].cpu().numpy().tolist()
                    }
                    frame_data['poses'].append(pose_data)

    return frame_data

In [25]:
def extract_features_from_video(video_path, detection_model_path, segmentation_model_path, pose_model_path, output_yaml, label):
    """
    Main function to extract features from video
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load models
    detection_model = YOLO(detection_model_path)
    segmentation_model = YOLO(segmentation_model_path)
    pose_model = YOLO(pose_model_path)

    # Open video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video file")
        return

    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Create video writer
    output_video_path = video_path.rsplit('.', 1)[0] + '_annotated.mp4'
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    # Initialize video data structure
    video_data = {
        'video_metadata': {
            'path': str(video_path),
            'fps': float(fps),
            'frame_count': total_frames,
            'width': frame_width,
            'height': frame_height
        },
        'frames': []
    }

    frame_count = 0
    processing_interval = 2  # Process every nth frame

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % processing_interval == 0:
                # Create copy for display
                display_frame = frame.copy()

                # Process frame
                frame_data = process_frame(frame, detection_model, segmentation_model, pose_model)
                
                # Add metadata to frame data
                frame_data.update({
                    'frame_number': frame_count,
                    'timestamp': frame_count / fps,
                    'label': label
                })

                # Draw visualizations
                if frame_data['detections']:
                    boxes = [det['bbox'] for det in frame_data['detections']]
                    scores = [det['confidence'] for det in frame_data['detections']]
                    class_ids = [det['class_id'] for det in frame_data['detections']]
                    display_frame = draw_detections(display_frame, boxes, scores, class_ids, detection_model.names)

                if frame_data['poses']:
                    keypoints = [pose['keypoints'] for pose in frame_data['poses']]
                    display_frame = draw_keypoints(display_frame, keypoints)

                if frame_data['segmentations']:
                    masks = [np.array(seg['mask']) for seg in frame_data['segmentations']]
                    display_frame = apply_segmentation_mask(display_frame, masks)

                # Add frame data to video data
                video_data['frames'].append(frame_data)

                # Display progress
                if frame_count % 30 == 0:
                    progress = (frame_count / total_frames) * 100
                    print(f"Processing: {progress:.1f}% complete")

                # Display frame
                cv2.imshow('Processing', display_frame)
                out.write(display_frame)

                # Handle key events
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                elif key == ord('p'):
                    cv2.waitKey(0)

            frame_count += 1

    except Exception as e:
        print(f"Error during processing: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Cleanup
        cap.release()
        out.release()
        cv2.destroyAllWindows()

        # Save features to YAML
        try:
            with open(output_yaml, 'w') as file:
                yaml.dump(video_data, file, default_flow_style=False)
            print(f"Features saved to {output_yaml}")
            print(f"Annotated video saved to {output_video_path}")
        except Exception as e:
            print(f"Error saving data: {e}")

if __name__ == "__main__":
    # Initialize Tkinter
    root = Tk()
    root.withdraw()

    # Get video file
    video_path = filedialog.askopenfilename(
        title="Select Video File",
        filetypes=[("Video Files", "*.mp4;*.avi")]
    )

    if not video_path:
        print("No video file selected")
        exit()

    # Define paths
    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    output_yaml = r'C:\Users\harme\Desktop\video-detect-gpu\video_features.yaml'
    label = 1

    # Run feature extraction
    extract_features_from_video(
        video_path,
        detection_model_path,
        segmentation_model_path,
        pose_model_path,
        output_yaml,
        label
    )

Using device: cuda
Processing: 0.0% complete
Processing: 2.8% complete
Processing: 5.6% complete
Processing: 8.4% complete
Processing: 11.2% complete
Processing: 13.9% complete
Processing: 16.7% complete
Processing: 19.5% complete
Processing: 22.3% complete
Processing: 25.1% complete
Processing: 27.9% complete
Processing: 30.7% complete
Processing: 33.5% complete
Processing: 36.2% complete
Processing: 39.0% complete
Processing: 41.8% complete
Processing: 44.6% complete
Processing: 47.4% complete
Processing: 50.2% complete
Processing: 53.0% complete
Processing: 55.8% complete
Processing: 58.6% complete
Processing: 61.3% complete
Processing: 64.1% complete


KeyboardInterrupt: 

In [27]:
import torch
import requests
import os

def download_model(url, save_path):
    """Download model if it doesn't exist"""
    if not os.path.exists(save_path):
        print(f"Downloading model to {save_path}...")
        response = requests.get(url)
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print("Download completed!")
    return save_path

# Model URL and save path
model_url = "https://github.com/keremberke/weapon-detection-yolov8/releases/download/v1.0.0/yolov8n-weapon-detection.pt"
model_path = "yolov8n-weapon-detection.pt"

# Download the model
model_path = download_model(model_url, model_path)

Downloading model to yolov8n-weapon-detection.pt...
Download completed!


In [28]:
import cv2
import yaml
import torch
import requests
import os
from ultralytics import YOLO
import numpy as np
from tkinter import Tk, filedialog

def download_model(url, save_path):
    """Download model if it doesn't exist"""
    if not os.path.exists(save_path):
        print(f"Downloading model to {save_path}...")
        response = requests.get(url)
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print("Download completed!")
    return save_path

def draw_detections(frame, results, conf_threshold=0.3):
    """Draw weapon detections on frame"""
    for result in results:
        boxes = result.boxes
        for box in boxes:
            conf = float(box.conf[0])
            if conf < conf_threshold:
                continue

            cls = int(box.cls[0])
            class_name = result.names[cls]
            x1, y1, x2, y2 = map(int, box.xyxy[0])

            color = (0, 0, 255) if 'weapon' in class_name.lower() or 'gun' in class_name.lower() else (0, 255, 0)
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            label = f'{class_name} {conf:.2f}'
            cv2.putText(frame, label, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    return frame

def extract_features_from_video(video_path, model_path, output_yaml, label):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load weapon detection model
    model = YOLO(model_path)
    print("Available classes:", model.names)

    # Open video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video file")
        return

    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Create video writer
    output_video_path = video_path.rsplit('.', 1)[0] + '_weapon_detection.mp4'
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    video_data = {
        'video_metadata': {
            'path': str(video_path),
            'fps': float(fps),
            'frame_count': total_frames
        },
        'frames': []
    }

    frame_count = 0
    
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Run detection
            results = model(frame, verbose=False)
            
            # Process detections
            frame_data = {
                'frame_number': frame_count,
                'timestamp': frame_count / fps,
                'detections': []
            }

            for result in results:
                for box in result.boxes:
                    detection = {
                        'class': result.names[int(box.cls[0])],
                        'confidence': float(box.conf[0]),
                        'bbox': box.xyxy[0].cpu().numpy().tolist()
                    }
                    frame_data['detections'].append(detection)

            # Draw detections
            display_frame = draw_detections(frame.copy(), results)

            # Add frame data
            video_data['frames'].append(frame_data)

            # Display progress
            if frame_count % 30 == 0:
                progress = (frame_count / total_frames) * 100
                print(f"Processing: {progress:.1f}% complete")

            # Display frame
            cv2.imshow('Weapon Detection', display_frame)
            out.write(display_frame)

            # Handle key events
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
            elif key == ord('p'):
                cv2.waitKey(0)

            frame_count += 1

    except Exception as e:
        print(f"Error during processing: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Cleanup
        cap.release()
        out.release()
        cv2.destroyAllWindows()

        # Save features
        try:
            with open(output_yaml, 'w') as file:
                yaml.dump(video_data, file, default_flow_style=False)
            print(f"Features saved to {output_yaml}")
            print(f"Annotated video saved to {output_video_path}")
        except Exception as e:
            print(f"Error saving data: {e}")

if __name__ == "__main__":
    # Initialize Tkinter
    root = Tk()
    root.withdraw()

    # Download the weapon detection model
    model_url = "https://github.com/keremberke/weapon-detection-yolov8/releases/download/v1.0.0/yolov8n-weapon-detection.pt"
    model_path = "yolov8n-weapon-detection.pt"
    model_path = download_model(model_url, model_path)

    # Get video file
    video_path = filedialog.askopenfilename(
        title="Select Video File",
        filetypes=[("Video Files", "*.mp4;*.avi")]
    )

    if not video_path:
        print("No video file selected")
        exit()

    output_yaml = r'C:\Users\harme\Desktop\video-detect-gpu\weapon_detection_features.yaml'
    label = 1

    # Run detection
    extract_features_from_video(
        video_path,
        model_path,
        output_yaml,
        label
    )

Using device: cuda


UnpicklingError: could not find MARK

In [34]:
import cv2
import yaml
import torch
from ultralytics import YOLO
import numpy as np
from tkinter import Tk, filedialog
from scipy.spatial.distance import cdist

class ViolenceFeatureExtractor:
    def __init__(self, detection_model_path, segmentation_model_path, pose_model_path):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.detection_model = YOLO(detection_model_path)
        self.segmentation_model = YOLO(segmentation_model_path)
        self.pose_model = YOLO(pose_model_path)
        self.violence_objects = ['knife', 'gun', 'baseball bat', 'stick']

    def calculate_motion_features(self, prev_poses, current_poses):
        if not isinstance(prev_poses, list) or not isinstance(current_poses, list) or not prev_poses or not current_poses:
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }

        try:
            speeds = []
            for prev_pose, curr_pose in zip(prev_poses, current_poses):
                if prev_pose is not None and curr_pose is not None:
                    displacement = np.linalg.norm(curr_pose - prev_pose, axis=1)
                    speed = np.mean(displacement)
                    speeds.append(speed)

            if not speeds:
                return {
                    'average_speed': 0,
                    'motion_intensity': 0,
                    'sudden_movements': 0
                }

            average_speed = np.mean(speeds)
            motion_intensity = np.std(speeds)
            sudden_movements = np.sum(np.array(speeds) > np.mean(speeds) + 2 * np.std(speeds))

            return {
                'average_speed': float(average_speed),
                'motion_intensity': float(motion_intensity),
                'sudden_movements': int(sudden_movements)
            }
        except Exception as e:
            print(f"Error in calculate_motion_features: {e}")
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }

    def calculate_interaction_features(self, poses, boxes):
        try:
            if len(poses) == 0 or len(boxes) == 0:
                return {
                    'proximity': 0,
                    'interaction_intensity': 0,
                    'group_density': 0
                }

            # Convert to numpy arrays if they aren't already
            poses_array = np.array(poses)
            boxes_array = np.array(boxes)

            # Calculate distances between all pairs of people
            distances = cdist(boxes_array[:, :2], boxes_array[:, :2])
            
            # Calculate proximity (inverse of average distance)
            proximity = 1 / (np.mean(distances) + 1e-6)
            
            # Calculate interaction intensity
            poses_flat = poses_array.reshape(poses_array.shape[0], -1)
            pose_distances = cdist(poses_flat, poses_flat)
            interaction_intensity = np.mean(1 / (pose_distances + 1e-6))
            
            # Calculate group density
            if len(boxes_array) > 1:
                hull = cv2.convexHull(boxes_array[:, :2].astype(np.float32))
                area = cv2.contourArea(hull)
                group_density = len(boxes_array) / (area + 1e-6)
            else:
                group_density = 0

            return {
                'proximity': float(proximity),
                'interaction_intensity': float(interaction_intensity),
                'group_density': float(group_density)
            }
        except Exception as e:
            print(f"Error in calculate_interaction_features: {e}")
            return {
                'proximity': 0,
                'interaction_intensity': 0,
                'group_density': 0
            }

    def extract_violence_features(self, frame, prev_frame_data=None):
        try:
            # Run models
            det_results = self.detection_model(frame)
            seg_results = self.segmentation_model(frame)
            pose_results = self.pose_model(frame)

            # Initialize feature dictionary
            features = {
                'object_features': {},
                'pose_features': {},
                'motion_features': {},
                'interaction_features': {},
                'context_features': {}
            }

            # Extract object detection features
            objects_detected = []
            violence_objects_count = 0
            person_boxes = []
            
            for result in det_results:
                for box in result.boxes:
                    cls = result.names[int(box.cls[0])]
                    conf = float(box.conf[0])
                    
                    if cls in self.violence_objects:
                        violence_objects_count += 1
                    
                    if cls == 'person':
                        person_boxes.append(box.xyxy[0].cpu().numpy())
                    
                    objects_detected.append({
                        'class': cls,
                        'confidence': conf,
                        'box': box.xyxy[0].cpu().numpy().tolist()
                    })

            features['object_features'] = {
                'total_objects': len(objects_detected),
                'violence_objects': violence_objects_count,
                'person_count': len(person_boxes)
            }

            # Extract pose features
            poses_list = []
            pose_confidences = []
            
            for result in pose_results:
                if result.keypoints is not None:
                    for keypoints in result.keypoints:
                        poses_list.append(keypoints.data[0].cpu().numpy())
                        pose_confidences.extend(keypoints.data[0, :, 2].cpu().numpy())

            if poses_list:
                poses_array = np.array(poses_list)
                features['pose_features'] = {
                    'pose_count': len(poses_list),
                    'average_confidence': float(np.mean(pose_confidences)),
                    'pose_variance': float(np.var(poses_array.reshape(poses_array.shape[0], -1), axis=1).mean())
                }
            else:
                features['pose_features'] = {
                    'pose_count': 0,
                    'average_confidence': 0,
                    'pose_variance': 0
                }

            # Calculate motion features
            if prev_frame_data and 'poses' in prev_frame_data and prev_frame_data['poses'] is not None:
                features['motion_features'] = self.calculate_motion_features(
                    prev_frame_data['poses'], poses_list)
            else:
                features['motion_features'] = {
                    'average_speed': 0,
                    'motion_intensity': 0,
                    'sudden_movements': 0
                }

            # Calculate interaction features
            if len(poses_list) > 0 and len(person_boxes) > 0:
                features['interaction_features'] = self.calculate_interaction_features(
                    poses_list, person_boxes)
            else:
                features['interaction_features'] = {
                    'proximity': 0,
                    'interaction_intensity': 0,
                    'group_density': 0
                }

            return features, poses_list

        except Exception as e:
            print(f"Error in extract_violence_features: {e}")
            return {
                'object_features': {'total_objects': 0, 'violence_objects': 0, 'person_count': 0},
                'pose_features': {'pose_count': 0, 'average_confidence': 0, 'pose_variance': 0},
                'motion_features': {'average_speed': 0, 'motion_intensity': 0, 'sudden_movements': 0},
                'interaction_features': {'proximity': 0, 'interaction_intensity': 0, 'group_density': 0},
                'context_features': {}
            }, []

def process_video_for_violence_detection(video_path, extractor, output_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video file")
        return

    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    video_data = {
        'metadata': {
            'path': video_path,
            'fps': fps,
            'frame_count': frame_count
        },
        'frames': []
    }

    prev_frame_data = None
    frame_idx = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Extract features
            features, poses = extractor.extract_violence_features(frame, prev_frame_data)
            
            # Store frame data
            frame_data = {
                'frame_index': frame_idx,
                'timestamp': frame_idx / fps,
                'features': features,
                'poses': poses
            }
            
            video_data['frames'].append(frame_data)
            prev_frame_data = frame_data
            
            # Show progress
            if frame_idx % 30 == 0:
                print(f"Processed {frame_idx}/{frame_count} frames")
            
            frame_idx += 1

    except Exception as e:
        print(f"Error processing video: {e}")

    finally:
        cap.release()

        # Save features to file
        try:
            with open(output_path, 'w') as f:
                yaml.dump(video_data, f)
        except Exception as e:
            print(f"Error saving features: {e}")

    return video_data

if __name__ == "__main__":
    # Initialize Tkinter
    root = Tk()
    root.withdraw()

    # Get video file
    video_path = filedialog.askopenfilename(
        title="Select Video File",
        filetypes=[("Video Files", "*.mp4;*.avi")]
    )

    if not video_path:
        print("No video file selected")
        exit()

    # Define model paths
    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    
    # Initialize feature extractor
    extractor = ViolenceFeatureExtractor(
        detection_model_path,
        segmentation_model_path,
        pose_model_path
    )

    # Process video
    output_path = r'C:\Users\harme\Desktop\video-detect-gpu\violence_features.yaml'
    video_data = process_video_for_violence_detection(video_path, extractor, output_path)

    print("Feature extraction complete!")
    print(f"Features saved to: {output_path}")


0: 384x640 2 persons, 2 chairs, 1 dining table, 23.7ms
Speed: 3.5ms preprocess, 23.7ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 chairs, 1 dining table, 29.8ms
Speed: 2.0ms preprocess, 29.8ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 19.7ms
Speed: 2.5ms preprocess, 19.7ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)
Processed 0/66 frames

0: 384x640 2 persons, 2 chairs, 1 dining table, 20.7ms
Speed: 6.0ms preprocess, 20.7ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 chairs, 1 dining table, 22.0ms
Speed: 2.0ms preprocess, 22.0ms inference, 3.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 18.1ms
Speed: 2.8ms preprocess, 18.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 chairs, 1 dining table, 16.0ms
Speed: 3.0ms preprocess, 16.0ms inference, 3.0ms postpro

In [44]:
import cv2
import yaml
import torch
from ultralytics import YOLO
import numpy as np
from tkinter import Tk, filedialog
from scipy.spatial.distance import cdist

class ViolenceFeatureExtractor:
    def __init__(self, detection_model_path, segmentation_model_path, pose_model_path):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        
        # Load models
        self.detection_model = YOLO(detection_model_path)
        self.segmentation_model = YOLO(segmentation_model_path)
        self.pose_model = YOLO(pose_model_path)
        
        # Define violence-related objects
        self.violence_objects = ['knife', 'gun', 'baseball bat', 'stick']
        
        # Define colors for visualization
        self.colors = {
            'violence': (0, 0, 255),    # Red
            'person': (0, 255, 0),      # Green
            'other': (255, 0, 0),       # Blue
            'keypoint': (255, 255, 0),  # Yellow
            'connection': (0, 255, 255)  # Cyan
        }

    def calculate_motion_features(self, prev_poses, current_poses):
        """Calculate motion features between consecutive frames"""
        try:
            if not prev_poses or not current_poses:
                return {
                    'average_speed': 0,
                    'motion_intensity': 0,
                    'sudden_movements': 0
                }

            # Convert poses to numpy arrays if they aren't already
            prev_poses = np.array(prev_poses)
            current_poses = np.array(current_poses)

            # Calculate displacement between frames
            if prev_poses.shape == current_poses.shape:
                displacement = np.linalg.norm(current_poses - prev_poses, axis=2)
                average_speed = np.mean(displacement)
                motion_intensity = np.std(displacement)
                sudden_movements = np.sum(displacement > np.mean(displacement) + 2 * np.std(displacement))

                return {
                    'average_speed': float(average_speed),
                    'motion_intensity': float(motion_intensity),
                    'sudden_movements': int(sudden_movements)
                }
            else:
                return {
                    'average_speed': 0,
                    'motion_intensity': 0,
                    'sudden_movements': 0
                }
        except Exception as e:
            print(f"Error in calculate_motion_features: {e}")
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }

    def draw_detections(self, frame, det_results, pose_results, seg_results):
        """Draw all detections on the frame"""
        display_frame = frame.copy()

        # Draw segmentation masks first (if any)
        if seg_results:
            for result in seg_results:
                if result.masks is not None:
                    for mask in result.masks.data:
                        try:
                            # Get mask as numpy array
                            mask_np = mask.cpu().numpy()
                            
                            # Resize mask to match frame size
                            mask_np = cv2.resize(mask_np, (frame.shape[1], frame.shape[0]))
                            
                            # Create binary mask
                            mask_binary = (mask_np > 0.5).astype(np.uint8) * 255
                            
                            # Create colored mask
                            colored_mask = np.zeros_like(frame)
                            colored_mask[mask_binary > 0] = [0, 0, 255]  # Red color for mask
                            
                            # Apply mask
                            display_frame = cv2.addWeighted(display_frame, 1, colored_mask, 0.3, 0)
                        except Exception as e:
                            print(f"Error processing mask: {e}")
                            continue

        # Draw object detections
        for result in det_results:
            boxes = result.boxes
            for box in boxes:
                try:
                    # Get box coordinates
                    x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                    cls = result.names[int(box.cls[0])]
                    conf = float(box.conf[0])

                    # Choose color based on class
                    if cls in self.violence_objects:
                        color = self.colors['violence']
                    elif cls == 'person':
                        color = self.colors['person']
                    else:
                        color = self.colors['other']

                    # Draw box and label
                    cv2.rectangle(display_frame, (x1, y1), (x2, y2), color, 2)
                    label = f'{cls} {conf:.2f}'
                    
                    # Add background to text for better visibility
                    (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                    cv2.rectangle(display_frame, (x1, y1-text_h-5), (x1+text_w, y1), color, -1)
                    cv2.putText(display_frame, label, (x1, y1-5), 
                              cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
                except Exception as e:
                    print(f"Error drawing detection box: {e}")
                    continue

        # Draw pose keypoints and connections
        if pose_results:
            for result in pose_results:
                if result.keypoints is not None:
                    for kpts in result.keypoints:
                        try:
                            # Get keypoints data
                            keypoints_data = kpts.data[0].cpu().numpy()
                            
                            # Draw keypoints
                            for keypoint in keypoints_data:
                                x, y, conf = keypoint
                                if conf > 0.5:
                                    cv2.circle(display_frame, 
                                             (int(float(x)), int(float(y))), 
                                             4, self.colors['keypoint'], -1)

                            # Draw connections
                            connections = [(5,7), (7,9), (6,8), (8,10), (5,6), 
                                         (11,13), (13,15), (12,14), (14,16), (11,12)]
                            for connection in connections:
                                pt1 = keypoints_data[connection[0]]
                                pt2 = keypoints_data[connection[1]]
                                
                                if pt1[2] > 0.5 and pt2[2] > 0.5:
                                    cv2.line(display_frame, 
                                           (int(float(pt1[0])), int(float(pt1[1]))),
                                           (int(float(pt2[0])), int(float(pt2[1]))),
                                           self.colors['connection'], 2)
                        except Exception as e:
                            print(f"Error drawing pose: {e}")
                            continue

        # Add frame information
        cv2.putText(display_frame, "Press 'q' to quit, 'p' to pause/resume", 
                   (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

        return display_frame
    def extract_features(self, frame, prev_frame_data=None):
        """Extract all features from a frame"""
        try:
            # Run models
            det_results = self.detection_model(frame, verbose=False)
            seg_results = self.segmentation_model(frame, verbose=False)
            pose_results = self.pose_model(frame, verbose=False)

            # Draw detections
            annotated_frame = self.draw_detections(frame, det_results, pose_results, seg_results)

            # Initialize features dictionary
            features = {
                'objects': [],
                'poses': [],
                'segmentation': [],
                'motion': {},
                'violence_indicators': {
                    'weapon_present': False,
                    'rapid_motion': False,
                    'close_interaction': False,
                    'aggressive_pose': False
                }
            }

            # Process detections
            for result in det_results:
                for box in result.boxes:
                    try:
                        cls = result.names[int(box.cls[0])]
                        conf = float(box.conf[0])
                        box_coords = box.xyxy[0].cpu().numpy().tolist()
                        
                        features['objects'].append({
                            'class': cls,
                            'confidence': conf,
                            'box': box_coords
                        })
                        
                        if cls in self.violence_objects:
                            features['violence_indicators']['weapon_present'] = True
                    except Exception as e:
                        print(f"Error processing detection: {e}")
                        continue

            # Process poses
            if pose_results:
                for result in pose_results:
                    if result.keypoints is not None:
                        for kpts in result.keypoints:
                            try:
                                pose_data = kpts.data[0].cpu().numpy().tolist()
                                features['poses'].append(pose_data)
                            except Exception as e:
                                print(f"Error processing pose: {e}")
                                continue

            # Process segmentation
            if seg_results:
                for result in seg_results:
                    if result.masks is not None:
                        for mask in result.masks.data:
                            try:
                                mask_np = mask.cpu().numpy()
                                features['segmentation'].append(np.sum(mask_np > 0.5))
                            except Exception as e:
                                print(f"Error processing segmentation: {e}")
                                continue

            # Calculate motion features if previous frame exists
            if prev_frame_data and 'poses' in prev_frame_data:
                motion_features = self.calculate_motion_features(
                    prev_frame_data['poses'], features['poses'])
                features['motion'] = motion_features
                
                # Check for rapid motion
                if motion_features.get('average_speed', 0) > 10:  # Threshold can be adjusted
                    features['violence_indicators']['rapid_motion'] = True

            return features, annotated_frame

        except Exception as e:
            print(f"Error in feature extraction: {e}")
            return None, frame

def process_video(video_path, extractor, output_path):
    """Process video and extract features"""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video file")
        return

    # Get video properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Create video writer
    output_video_path = video_path.rsplit('.', 1)[0] + '_analyzed.mp4'
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    # Initialize data storage
    video_data = {
        'metadata': {
            'path': video_path,
            'fps': fps,
            'frame_count': frame_count,
            'width': frame_width,
            'height': frame_height
        },
        'frames': []
    }

    frame_idx = 0
    prev_frame_data = None
    paused = False

    try:
        while True:
            if not paused:
                ret, frame = cap.read()
                if not ret:
                    break

                # Extract features and get annotated frame
                features, annotated_frame = extractor.extract_features(frame, prev_frame_data)
                
                if features is not None:
                    # Store frame data
                    frame_data = {
                        'frame_index': frame_idx,
                        'timestamp': frame_idx / fps,
                        'features': features
                    }
                    
                    video_data['frames'].append(frame_data)
                    prev_frame_data = features

                    # Write frame to output video
                    out.write(annotated_frame)

                    # Show progress
                    if frame_idx % 30 == 0:
                        progress = (frame_idx / frame_count) * 100
                        print(f"Processing: {progress:.1f}% complete")

                    frame_idx += 1

                # Display frame
                cv2.imshow('Violence Detection Analysis', annotated_frame)

            # Handle key events
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
            elif key == ord('p'):
                paused = not paused
                if paused:
                    print("Paused - Press 'p' to resume")

    except Exception as e:
        print(f"Error during processing: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Cleanup
        cap.release()
        out.release()
        cv2.destroyAllWindows()

        # Save features to YAML
        try:
            with open(output_path, 'w') as f:
                yaml.dump(video_data, f, default_flow_style=False)
            print(f"Features saved to: {output_path}")
            print(f"Analyzed video saved to: {output_video_path}")
        except Exception as e:
            print(f"Error saving data: {e}")

    return video_data

if __name__ == "__main__":
    # Initialize Tkinter
    root = Tk()
    root.withdraw()

    # Get video file
    video_path = filedialog.askopenfilename(
        title="Select Video File",
        filetypes=[("Video Files", "*.mp4;*.avi")]
    )

    if not video_path:
        print("No video file selected")
        exit()

    # Define model paths
    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    
    # Initialize feature extractor
    extractor = ViolenceFeatureExtractor(
        detection_model_path,
        segmentation_model_path,
        pose_model_path
    )

    # Process video
    output_path = r'C:\Users\harme\Desktop\video-detect-gpu\violence_features.yaml'
    video_data = process_video(video_path, extractor, output_path)

    print("Analysis complete!")

Using device: cuda
Processing: 0.0% complete
Processing: 26.3% complete
Processing: 52.6% complete
Paused - Press 'p' to resume
Processing: 78.9% complete


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000002515BB15DC0>>
Traceback (most recent call last):
  File "c:\Users\harme\Desktop\video-detect-gpu\myenv\lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Features saved to: C:\Users\harme\Desktop\video-detect-gpu\violence_features.yaml
Analyzed video saved to: C:/Users/harme/Desktop/video-detect-gpu/V_5_analyzed.mp4
Analysis complete!


In [None]:
import cv2
import yaml
import torch
import torch.cuda
import numpy as np
from ultralytics import YOLO
from tkinter import Tk, filedialog
from scipy.spatial.distance import cdist

class ViolenceFeatureExtractor:
    def __init__(self, detection_model_path, segmentation_model_path, pose_model_path):
        # Initialize device and GPU settings
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.print_gpu_info()
        
        # GPU optimization settings
        if torch.cuda.is_available():
            torch.backends.cudnn.benchmark = True
            torch.backends.cudnn.deterministic = False
            torch.cuda.empty_cache()
            torch.cuda.set_per_process_memory_fraction(0.8)
        
        # Load models and move to GPU
        self.detection_model = YOLO(detection_model_path).to(self.device)
        self.segmentation_model = YOLO(segmentation_model_path).to(self.device)
        self.pose_model = YOLO(pose_model_path).to(self.device)
        
        # Define violence-related objects
        self.violence_objects = ['knife', 'gun', 'baseball bat', 'stick']
        
        # Define colors for visualization
        self.colors = {
            'violence': (0, 0, 255),    # Red
            'person': (0, 255, 0),      # Green
            'other': (255, 0, 0),       # Blue
            'keypoint': (255, 255, 0),  # Yellow
            'connection': (0, 255, 255)  # Cyan
        }
        
        # Performance settings
        self.frame_skip = 2
        self.batch_size = 4
        self.scale_factor = 0.75

    def print_gpu_info(self):
        """Print GPU information"""
        print("\nGPU Information:")
        if torch.cuda.is_available():
            print(f"GPU Device: {torch.cuda.get_device_name(0)}")
            print(f"CUDA Version: {torch.version.cuda}")
            print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
            print(f"Available Memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        else:
            print("No GPU available. Using CPU.")

    def preprocess_frame(self, frame):
        """Preprocess frame for model input"""
        # Resize frame for faster processing
        if self.scale_factor != 1.0:
            width = int(frame.shape[1] * self.scale_factor)
            height = int(frame.shape[0] * self.scale_factor)
            frame = cv2.resize(frame, (width, height))
        
        # Convert to RGB
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Normalize
        frame_normalized = frame_rgb.astype(np.float32) / 255.0
        
        return frame_normalized

    def calculate_motion_features(self, prev_poses, current_poses):
        """Calculate motion features between consecutive frames"""
        try:
            if not prev_poses or not current_poses:
                return {
                    'average_speed': 0,
                    'motion_intensity': 0,
                    'sudden_movements': 0
                }

            # Convert poses to numpy arrays
            prev_poses = np.array(prev_poses)
            current_poses = np.array(current_poses)

            if prev_poses.shape == current_poses.shape:
                # Calculate displacement
                displacement = np.linalg.norm(current_poses - prev_poses, axis=2)
                average_speed = np.mean(displacement)
                motion_intensity = np.std(displacement)
                sudden_movements = np.sum(displacement > np.mean(displacement) + 2 * np.std(displacement))

                return {
                    'average_speed': float(average_speed),
                    'motion_intensity': float(motion_intensity),
                    'sudden_movements': int(sudden_movements)
                }
            
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }
            
        except Exception as e:
            print(f"Error in motion calculation: {e}")
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }

    def draw_detections(self, frame, det_results, pose_results, seg_results):
        """Draw all detections on the frame"""
        display_frame = frame.copy()

        # Draw segmentation masks
        if seg_results:
            for result in seg_results:
                if result.masks is not None:
                    for mask in result.masks.data:
                        try:
                            mask_np = mask.cpu().numpy()
                            mask_np = cv2.resize(mask_np, (frame.shape[1], frame.shape[0]))
                            mask_binary = (mask_np > 0.5).astype(np.uint8) * 255
                            colored_mask = np.zeros_like(frame)
                            colored_mask[mask_binary > 0] = [0, 0, 255]
                            display_frame = cv2.addWeighted(display_frame, 1, colored_mask, 0.3, 0)
                        except Exception as e:
                            print(f"Error in mask drawing: {e}")
                            continue

        # Draw object detections
        for result in det_results:
            boxes = result.boxes
            for box in boxes:
                try:
                    x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                    cls = result.names[int(box.cls[0])]
                    conf = float(box.conf[0])

                    color = (self.colors['violence'] if cls in self.violence_objects 
                            else self.colors['person'] if cls == 'person' 
                            else self.colors['other'])

                    cv2.rectangle(display_frame, (x1, y1), (x2, y2), color, 2)
                    label = f'{cls} {conf:.2f}'
                    
                    (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                    cv2.rectangle(display_frame, (x1, y1-text_h-5), (x1+text_w, y1), color, -1)
                    cv2.putText(display_frame, label, (x1, y1-5), 
                              cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
                except Exception as e:
                    print(f"Error in detection drawing: {e}")
                    continue
                
        


GPU Information:
GPU Device: NVIDIA GeForce RTX 3050 Laptop GPU
CUDA Version: 11.8
Total GPU Memory: 4.29 GB
Available Memory: 0.00 GB


  with torch.cuda.amp.autocast():


Features saved to: C:\Users\harme\Desktop\video-detect-gpu\violence_features.yaml
Analyzed video saved to: C:/Users/harme/Desktop/video-detect-gpu/NV_1_analyzed.mp4
Analysis complete!


In [None]:
import cv2
import yaml
import torch
import torch.cuda
import numpy as np
from ultralytics import YOLO
from tkinter import Tk, filedialog
from scipy.spatial.distance import cdist

class ViolenceFeatureExtractor:
    def __init__(self, detection_model_path, segmentation_model_path, pose_model_path):
        # Initialize device and GPU settings
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.print_gpu_info()
        
        # GPU optimization settings
        if torch.cuda.is_available():
            torch.backends.cudnn.benchmark = True
            torch.backends.cudnn.deterministic = False
            torch.cuda.empty_cache()
            torch.cuda.set_per_process_memory_fraction(0.8)
        
        # Load models and move to GPU
        self.detection_model = YOLO(detection_model_path).to(self.device)
        self.segmentation_model = YOLO(segmentation_model_path).to(self.device)
        self.pose_model = YOLO(pose_model_path).to(self.device)
        
        # Define violence-related objects
        self.violence_objects = ['knife', 'gun', 'baseball bat', 'stick']
        
        # Define colors for visualization
        self.colors = {
            'violence': (0, 0, 255),    # Red
            'person': (0, 255, 0),      # Green
            'other': (255, 0, 0),       # Blue
            'keypoint': (255, 255, 0),  # Yellow
            'connection': (0, 255, 255)  # Cyan
        }
        
        # Performance settings
        self.frame_skip = 2
        self.batch_size = 4
        self.input_size = 640  # YOLO input size

    def print_gpu_info(self):
        """Print GPU information"""
        print("\nGPU Information:")
        if torch.cuda.is_available():
            print(f"GPU Device: {torch.cuda.get_device_name(0)}")
            print(f"CUDA Version: {torch.version.cuda}")
            print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
            print(f"Available Memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        else:
            print("No GPU available. Using CPU.")

    def preprocess_frame(self, frame):
        """Preprocess frame for model input"""
        try:
            # Convert to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            # Calculate size to maintain aspect ratio
            h, w = frame_rgb.shape[:2]
            r = self.input_size / max(h, w)  # Ratio
            new_h, new_w = int(h * r), int(w * r)
            
            # Resize
            resized = cv2.resize(frame_rgb, (new_w, new_h))
            
            # Create canvas of input_size x input_size
            canvas = np.zeros((self.input_size, self.input_size, 3), dtype=np.uint8)
            
            # Calculate padding
            pad_h = (self.input_size - new_h) // 2
            pad_w = (self.input_size - new_w) // 2
            
            # Place resized image on canvas
            canvas[pad_h:pad_h+new_h, pad_w:pad_w+new_w] = resized
            
            # Normalize
            normalized = canvas.astype(np.float32) / 255.0
            
            return normalized, (r, pad_w, pad_h)  # Return scale and padding info
            
        except Exception as e:
            print(f"Error in preprocessing: {e}")
            return None, None

    def calculate_motion_features(self, prev_poses, current_poses):
        """Calculate motion features between consecutive frames"""
        try:
            if not prev_poses or not current_poses:
                return {
                    'average_speed': 0,
                    'motion_intensity': 0,
                    'sudden_movements': 0
                }

            # Convert poses to numpy arrays
            prev_poses = np.array(prev_poses)
            current_poses = np.array(current_poses)

            if prev_poses.shape == current_poses.shape:
                # Calculate displacement
                displacement = np.linalg.norm(current_poses - prev_poses, axis=2)
                average_speed = np.mean(displacement)
                motion_intensity = np.std(displacement)
                sudden_movements = np.sum(displacement > np.mean(displacement) + 2 * np.std(displacement))

                return {
                    'average_speed': float(average_speed),
                    'motion_intensity': float(motion_intensity),
                    'sudden_movements': int(sudden_movements)
                }
            
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }
            
        except Exception as e:
            print(f"Error in motion calculation: {e}")
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }

    def rescale_coords(self, x, y, scale_info):
        """Rescale coordinates back to original image size"""
        scale, pad_w, pad_h = scale_info
        x_orig = (x - pad_w) / scale
        y_orig = (y - pad_h) / scale
        return int(x_orig), int(y_orig)

    def draw_detections(self, frame, det_results, pose_results, seg_results, scale_info):
        """Draw all detections on the frame"""
        display_frame = frame.copy()

        # Draw segmentation masks
        if seg_results:
            for result in seg_results:
                if result.masks is not None:
                    for mask in result.masks.data:
                        try:
                            mask_np = mask.cpu().numpy()
                            mask_np = cv2.resize(mask_np, (frame.shape[1], frame.shape[0]))
                            mask_binary = (mask_np > 0.5).astype(np.uint8) * 255
                            colored_mask = np.zeros_like(frame)
                            colored_mask[mask_binary > 0] = [0, 0, 255]
                            display_frame = cv2.addWeighted(display_frame, 1, colored_mask, 0.3, 0)
                        except Exception as e:
                            print(f"Error in mask drawing: {e}")
                            continue
                        
                # Draw object detections
        for result in det_results:
            boxes = result.boxes
            for box in boxes:
                try:
                    # Get box coordinates and rescale them
                    x1, y1, x2, y2 = map(float, box.xyxy[0].cpu().numpy())
                    x1, y1 = self.rescale_coords(x1, y1, scale_info)
                    x2, y2 = self.rescale_coords(x2, y2, scale_info)
                    
                    cls = result.names[int(box.cls[0])]
                    conf = float(box.conf[0])

                    color = (self.colors['violence'] if cls in self.violence_objects 
                            else self.colors['person'] if cls == 'person' 
                            else self.colors['other'])

                    cv2.rectangle(display_frame, (x1, y1), (x2, y2), color, 2)
                    label = f'{cls} {conf:.2f}'
                    
                    (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                    cv2.rectangle(display_frame, (x1, y1-text_h-5), (x1+text_w, y1), color, -1)
                    cv2.putText(display_frame, label, (x1, y1-5), 
                              cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
                except Exception as e:
                    print(f"Error in detection drawing: {e}")
                    continue

        # Draw pose keypoints and connections
        if pose_results:
            for result in pose_results:
                if result.keypoints is not None:
                    for kpts in result.keypoints:
                        try:
                            keypoints_data = kpts.data[0].cpu().numpy()
                            
                            # Draw keypoints
                            for keypoint in keypoints_data:
                                x, y, conf = keypoint
                                if conf > 0.5:
                                    x, y = self.rescale_coords(x, y, scale_info)
                                    cv2.circle(display_frame, (x, y), 4, self.colors['keypoint'], -1)

                            # Draw connections
                            connections = [(5,7), (7,9), (6,8), (8,10), (5,6), 
                                         (11,13), (13,15), (12,14), (14,16), (11,12)]
                            for connection in connections:
                                pt1 = keypoints_data[connection[0]]
                                pt2 = keypoints_data[connection[1]]
                                
                                if pt1[2] > 0.5 and pt2[2] > 0.5:
                                    x1, y1 = self.rescale_coords(pt1[0], pt1[1], scale_info)
                                    x2, y2 = self.rescale_coords(pt2[0], pt2[1], scale_info)
                                    cv2.line(display_frame, (x1, y1), (x2, y2),
                                           self.colors['connection'], 2)
                        except Exception as e:
                            print(f"Error in pose drawing: {e}")
                            continue

        # Add frame information
        cv2.putText(display_frame, "Press 'q' to quit, 'p' to pause/resume", 
                   (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

        return display_frame

    def extract_features(self, frame, prev_frame_data=None):
        """Extract features from a single frame"""
        try:
            # Preprocess frame
            processed_frame, scale_info = self.preprocess_frame(frame)
            if processed_frame is None:
                return None, frame

            # Convert to tensor and add batch dimension
            frame_tensor = torch.from_numpy(processed_frame).permute(2, 0, 1).unsqueeze(0).to(self.device)

            # Run models with GPU acceleration
            with torch.cuda.amp.autocast():
                det_results = self.detection_model(frame_tensor, verbose=False)
                seg_results = self.segmentation_model(frame_tensor, verbose=False)
                pose_results = self.pose_model(frame_tensor, verbose=False)

            # Draw detections
            annotated_frame = self.draw_detections(frame, det_results, pose_results, seg_results, scale_info)

            # Initialize features
            features = {
                'objects': [],
                'poses': [],
                'segmentation': [],
                'motion': {},
                'violence_indicators': {
                    'weapon_present': False,
                    'rapid_motion': False,
                    'close_interaction': False,
                    'aggressive_pose': False
                }
            }

            # Process detections
            for result in det_results:
                for box in result.boxes:
                    try:
                        cls = result.names[int(box.cls[0])]
                        conf = float(box.conf[0])
                        box_coords = box.xyxy[0].cpu().numpy().tolist()
                        
                        features['objects'].append({
                            'class': cls,
                            'confidence': conf,
                            'box': box_coords
                        })
                        
                        if cls in self.violence_objects:
                            features['violence_indicators']['weapon_present'] = True
                    except Exception as e:
                        print(f"Error processing detection: {e}")
                        continue

            # Process poses
            if pose_results:
                for result in pose_results:
                    if result.keypoints is not None:
                        for kpts in result.keypoints:
                            try:
                                pose_data = kpts.data[0].cpu().numpy().tolist()
                                features['poses'].append(pose_data)
                            except Exception as e:
                                print(f"Error processing pose: {e}")
                                continue

            # Process segmentation
            if seg_results:
                for result in seg_results:
                    if result.masks is not None:
                        for mask in result.masks.data:
                            try:
                                mask_np = mask.cpu().numpy()
                                features['segmentation'].append(np.sum(mask_np > 0.5))
                            except Exception as e:
                                print(f"Error processing segmentation: {e}")
                                continue

            # Calculate motion features
            if prev_frame_data and 'poses' in prev_frame_data:
                motion_features = self.calculate_motion_features(
                    prev_frame_data['poses'], features['poses'])
                features['motion'] = motion_features
                
                if motion_features.get('average_speed', 0) > 10:
                    features['violence_indicators']['rapid_motion'] = True

            return features, annotated_frame

        except Exception as e:
            print(f"Error in feature extraction: {e}")
            return None, frame

def process_video(video_path, extractor, output_path):
    """Process video with GPU acceleration"""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video file")
        return

    # Get video properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Create video writer
    output_video_path = video_path.rsplit('.', 1)[0] + '_analyzed.mp4'
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    # Initialize data storage
    video_data = {
        'metadata': {
            'path': video_path,
            'fps': fps,
            'frame_count': frame_count,
            'width': frame_width,
            'height': frame_height
        },
        'frames': []
    }

    frame_idx = 0
    prev_frame_data = None
    paused = False

    try:
        while True:
            if not paused:
                ret, frame = cap.read()
                if not ret:
                    break

                # Skip frames if needed
                if frame_idx % extractor.frame_skip != 0:
                    frame_idx += 1
                    continue

                # Extract features and get annotated frame
                features, annotated_frame = extractor.extract_features(frame, prev_frame_data)
                
                if features is not None:
                    frame_data = {
                        'frame_index': frame_idx,
                        'timestamp': frame_idx / fps,
                        'features': features
                    }
                    
                    video_data['frames'].append(frame_data)
                    prev_frame_data = features
                    out.write(annotated_frame)

                    # Show progress
                    if frame_idx % (30 * extractor.frame_skip) == 0:
                        progress = (frame_idx / frame_count) * 100
                        print(f"Processing: {progress:.1f}% complete")
                        if torch.cuda.is_available():
                            print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

                    # Display frame
                    cv2.imshow('Violence Detection Analysis', annotated_frame)

                frame_idx += 1

                # Handle key events
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                elif key == ord('p'):
                    paused = not paused
                    print("Paused - Press 'p' to resume" if paused else "Resumed")

    except Exception as e:
        print(f"Error during processing: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Cleanup
        cap.release()
        out.release()
        cv2.destroyAllWindows()
        
        # Final GPU cleanup
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Save features
        try:
            with open(output_path, 'w') as f:
                yaml.dump(video_data, f, default_flow_style=False)
            print(f"Features saved to: {output_path}")
            print(f"Analyzed video saved to: {output_video_path}")
        except Exception as e:
            print(f"Error saving data: {e}")

    return video_data

if __name__ == "__main__":
    # Initialize Tkinter
    root = Tk()
    root.withdraw()

    # Get video file
    video_path = filedialog.askopenfilename(
        title="Select Video File",
        filetypes=[("Video Files", "*.mp4;*.avi")]
    )

    if not video_path:
        print("No video file selected")
        exit()

    # Define model paths
    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    
    # Initialize feature extractor
    extractor = ViolenceFeatureExtractor(
        detection_model_path,
        segmentation_model_path,
        pose_model_path
    )

    # Process video
    output_path = r'C:\Users\harme\Desktop\video-detect-gpu\violence_features.yaml'
    video_data = process_video(video_path, extractor, output_path)

    print("Analysis complete!")


GPU Information:
GPU Device: NVIDIA GeForce RTX 3050 Laptop GPU
CUDA Version: 11.8
Total GPU Memory: 4.29 GB
Available Memory: 0.29 GB


  with torch.cuda.amp.autocast():


Processing: 0.0% complete
GPU Memory: 0.30 GB
Processing: 90.9% complete
GPU Memory: 0.30 GB
Features saved to: C:\Users\harme\Desktop\video-detect-gpu\violence_features.yaml
Analyzed video saved to: C:/Users/harme/Desktop/video-detect-gpu/NV_1_analyzed.mp4
Analysis complete!


In [3]:
import cv2
import yaml
import torch
import torch.cuda
import numpy as np
from ultralytics import YOLO
from tkinter import Tk, filedialog
from scipy.spatial.distance import cdist

class ViolenceFeatureExtractor:
    def __init__(self, detection_model_path, segmentation_model_path, pose_model_path):
        # Initialize device and GPU settings
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.print_gpu_info()
        
        # GPU optimization settings
        if torch.cuda.is_available():
            torch.backends.cudnn.benchmark = True
            torch.backends.cudnn.deterministic = False
            torch.cuda.empty_cache()
            torch.cuda.set_per_process_memory_fraction(0.8)
        
        # Load models and move to GPU
        self.detection_model = YOLO(detection_model_path).to(self.device)
        self.segmentation_model = YOLO(segmentation_model_path).to(self.device)
        self.pose_model = YOLO(pose_model_path).to(self.device)
        
        # Define violence-related objects and relevant classes
        self.violence_objects = ['knife', 'gun', 'baseball bat', 'stick', 'bottle']
        self.relevant_classes = ['person'] + self.violence_objects
        
        # Define colors for visualization
        self.colors = {
            'violence': (0, 0, 255),    # Red
            'person': (0, 255, 0),      # Green
            'interaction': (255, 0, 0),  # Blue
            'keypoint': (255, 255, 0),  # Yellow
            'connection': (0, 255, 255)  # Cyan
        }
        
        # Performance and detection settings
        self.frame_skip = 2
        self.input_size = 640
        self.conf_threshold = 0.5
        self.interaction_threshold = 0.5  # For person-to-person interaction detection

    def print_gpu_info(self):
        """Print GPU information"""
        print("\nGPU Information:")
        if torch.cuda.is_available():
            print(f"GPU Device: {torch.cuda.get_device_name(0)}")
            print(f"CUDA Version: {torch.version.cuda}")
            print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
            print(f"Available Memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        else:
            print("No GPU available. Using CPU.")

    def preprocess_frame(self, frame):
        """Preprocess frame for model input"""
        try:
            # Convert to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            # Calculate size to maintain aspect ratio
            h, w = frame_rgb.shape[:2]
            r = self.input_size / max(h, w)
            new_h, new_w = int(h * r), int(w * r)
            
            # Resize
            resized = cv2.resize(frame_rgb, (new_w, new_h))
            
            # Create canvas of input_size x input_size
            canvas = np.zeros((self.input_size, self.input_size, 3), dtype=np.uint8)
            
            # Calculate padding
            pad_h = (self.input_size - new_h) // 2
            pad_w = (self.input_size - new_w) // 2
            
            # Place resized image on canvas
            canvas[pad_h:pad_h+new_h, pad_w:pad_w+new_w] = resized
            
            # Normalize
            normalized = canvas.astype(np.float32) / 255.0
            
            return normalized, (r, pad_w, pad_h)
            
        except Exception as e:
            print(f"Error in preprocessing: {e}")
            return None, None

    def analyze_person_interactions(self, person_boxes):
        """Analyze interactions between detected people"""
        interactions = []
        if len(person_boxes) < 2:
            return interactions

        for i in range(len(person_boxes)):
            for j in range(i + 1, len(person_boxes)):
                box1 = person_boxes[i]
                box2 = person_boxes[j]
                
                # Calculate centers
                center1 = [(box1[0] + box1[2])/2, (box1[1] + box1[3])/2]
                center2 = [(box2[0] + box2[2])/2, (box2[1] + box2[3])/2]
                
                # Calculate distance and box sizes
                distance = np.sqrt((center1[0] - center2[0])**2 + (center1[1] - center2[1])**2)
                box1_size = (box1[2] - box1[0]) * (box1[3] - box1[1])
                box2_size = (box2[2] - box2[0]) * (box2[3] - box2[1])
                avg_size = (box1_size + box2_size) / 2
                
                # Check for close interaction
                if distance < avg_size * self.interaction_threshold:
                   interactions.append({
                         'person1_idx': i,
                         'person2_idx': j,
                         'distance': distance,
                         'relative_distance': distance / avg_size,
                         'center1': center1,
                         'center2': center2,
                         'box1': box1,
                         'box2': box2
                     })
        
        return interactions

    def calculate_motion_features(self, prev_poses, current_poses):
        """Calculate motion features between consecutive frames"""
        try:
            if not prev_poses or not current_poses:
                return {
                    'average_speed': 0,
                    'motion_intensity': 0,
                    'sudden_movements': 0
                }

            # Convert poses to numpy arrays
            prev_poses = np.array(prev_poses)
            current_poses = np.array(current_poses)

            if prev_poses.shape == current_poses.shape:
                # Calculate displacement
                displacement = np.linalg.norm(current_poses - prev_poses, axis=2)
                average_speed = np.mean(displacement)
                motion_intensity = np.std(displacement)
                sudden_movements = np.sum(displacement > np.mean(displacement) + 2 * np.std(displacement))

                return {
                    'average_speed': float(average_speed),
                    'motion_intensity': float(motion_intensity),
                    'sudden_movements': int(sudden_movements)
                }
            
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }
            
        except Exception as e:
            print(f"Error in motion calculation: {e}")
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }
    def analyze_poses_for_violence(self, poses):
        """Analyze poses for potential aggressive/violent behavior"""
        try:
            if not poses:
                return False

            for pose in poses:
                # Convert pose to numpy array for calculations
                pose_array = np.array(pose)
                
                # Check for rapid arm movements (high confidence keypoints only)
                arm_keypoints = [5, 7, 9, 6, 8, 10]  # Shoulders, elbows, wrists
                arm_positions = pose_array[arm_keypoints]
                arm_confidences = arm_positions[:, 2]
                
                if np.mean(arm_confidences) > 0.5:
                    # Calculate arm angles and velocities
                    # Add your specific pose analysis logic here
                    return True
                    
            return False
            
        except Exception as e:
            print(f"Error in pose analysis: {e}")
            return False

    def rescale_coords(self, x, y, scale_info):
        """Rescale coordinates back to original image size"""
        scale, pad_w, pad_h = scale_info
        x_orig = (x - pad_w) / scale
        y_orig = (y - pad_h) / scale
        return int(x_orig), int(y_orig)

    def draw_detections(self, frame, det_results, pose_results, interactions, scale_info):
        """Draw detections, poses, and interactions"""
        display_frame = frame.copy()

        # Draw object detections
        for result in det_results:
            boxes = result.boxes
            for box in boxes:
                try:
                    # Get box coordinates and rescale them
                    x1, y1, x2, y2 = map(float, box.xyxy[0].cpu().numpy())
                    x1, y1 = self.rescale_coords(x1, y1, scale_info)
                    x2, y2 = self.rescale_coords(x2, y2, scale_info)
                    
                    cls = result.names[int(box.cls[0])]
                    conf = float(box.conf[0])

                    # Only draw relevant classes
                    if cls in self.relevant_classes:
                        color = (self.colors['violence'] if cls in self.violence_objects 
                                else self.colors['person'])

                        cv2.rectangle(display_frame, (x1, y1), (x2, y2), color, 2)
                        label = f'{cls} {conf:.2f}'
                        
                        (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                        cv2.rectangle(display_frame, (x1, y1-text_h-5), (x1+text_w, y1), color, -1)
                        cv2.putText(display_frame, label, (x1, y1-5), 
                                  cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

                except Exception as e:
                    print(f"Error in detection drawing: {e}")
                    continue

        # Draw interactions
        for interaction in interactions:
            try:
            # Get centers from interaction data
                 x1, y1 = self.rescale_coords(interaction['center1'][0], interaction['center1'][1], scale_info)
                 x2, y2 = self.rescale_coords(interaction['center2'][0], interaction['center2'][1], scale_info)
              
            # Draw line between interacting people
                 cv2.line(display_frame, (x1, y1), (x2, y2), self.colors['interaction'], 2)
            
            # Optional: Draw interaction distance
                 mid_point = ((x1 + x2)//2, (y1 + y2)//2)
                 distance_label = f"D: {interaction['relative_distance']:.2f}"
                 cv2.putText(display_frame, distance_label, mid_point, 
                         cv2.FONT_HERSHEY_SIMPLEX, 0.5, self.colors['interaction'], 2)
            
            except Exception as e:
                 print(f"Error drawing interaction: {e}")
                 continue

        # Draw pose keypoints and connections
        if pose_results:
            for result in pose_results:
                if result.keypoints is not None:
                    for kpts in result.keypoints:
                        try:
                            keypoints_data = kpts.data[0].cpu().numpy()
                            
                            # Draw keypoints
                            for keypoint in keypoints_data:
                                x, y, conf = keypoint
                                if conf > 0.5:
                                    x, y = self.rescale_coords(x, y, scale_info)
                                    cv2.circle(display_frame, (x, y), 4, self.colors['keypoint'], -1)

                            # Draw connections
                            connections = [(5,7), (7,9), (6,8), (8,10), (5,6), 
                                         (11,13), (13,15), (12,14), (14,16), (11,12)]
                            for connection in connections:
                                pt1 = keypoints_data[connection[0]]
                                pt2 = keypoints_data[connection[1]]
                                
                                if pt1[2] > 0.5 and pt2[2] > 0.5:
                                    x1, y1 = self.rescale_coords(pt1[0], pt1[1], scale_info)
                                    x2, y2 = self.rescale_coords(pt2[0], pt2[1], scale_info)
                                    cv2.line(display_frame, (x1, y1), (x2, y2),
                                           self.colors['connection'], 2)
                        except Exception as e:
                            print(f"Error in pose drawing: {e}")
                            continue

        # Add violence indicators
        if self.current_risk_level > 0.7:  # High risk threshold
            cv2.putText(display_frame, "HIGH RISK", (10, 60), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Add frame information
        cv2.putText(display_frame, "Press 'q' to quit, 'p' to pause/resume", 
                   (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

        return display_frame

    def extract_features(self, frame, prev_frame_data=None):
        """Extract violence-relevant features from frame"""
        try:
            # Preprocess frame
            processed_frame, scale_info = self.preprocess_frame(frame)
            if processed_frame is None:
                return None, frame

            # Convert to tensor and add batch dimension
            frame_tensor = torch.from_numpy(processed_frame).permute(2, 0, 1).unsqueeze(0).to(self.device)

            # Run models with GPU acceleration
            with torch.cuda.amp.autocast():
                det_results = self.detection_model(frame_tensor, verbose=False)
                pose_results = self.pose_model(frame_tensor, verbose=False)

            # Initialize features
            features = {
                'objects': [],
                'poses': [],
                'interactions': [],
                'motion': {},
                'violence_indicators': {
                    'weapon_present': False,
                    'close_interaction': False,
                    'rapid_motion': False,
                    'aggressive_pose': False
                }
            }

            # Process relevant detections
            person_boxes = []
            for result in det_results:
                for box in result.boxes:
                    try:
                        cls = result.names[int(box.cls[0])]
                        if cls in self.relevant_classes:
                            conf = float(box.conf[0])
                            box_coords = box.xyxy[0].cpu().numpy().tolist()
                            
                            features['objects'].append({
                                'class': cls,
                                'confidence': conf,
                                'box': box_coords
                            })
                            
                            if cls == 'person':
                                person_boxes.append(box_coords)
                            elif cls in self.violence_objects:
                                features['violence_indicators']['weapon_present'] = True
                    except Exception as e:
                        print(f"Error processing detection: {e}")
                        continue

            # Analyze person interactions
            if len(person_boxes) >= 2:
                interactions = self.analyze_person_interactions(person_boxes)
                features['interactions'] = interactions
                features['violence_indicators']['close_interaction'] = len(interactions) > 0

            # Process poses and analyze for violence
            if pose_results:
                for result in pose_results:
                    if result.keypoints is not None:
                        for kpts in result.keypoints:
                            try:
                                pose_data = kpts.data[0].cpu().numpy().tolist()
                                features['poses'].append(pose_data)
                            except Exception as e:
                                print(f"Error processing pose: {e}")
                                continue

                features['violence_indicators']['aggressive_pose'] = self.analyze_poses_for_violence(features['poses'])

            # Calculate motion features
            if prev_frame_data and 'poses' in prev_frame_data:
                motion_features = self.calculate_motion_features(
                    prev_frame_data['poses'], features['poses'])
                features['motion'] = motion_features
                
                features['violence_indicators']['rapid_motion'] = motion_features.get('average_speed', 0) > 10

            # Calculate overall risk level
            risk_weights = {
                'weapon_present': 0.4,
                'close_interaction': 0.3,
                'rapid_motion': 0.2,
                'aggressive_pose': 0.1
            }
            
            self.current_risk_level = sum(
                risk_weights[indicator] * int(value)
                for indicator, value in features['violence_indicators'].items()
            )

            # Draw detections
            annotated_frame = self.draw_detections(
                frame, det_results, pose_results, 
                features['interactions'], scale_info
            )

            return features, annotated_frame

        except Exception as e:
            print(f"Error in feature extraction: {e}")
            return None, frame
def process_video(video_path, extractor, output_path):
    """Process video with GPU acceleration"""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video file")
        return

    # Get video properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Create video writer
    output_video_path = video_path.rsplit('.', 1)[0] + '_analyzed.mp4'
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    # Initialize data storage
    video_data = {
        'metadata': {
            'path': video_path,
            'fps': fps,
            'frame_count': frame_count,
            'width': frame_width,
            'height': frame_height
        },
        'frames': []
    }

    frame_idx = 0
    prev_frame_data = None
    paused = False

    try:
        while True:
            if not paused:
                ret, frame = cap.read()
                if not ret:
                    break

                # Skip frames if needed
                if frame_idx % extractor.frame_skip != 0:
                    frame_idx += 1
                    continue

                # Extract features and get annotated frame
                features, annotated_frame = extractor.extract_features(frame, prev_frame_data)
                
                if features is not None:
                    frame_data = {
                        'frame_index': frame_idx,
                        'timestamp': frame_idx / fps,
                        'features': features
                    }
                    
                    video_data['frames'].append(frame_data)
                    prev_frame_data = features
                    out.write(annotated_frame)

                    # Show progress
                    if frame_idx % (30 * extractor.frame_skip) == 0:
                        progress = (frame_idx / frame_count) * 100
                        print(f"Processing: {progress:.1f}% complete")
                        if torch.cuda.is_available():
                            print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

                    # Display frame
                    cv2.imshow('Violence Detection Analysis', annotated_frame)

                frame_idx += 1

                # Handle key events
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                elif key == ord('p'):
                    paused = not paused
                    print("Paused - Press 'p' to resume" if paused else "Resumed")

    except Exception as e:
        print(f"Error during processing: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Cleanup
        cap.release()
        out.release()
        cv2.destroyAllWindows()
        
        # Final GPU cleanup
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Save features
        try:
            with open(output_path, 'w') as f:
                yaml.dump(video_data, f, default_flow_style=False)
            print(f"Features saved to: {output_path}")
            print(f"Analyzed video saved to: {output_video_path}")
        except Exception as e:
            print(f"Error saving data: {e}")

    return video_data

if __name__ == "__main__":
    # Initialize Tkinter
    root = Tk()
    root.withdraw()

    # Get video file
    video_path = filedialog.askopenfilename(
        title="Select Video File",
        filetypes=[("Video Files", "*.mp4;*.avi")]
    )

    if not video_path:
        print("No video file selected")
        exit()

    # Define model paths
    detection_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m.pt'
    segmentation_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-seg.pt'
    pose_model_path = r'C:\Users\harme\Desktop\violence detection\yolo11m-pose.pt'
    
    # Initialize feature extractor
    extractor = ViolenceFeatureExtractor(
        detection_model_path,
        segmentation_model_path,
        pose_model_path
    )

    # Process video
    output_path = r'C:\Users\harme\Desktop\video-detect-gpu\violence_features.yaml'
    video_data = process_video(video_path, extractor, output_path)

    print("Analysis complete!")




GPU Information:
GPU Device: NVIDIA GeForce RTX 3050 Laptop GPU
CUDA Version: 11.8
Total GPU Memory: 4.29 GB
Available Memory: 0.00 GB


  with torch.cuda.amp.autocast():


Processing: 0.0% complete
GPU Memory: 0.30 GB
Processing: 52.6% complete
GPU Memory: 0.30 GB
Features saved to: C:\Users\harme\Desktop\video-detect-gpu\violence_features.yaml
Analyzed video saved to: C:/Users/harme/Desktop/video-detect-gpu/V_5_analyzed.mp4
Analysis complete!


In [None]:
import cv2
import yaml
import torch
import torch.cuda
import numpy as np
from ultralytics import YOLO
from tkinter import Tk, filedialog
from scipy.spatial.distance import cdist

class FeatureExtractor:
    def __init__(self, model_state_path = 'model_state.pth'):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if torch.cuda.is_available():
            torch.backends.cudnn.benchmark = True
            torch.backends.cudnn.deterministic = False
            torch.cuda.empty_cache()
            torch.cuda.set_per_process_memory_fraction(0.8)
            
        self.print_gpu_info()
        self.load_models(model_state_path)
        
    def print_gpu_info(self):
        """Print GPU information"""
        print("\nGPU Information:")
        if torch.cuda.is_available():
            print(f"GPU Device: {torch.cuda.get_device_name(0)}")
            print(f"CUDA Version: {torch.version.cuda}")
            print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
            print(f"Available Memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        else:
            print("No GPU available. Using CPU.")
        
        
    
    def load_models(self, model_state_path):
        """Load models from the saved state"""
        print("Loading models...")
        model_state = torch.load(model_state_path, map_location=self.device)
        
        # Initialize models
        self.detection_model = YOLO().to(self.device)
        self.segmentation_model = YOLO().to(self.device)
        self.pose_model = YOLO().to(self.device)
        
        # Load state dicts
        self.detection_model.load_state_dict(model_state['detection_model'])
        self.segmentation_model.load_state_dict(model_state['segmentation_model'])
        self.pose_model.load_state_dict(model_state['pose_model'])
        
        print("Models loaded successfully!")
        
    def define_violence_detection_parameters(self):
        self.violence_objects = ["knife","gun","baseball bat", "stick","bottle"]
        self.relevant_classes = ["person"] + self.violence_objects
        
        self.colors = {
            'violence': (0, 0, 255),    # Red
            'person': (0, 255, 0),      # Green
            'interaction': (255, 0, 0),  # Blue
            'keypoint': (255, 255, 0),  # Yellow
            'connection': (0, 255, 255)  # Cyan
        }
        
        self.frame_skip = 2
        self.input_size = 640
        self.conf_threshold = 0.5
        self.interaction_threshold = 0.5
        
    def preprocess_frame(self,frame):
        
        try:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            h, w = frame_rgb.shape[:2]
            r = self.input_size / max(h, w)
            new_h, new_w = int(h * r), int(w * r)
            
            resized = cv2.resize(frame_rgb, (new_h, new_w))
            
            canvas = np.zeros((self.input_size, self.input_size, 3), dtype=np.uint8)
            
            pad_h = (self.input_size, - new_h) // 2
            pad_w = (self.input_size - new_w) // 2
            
            canvas[pad_h : pad_h + new_h, pad_w : pad_w + new_w] = resized
            
            normalized = canvas.astype(np.float32) / 255.0
            
            return normalized, (r, pad_w, pad_h)
        
        except Exception as e:
            print(f"Error in preprocessing: {e}")
            return None, None
        
    def analyze_person_interactions(self, person_boxes):
        
        interactions = []
        if len(person_boxes) < 2:
            return interactions
        
        for i in range(len(person_boxes)):
            for j in range(i+1, len(person_boxes)):
                box1 = person_boxes[i]
                box2 = person_boxes[j]
                
                center1 = [(box1[0] + box1[2])/2, (box1[1] + box1[3])/2]
                center2 = [(box2[0] + box2[2])/2, (box2[1] + box2[3])/2]
                
                distance = np.sqrt((center1[0] - center2[0])**2 + (center1[1] - center2[1])**2)
                box1_size = (box1[2] - box1[0]) * (box1[3] - box1[1])
                box2_size = (box2[2] - box2[0]) * (box2[3] - box2[1])
                avg_size = (box1_size + box2_size) / 2
                
                # Check for close interaction
                if distance < avg_size * self.interaction_threshold:
                   interactions.append({
                         'person1_idx': i,
                         'person2_idx': j,
                         'distance': distance,
                         'relative_distance': distance / avg_size,
                         'center1': center1,
                         'center2': center2,
                         'box1': box1,
                         'box2': box2
                     })
        
        return interactions

    def calculate_motion_features(self, prev_poses, current_poses):
        """Calculate motion features between consecutive frames"""
        try:
            if not prev_poses or not current_poses:
                return {
                    'average_speed': 0,
                    'motion_intensity': 0,
                    'sudden_movements': 0
                }

            # Convert poses to numpy arrays
            prev_poses = np.array(prev_poses)
            current_poses = np.array(current_poses)

            if prev_poses.shape == current_poses.shape:
                # Calculate displacement
                displacement = np.linalg.norm(current_poses - prev_poses, axis=2)
                average_speed = np.mean(displacement)
                motion_intensity = np.std(displacement)
                sudden_movements = np.sum(displacement > np.mean(displacement) + 2 * np.std(displacement))

                return {
                    'average_speed': float(average_speed),
                    'motion_intensity': float(motion_intensity),
                    'sudden_movements': int(sudden_movements)
                }
            
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }
            
        except Exception as e:
            print(f"Error in motion calculation: {e}")
            return {
                'average_speed': 0,
                'motion_intensity': 0,
                'sudden_movements': 0
            }
    def analyze_poses_for_violence(self, poses):
        """Analyze poses for potential aggressive/violent behavior"""
        try:
            if not poses:
                return False

            for pose in poses:
                # Convert pose to numpy array for calculations
                pose_array = np.array(pose)
                
                # Check for rapid arm movements (high confidence keypoints only)
                arm_keypoints = [5, 7, 9, 6, 8, 10]  # Shoulders, elbows, wrists
                arm_positions = pose_array[arm_keypoints]
                arm_confidences = arm_positions[:, 2]
                
                if np.mean(arm_confidences) > 0.5:
                    # Calculate arm angles and velocities
                    # Add your specific pose analysis logic here
                    return True
                    
            return False
            
        except Exception as e:
            print(f"Error in pose analysis: {e}")
            return False

    def rescale_coords(self, x, y, scale_info):
        """Rescale coordinates back to original image size"""
        scale, pad_w, pad_h = scale_info
        x_orig = (x - pad_w) / scale
        y_orig = (y - pad_h) / scale
        return int(x_orig), int(y_orig)

    def draw_detections(self, frame, det_results, pose_results, interactions, scale_info):
        """Draw detections, poses, and interactions"""
        display_frame = frame.copy()

        # Draw object detections
        for result in det_results:
            boxes = result.boxes
            for box in boxes:
                try:
                    # Get box coordinates and rescale them
                    x1, y1, x2, y2 = map(float, box.xyxy[0].cpu().numpy())
                    x1, y1 = self.rescale_coords(x1, y1, scale_info)
                    x2, y2 = self.rescale_coords(x2, y2, scale_info)
                    
                    cls = result.names[int(box.cls[0])]
                    conf = float(box.conf[0])

                    # Only draw relevant classes
                    if cls in self.relevant_classes:
                        color = (self.colors['violence'] if cls in self.violence_objects 
                                else self.colors['person'])

                        cv2.rectangle(display_frame, (x1, y1), (x2, y2), color, 2)
                        label = f'{cls} {conf:.2f}'
                        
                        (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                        cv2.rectangle(display_frame, (x1, y1-text_h-5), (x1+text_w, y1), color, -1)
                        cv2.putText(display_frame, label, (x1, y1-5), 
                                  cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

                except Exception as e:
                    print(f"Error in detection drawing: {e}")
                    continue

        # Draw interactions
        for interaction in interactions:
            try:
            # Get centers from interaction data
                 x1, y1 = self.rescale_coords(interaction['center1'][0], interaction['center1'][1], scale_info)
                 x2, y2 = self.rescale_coords(interaction['center2'][0], interaction['center2'][1], scale_info)
              
            # Draw line between interacting people
                 cv2.line(display_frame, (x1, y1), (x2, y2), self.colors['interaction'], 2)
            
            # Optional: Draw interaction distance
                 mid_point = ((x1 + x2)//2, (y1 + y2)//2)
                 distance_label = f"D: {interaction['relative_distance']:.2f}"
                 cv2.putText(display_frame, distance_label, mid_point, 
                         cv2.FONT_HERSHEY_SIMPLEX, 0.5, self.colors['interaction'], 2)
            
            except Exception as e:
                 print(f"Error drawing interaction: {e}")
                 continue

        # Draw pose keypoints and connections
        if pose_results:
            for result in pose_results:
                if result.keypoints is not None:
                    for kpts in result.keypoints:
                        try:
                            keypoints_data = kpts.data[0].cpu().numpy()
                            
                            # Draw keypoints
                            for keypoint in keypoints_data:
                                x, y, conf = keypoint
                                if conf > 0.5:
                                    x, y = self.rescale_coords(x, y, scale_info)
                                    cv2.circle(display_frame, (x, y), 4, self.colors['keypoint'], -1)

                            # Draw connections
                            connections = [(5,7), (7,9), (6,8), (8,10), (5,6), 
                                         (11,13), (13,15), (12,14), (14,16), (11,12)]
                            for connection in connections:
                                pt1 = keypoints_data[connection[0]]
                                pt2 = keypoints_data[connection[1]]
                                
                                if pt1[2] > 0.5 and pt2[2] > 0.5:
                                    x1, y1 = self.rescale_coords(pt1[0], pt1[1], scale_info)
                                    x2, y2 = self.rescale_coords(pt2[0], pt2[1], scale_info)
                                    cv2.line(display_frame, (x1, y1), (x2, y2),
                                           self.colors['connection'], 2)
                        except Exception as e:
                            print(f"Error in pose drawing: {e}")
                            continue

        # Add violence indicators
        if self.current_risk_level > 0.7:  # High risk threshold
            cv2.putText(display_frame, "HIGH RISK", (10, 60), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Add frame information
        cv2.putText(display_frame, "Press 'q' to quit, 'p' to pause/resume", 
                   (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

        return display_frame

    def extract_features(self, frame, prev_frame_data=None):
        """Extract violence-relevant features from frame"""
        try:
            # Preprocess frame
            processed_frame, scale_info = self.preprocess_frame(frame)
            if processed_frame is None:
                return None, frame

            # Convert to tensor and add batch dimension
            frame_tensor = torch.from_numpy(processed_frame).permute(2, 0, 1).unsqueeze(0).to(self.device)

            # Run models with GPU acceleration
            with torch.cuda.amp.autocast():
                det_results = self.detection_model(frame_tensor, verbose=False)
                pose_results = self.pose_model(frame_tensor, verbose=False)

            # Initialize features
            features = {
                'objects': [],
                'poses': [],
                'interactions': [],
                'motion': {},
                'violence_indicators': {
                    'weapon_present': False,
                    'close_interaction': False,
                    'rapid_motion': False,
                    'aggressive_pose': False
                }
            }

            # Process relevant detections
            person_boxes = []
            for result in det_results:
                for box in result.boxes:
                    try:
                        cls = result.names[int(box.cls[0])]
                        if cls in self.relevant_classes:
                            conf = float(box.conf[0])
                            box_coords = box.xyxy[0].cpu().numpy().tolist()
                            
                            features['objects'].append({
                                'class': cls,
                                'confidence': conf,
                                'box': box_coords
                            })
                            
                            if cls == 'person':
                                person_boxes.append(box_coords)
                            elif cls in self.violence_objects:
                                features['violence_indicators']['weapon_present'] = True
                    except Exception as e:
                        print(f"Error processing detection: {e}")
                        continue

            # Analyze person interactions
            if len(person_boxes) >= 2:
                interactions = self.analyze_person_interactions(person_boxes)
                features['interactions'] = interactions
                features['violence_indicators']['close_interaction'] = len(interactions) > 0

            # Process poses and analyze for violence
            if pose_results:
                for result in pose_results:
                    if result.keypoints is not None:
                        for kpts in result.keypoints:
                            try:
                                pose_data = kpts.data[0].cpu().numpy().tolist()
                                features['poses'].append(pose_data)
                            except Exception as e:
                                print(f"Error processing pose: {e}")
                                continue

                features['violence_indicators']['aggressive_pose'] = self.analyze_poses_for_violence(features['poses'])

            # Calculate motion features
            if prev_frame_data and 'poses' in prev_frame_data:
                motion_features = self.calculate_motion_features(
                    prev_frame_data['poses'], features['poses'])
                features['motion'] = motion_features
                
                features['violence_indicators']['rapid_motion'] = motion_features.get('average_speed', 0) > 10

            # Calculate overall risk level
            risk_weights = {
                'weapon_present': 0.4,
                'close_interaction': 0.3,
                'rapid_motion': 0.2,
                'aggressive_pose': 0.1
            }
            
            self.current_risk_level = sum(
                risk_weights[indicator] * int(value)
                for indicator, value in features['violence_indicators'].items()
            )

            # Draw detections
            annotated_frame = self.draw_detections(
                frame, det_results, pose_results, 
                features['interactions'], scale_info
            )

            return features, annotated_frame

        except Exception as e:
            print(f"Error in feature extraction: {e}")
            return None, frame
def process_video(video_path, extractor, output_path):
    """Process video with GPU acceleration"""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video file")
        return

    # Get video properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Create video writer
    output_video_path = video_path.rsplit('.', 1)[0] + '_analyzed.mp4'
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    # Initialize data storage
    video_data = {
        'metadata': {
            'path': video_path,
            'fps': fps,
            'frame_count': frame_count,
            'width': frame_width,
            'height': frame_height
        },
        'frames': []
    }

    frame_idx = 0
    prev_frame_data = None
    paused = False

    try:
        while True:
            if not paused:
                ret, frame = cap.read()
                if not ret:
                    break

                # Skip frames if needed
                if frame_idx % extractor.frame_skip != 0:
                    frame_idx += 1
                    continue

                # Extract features and get annotated frame
                features, annotated_frame = extractor.extract_features(frame, prev_frame_data)
                
                if features is not None:
                    frame_data = {
                        'frame_index': frame_idx,
                        'timestamp': frame_idx / fps,
                        'features': features
                    }
                    
                    video_data['frames'].append(frame_data)
                    prev_frame_data = features
                    out.write(annotated_frame)

                    # Show progress
                    if frame_idx % (30 * extractor.frame_skip) == 0:
                        progress = (frame_idx / frame_count) * 100
                        print(f"Processing: {progress:.1f}% complete")
                        if torch.cuda.is_available():
                            print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

                    # Display frame
                    cv2.imshow('Violence Detection Analysis', annotated_frame)

                frame_idx += 1

                # Handle key events
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                elif key == ord('p'):
                    paused = not paused
                    print("Paused - Press 'p' to resume" if paused else "Resumed")

    except Exception as e:
        print(f"Error during processing: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Cleanup
        cap.release()
        out.release()
        cv2.destroyAllWindows()
        
        # Final GPU cleanup
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Save features
        try:
            with open(output_path, 'w') as f:
                yaml.dump(video_data, f, default_flow_style=False)
            print(f"Features saved to: {output_path}")
            print(f"Analyzed video saved to: {output_video_path}")
        except Exception as e:
            print(f"Error saving data: {e}")

    return video_data

if __name__ == "__main__":
    # Initialize Tkinter
    root = Tk()
    root.withdraw()

    # Get video file
    video_path = filedialog.askopenfilename(
        title="Select Video File",
        filetypes=[("Video Files", "*.mp4;*.avi")]
    )

    if not video_path:
        print("No video file selected")
        exit()                
                
            
        
        
    
        
    
    
    
    

In [None]:


import torch
from ultralytics import YOLO
import argparse

class ModelSetup:
    def __init__(self, detection_model_path, segmentation_model_path, pose_model_path):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.print_gpu_info()
        
        # Load models
        self.detection_model = YOLO(detection_model_path).to(self.device)
        self.segmentation_model = YOLO(segmentation_model_path).to(self.device)
        self.pose_model = YOLO(pose_model_path).to(self.device)

    def print_gpu_info(self):
        """Print GPU information"""
        if torch.cuda.is_available():
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
            print(f"CUDA Version: {torch.version.cuda}")
            print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        else:
            print("Using CPU.")

    def save_models(self):
        """Save the model setup state for future use"""
        torch.save({
            'detection_model': self.detection_model.state_dict(),
            'segmentation_model': self.segmentation_model.state_dict(),
            'pose_model': self.pose_model.state_dict()
        }, 'model_state.pth')

def main():
    parser = argparse.ArgumentParser(description="Set up models for feature extraction")
    parser.add_argument('detection_model', type=str, help='Path to detection model')
    parser.add_argument('segmentation_model', type=str, help='Path to segmentation model')
    parser.add_argument('pose_model', type=str, help='Path to pose model')
    args = parser.parse_args()
    
    setup = ModelSetup(args.detection_model, args.segmentation_model, args.pose_model)
    setup.save_models()

if __name__ == "__main__":
    main()



In [1]:
import tempfile