### Nota: El modelo solo funciona con TensorFlow y Keras 2.15; para instalarlos se requiere Python 3.9.

In [1]:
import os
from tensorflow.keras.models import load_model
from collections import defaultdict
import cv2
import numpy as np
import time
import json
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator, colors
from collections import deque




In [3]:
class Person:
    """A detected person: an identifier, a bounding box and a suspicion flag.

    Attributes:
        id: Identifier given by the caller.
        x1, y1: Top-left corner of the bounding box.
        x2, y2: Bottom-right corner of the bounding box.
        thief: True when the person is flagged as a theft suspect.
        zones: List owned by the instance; starts empty.
    """

    def __init__(self, id, x1, y1, x2, y2):
        self.id = id
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        self.thief = False  # Becomes True once the person is flagged as a suspect
        self.zones = []

    def set_thief_status(self, status: bool):
        """Update the suspicion flag of the person."""
        self.thief = status

    def get_color(self):
        """Return the BGR drawing color: red for a suspect, green otherwise."""
        return (0, 0, 255) if self.thief else (0, 255, 0)

    def get_label(self):
        """Return the text drawn next to the bounding box.

        The label is kept ASCII-only because the Hershey fonts used by
        ``cv2.putText`` replace characters they cannot render with question
        marks.
        """
        return "Ladron" if self.thief else "Persona"

    def draw(self, frame):
        """Draw the bounding box and its label on ``frame`` (modified in place)."""
        color = self.get_color()
        # OpenCV drawing functions require integer pixel coordinates.
        left, top = int(self.x1), int(self.y1)
        right, bottom = int(self.x2), int(self.y2)

        cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
        cv2.putText(frame, self.get_label(), (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

In [2]:
class YoloDetector:
    """Flag theft in a video using a YOLO tracker and a sequence classifier.

    Two modes are available:

    * ``detect_thief``: every tracked person is cropped from the frame and the
      crops of each track are classified as an independent sequence.
    * ``detect_thief_seen``: the whole frame is classified as one sequence.

    In both modes the classifier receives the last ``SEQUENCE_LENGTH``
    preprocessed images, and class index 0 is treated as "theft".

    Note: the constructor loads both models and immediately processes the
    video in the mode selected by ``individual_persons``.

    Args:
        yolo_model: Path (or name) of the YOLO weights passed to ``YOLO``.
        steal_model: Path of the Keras model passed to ``load_model``.
        confidence_threshold: Minimum classifier score required to flag theft.
        video_path: Source handed to ``cv2.VideoCapture``.
        individual_persons: True to run ``detect_thief``, False to run
            ``detect_thief_seen``.
        output_video_path: File where the annotated video is written.
    """

    def __init__(self, yolo_model, steal_model, confidence_threshold=0.7, video_path="video.mp4",
                 individual_persons=False, output_video_path="output_video.mp4"):
        self.confidence_threshold = confidence_threshold
        self.video_path = video_path
        self.output_video_path = output_video_path  # Where the processed video is saved
        self.people = list()
        self.zones = list()

        self.IMG_SIZE = (90, 90)
        self.SEQUENCE_LENGTH = 160

        # Sliding window of whole frames (used by detect_thief_seen).
        self.frame_queue = deque(maxlen=self.SEQUENCE_LENGTH)
        # One sliding window of crops per track id (used by detect_thief), so
        # sequences of different people are never mixed.
        self.track_queues = {}
        # Frame index at which each track id was last seen, used for pruning.
        self._track_last_seen = {}

        # Load the YOLO model
        self.yolo_model = YOLO(yolo_model)
        # Load the LRCN theft model
        self.steal_model = load_model(steal_model)

        if individual_persons:
            self.detect_thief()
        else:
            self.detect_thief_seen()

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _fourcc_tag(output_path):
        """Return a FourCC tag suited to the container implied by ``output_path``.

        MP4-family containers get ``mp4v``; anything else keeps ``XVID``.
        """
        extension = os.path.splitext(str(output_path))[1].lower()
        return "mp4v" if extension in (".mp4", ".m4v", ".mov") else "XVID"

    @staticmethod
    def _clip_box(box, width, height):
        """Clamp an ``(x1, y1, x2, y2)`` box to the frame.

        Returns:
            The clamped integer box, or None when nothing of the box remains
            inside the frame (an empty crop would make ``cv2.resize`` fail).
        """
        x1, y1, x2, y2 = (int(value) for value in box)
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(width, x2), min(height, y2)
        if x2 <= x1 or y2 <= y1:
            return None
        return x1, y1, x2, y2

    def _is_theft(self, prediction, confidence):
        """Return True when class 0 was predicted above the confidence threshold."""
        return bool(int(prediction) == 0 and confidence > self.confidence_threshold)

    def _queue_for_track(self, track_id, frame_index):
        """Return the crop window of ``track_id``, creating it on first sight."""
        queue = self.track_queues.get(track_id)
        if queue is None:
            queue = deque(maxlen=self.SEQUENCE_LENGTH)
            self.track_queues[track_id] = queue
        self._track_last_seen[track_id] = frame_index
        return queue

    def _prune_stale_tracks(self, frame_index, max_age=None):
        """Drop the windows of tracks unseen for more than ``max_age`` frames.

        Without pruning, every track id ever seen would keep a full window of
        crops alive for the rest of the video. ``max_age`` defaults to
        ``SEQUENCE_LENGTH``.
        """
        if max_age is None:
            max_age = self.SEQUENCE_LENGTH
        stale_ids = [track_id for track_id, last_seen in self._track_last_seen.items()
                     if frame_index - last_seen > max_age]
        for track_id in stale_ids:
            del self._track_last_seen[track_id]
            self.track_queues.pop(track_id, None)

    # ------------------------------------------------------------------
    # Model helpers
    # ------------------------------------------------------------------
    def preprocess_frame(self, frame):
        """Convert a BGR image into the classifier's per-frame input.

        The image is resized to ``IMG_SIZE``, converted to grayscale, scaled
        to [0, 1] and given a trailing channel axis.
        """
        frame = cv2.resize(frame, self.IMG_SIZE)  # Resize to 90x90
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
        frame = frame / 255.0  # Normalize
        frame = np.expand_dims(frame, axis=-1)  # Add the channel axis
        return frame

    def predict_thief(self, frame):
        """Classify one sequence of preprocessed frames.

        Args:
            frame: Sequence (e.g. a deque) of preprocessed frames.

        Returns:
            ``(prediction, confidence)``: index of the highest-scoring class
            and its score.
        """
        input_sequence = np.expand_dims(np.asarray(list(frame)), axis=0)
        # verbose=0: this runs once per video frame, so the per-call progress
        # bar would flood the output.
        predictions = self.steal_model.predict(input_sequence, verbose=0)

        # A single sequence is submitted, so only the first row is relevant.
        scores = predictions[0]
        prediction = np.argmax(scores)
        confidence = scores[prediction]

        return prediction, confidence

    # ------------------------------------------------------------------
    # Video plumbing shared by both modes
    # ------------------------------------------------------------------
    def _open_streams(self):
        """Open the input capture and the output writer.

        Returns:
            ``(cap, out, width, height)``, or None when the input video cannot
            be opened.
        """
        cap = cv2.VideoCapture(self.video_path)
        if not cap.isOpened():
            cap.release()
            print(f"No se pudo abrir el video: {self.video_path}")
            return None

        # Read the video properties (resolution and FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Some sources report 0 (or NaN); a writer needs a positive rate.
            fps = 30.0

        # VideoWriter does not create missing directories; it just fails to open.
        output_dir = os.path.dirname(str(self.output_video_path))
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        fourcc = cv2.VideoWriter_fourcc(*self._fourcc_tag(self.output_video_path))
        out = cv2.VideoWriter(self.output_video_path, fourcc, fps, (width, height))
        if not out.isOpened():
            print(f"Advertencia: no se pudo abrir el archivo de salida: {self.output_video_path}")

        return cap, out, width, height

    def _process_video(self, annotate_frame):
        """Read the video frame by frame, annotate, save and display it.

        ``annotate_frame(frame, frame_index, width, height)`` draws on the
        frame in place. Processing stops at the end of the video, on a read
        error, or when "q" is pressed in the preview window.
        """
        streams = self._open_streams()
        if streams is None:
            return
        cap, out, width, height = streams

        frame_index = 0
        try:
            while True:
                ret, frame = cap.read()
                if not ret or frame is None:
                    print("Fin del video o error en la lectura del frame.")
                    break

                annotate_frame(frame, frame_index, width, height)
                frame_index += 1

                # Save the processed frame to the output video
                out.write(frame)

                cv2.imshow("object-detection-tracking", frame)
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
        finally:
            # Release everything even if a model call raised mid-video.
            cap.release()
            out.release()
            cv2.destroyAllWindows()

    # ------------------------------------------------------------------
    # Per-frame annotation for each mode
    # ------------------------------------------------------------------
    def _annotate_persons(self, frame, frame_index, width, height):
        """Track people in ``frame`` and label those classified as thieves."""
        # Crops are taken from an untouched copy so that dots and labels drawn
        # on ``frame`` never reach the classifier.
        clean_frame = frame.copy()
        annotator = Annotator(frame, line_width=2)
        results = self.yolo_model.track(frame, persist=True)

        detections = results[0].boxes
        if detections.id is not None:
            boxes = detections.xyxy.cpu().numpy()
            class_ids = detections.cls.cpu().numpy()
            track_ids = detections.id.int().cpu().tolist()

            for box, class_id, track_id in zip(boxes, class_ids, track_ids):
                if int(class_id) != 0:  # Only people
                    continue

                centroid_x = int((box[0] + box[2]) / 2)
                centroid_y = int((box[1] + box[3]) / 2)
                cv2.circle(frame, (centroid_x, centroid_y), 5, (0, 0, 255), -1)

                clipped = self._clip_box(box, width, height)
                if clipped is None:
                    continue
                x1, y1, x2, y2 = clipped

                # Crop the detected person and push it to that person's window
                person_crop = clean_frame[y1:y2, x1:x2]
                queue = self._queue_for_track(track_id, frame_index)
                queue.append(self.preprocess_frame(person_crop))

                if len(queue) == self.SEQUENCE_LENGTH:
                    prediction, confidence = self.predict_thief(queue)
                    if self._is_theft(prediction, confidence):
                        annotator.box_label(box, "Ladron", color=(0, 0, 255))

        self._prune_stale_tracks(frame_index)

    def _annotate_scene(self, frame, frame_index, width, height):
        """Classify the whole scene and stamp a warning when theft is detected."""
        self.frame_queue.append(self.preprocess_frame(frame))

        if len(self.frame_queue) == self.SEQUENCE_LENGTH:
            prediction, confidence = self.predict_thief(self.frame_queue)
            if self._is_theft(prediction, confidence):
                cv2.putText(frame, "Robo detectado", (10, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------
    def detect_thief(self):
        """Process the video classifying each tracked person separately."""
        # Start from empty windows so a previous run cannot leak into this one.
        self.track_queues.clear()
        self._track_last_seen.clear()
        self._process_video(self._annotate_persons)

    def detect_thief_seen(self):
        """Process the video classifying the whole scene."""
        # Start from an empty window so a previous run cannot leak into this one.
        self.frame_queue.clear()
        self._process_video(self._annotate_scene)

In [3]:
# Model files.
yolo_model = "../src/models/yolo11n.pt"
steal_model = "../src/models/lrcn_160S_90_90Q.h5"

# Input video and destination of the annotated copy.
video_path = "../data/robo1.mp4"
output_video_path = "../outputs/steal_output.mp4"

# Run options: True selects the per-person mode (detect_thief); the threshold
# is the minimum classifier score required to flag a theft.
individual_persons = True
confidence_threshold = 0.7

# Building the detector loads both models and processes the video right away.
detector = YoloDetector(
    yolo_model=yolo_model,
    steal_model=steal_model,
    confidence_threshold=confidence_threshold,
    video_path=video_path,
    individual_persons=individual_persons,
    output_video_path=output_video_path,
)




0: 384x640 1 person, 120.0ms
Speed: 4.9ms preprocess, 120.0ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 106.0ms
Speed: 5.4ms preprocess, 106.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 94.3ms
Speed: 3.9ms preprocess, 94.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 91.9ms
Speed: 2.8ms preprocess, 91.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 97.2ms
Speed: 3.8ms preprocess, 97.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 89.3ms
Speed: 2.6ms preprocess, 89.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 158.6ms
Speed: 2.6ms preprocess, 158.6ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 177.7ms
Speed: 3.9ms preprocess, 177.7ms inference, 1.3ms postprocess per image at shape