In [None]:
import cv2
import torch
import clip
from PIL import Image
from torchvision import transforms
from ultralytics import YOLO
from ultralytics.nn.tasks import attempt_load_one_weight, torch_safe_load

ModuleNotFoundError: No module named 'clip'

In [None]:
class GenderDetectorCLIP:
    """Detect people with a YOLO model and label each detection using CLIP.

    For every sufficiently large detection box, the cropped region is scored
    by CLIP ViT-B/32 against two text prompts ("a photo of a man" /
    "a photo of a woman") and labelled "Man" or "Woman" accordingly.
    """

    # A box is classified only when both sides are strictly larger than this (pixels).
    MIN_BOX_SIZE = 40
    # Output frame rate used when the video source does not report a usable one.
    DEFAULT_FPS = 20.0

    def __init__(self, yolo_model_path="train18_best_model.pt", device=None):
        """Load the YOLO detector and the CLIP model.

        Args:
            yolo_model_path: Path to the YOLO weights file.
            device: Torch device string for CLIP; defaults to "cuda" when
                available, otherwise "cpu".
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        # Use the high-level YOLO wrapper: it accepts raw frames and returns
        # result objects exposing `.boxes`, which `detect_people` relies on.
        self.yolo_model = YOLO(yolo_model_path)
        self.clip_model, self.preprocess = clip.load("ViT-B/32", device=self.device)
        self.text_inputs = clip.tokenize(["a photo of a man", "a photo of a woman"]).to(self.device)

        # The prompts never change, so encode and normalise them once here
        # instead of on every batch.
        with torch.no_grad():
            text_features = self.clip_model.encode_text(self.text_inputs)
            self.text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    def detect_people(self, frame):
        """Return `(x1, y1, x2, y2)` integer boxes for detections of class id 0.

        Args:
            frame: Image passed straight to the YOLO model.

        Returns:
            List of 4-tuples of ints, one per kept detection.
        """
        results = self.yolo_model(frame)[0]
        boxes = []
        for det in results.boxes:
            # Class id 0 is treated as 'person'.
            # NOTE(review): confirm this matches the class map of the custom weights.
            if int(det.cls[0]) != 0:
                continue
            x1, y1, x2, y2 = map(int, det.xyxy[0])
            boxes.append((x1, y1, x2, y2))
        return boxes

    def batch_detect_gender_clip(self, faces):
        """Classify a list of BGR crops as "Man" or "Woman" in one CLIP batch.

        Args:
            faces: List of BGR image arrays (converted to RGB before preprocessing).

        Returns:
            List of labels, one per input crop, in the same order. An empty
            input yields an empty list.
        """
        if not faces:
            return []

        face_tensors = [
            self.preprocess(Image.fromarray(cv2.cvtColor(face, cv2.COLOR_BGR2RGB)))
            for face in faces
        ]
        face_batch = torch.stack(face_tensors).to(self.device)

        with torch.no_grad():
            image_features = self.clip_model.encode_image(face_batch)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ self.text_features.T).softmax(dim=-1)

        # Column 0 is the "man" prompt, column 1 the "woman" prompt.
        return ["Man" if man > woman else "Woman" for man, woman in similarity.tolist()]

    def annotate_frame(self, frame, boxes):
        """Draw a rectangle and gender label for every large-enough box.

        Boxes whose width or height is not greater than `MIN_BOX_SIZE` are
        skipped entirely. The size filter is applied to the boxes *before*
        pairing them with predictions, so each label is drawn on the box its
        crop came from.

        Args:
            frame: BGR image array; modified in place.
            boxes: Iterable of `(x1, y1, x2, y2)` integer tuples.

        Returns:
            The same `frame` object, annotated.
        """
        min_size = self.MIN_BOX_SIZE
        large_boxes = [
            (x1, y1, x2, y2)
            for (x1, y1, x2, y2) in boxes
            if y2 - y1 > min_size and x2 - x1 > min_size
        ]
        crops = [frame[y1:y2, x1:x2] for (x1, y1, x2, y2) in large_boxes]
        genders = self.batch_detect_gender_clip(crops)

        for (x1, y1, x2, y2), gender in zip(large_boxes, genders):
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 255), 2)
            cv2.putText(frame, gender, (x1, y1 - 8),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
        return frame

    def process_video(self, video_path=0, output_path=None, show=True):
        """Run detection and labelling over a video, optionally saving the result.

        Args:
            video_path: Anything `cv2.VideoCapture` accepts (file path or
                camera index; default 0).
            output_path: If given, annotated frames are written there as an
                "mp4v" video at the source frame rate (or `DEFAULT_FPS` when
                the source reports none).
            show: When True (default), display each frame in a window and stop
                when "q" is pressed. Set to False where no GUI is available.
        """
        cap = cv2.VideoCapture(video_path)
        writer = None

        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                boxes = self.detect_people(frame)
                frame = self.annotate_frame(frame, boxes)

                if output_path and writer is None:
                    # Created lazily because the frame size is only known
                    # once the first frame has been read.
                    fps = cap.get(cv2.CAP_PROP_FPS)
                    if not (fps and fps > 0):
                        fps = self.DEFAULT_FPS
                    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
                    writer = cv2.VideoWriter(output_path, fourcc, fps,
                                             (frame.shape[1], frame.shape[0]))

                if writer is not None:
                    writer.write(frame)

                if show:
                    cv2.imshow("CLIP Gender Detection", frame)
                    if cv2.waitKey(1) & 0xFF == ord("q"):
                        break
        finally:
            # Release handles even if detection or drawing raises mid-video.
            cap.release()
            if writer is not None:
                writer.release()
            if show:
                cv2.destroyAllWindows()


In [None]:
# Build the detector from the custom YOLO weights; constructing it also loads CLIP ViT-B/32.
# NOTE(review): the recorded output for this cell is a "failed finding central directory"
# error raised while reading the checkpoint — presumably the .pt file is truncated or is
# not a real weights file; verify the file before re-running.
detector = GenderDetectorCLIP(yolo_model_path="train18_best_model.pt")

RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

In [None]:
# Run detection + labelling over the clip. No output_path is passed, so nothing is
# written to disk; frames are only shown in a window (press "q" to stop early).
detector.process_video("MOT20-010raw.mp4")

In [None]:
# Run the person detector over every frame of `cap`, predict age/gender for each
# detection, optionally save the annotated video / per-frame images, and collect
# one row per detection into `df`.
# Relies on names set up elsewhere: cap, fps, width, height, frame_count,
# vid_filename, extension, device, detection_model, get_pred_attributes,
# pred_video, pred_frame_by_frame.
fourcc = cv2.VideoWriter_fourcc(*'XVID')

out = cv2.VideoWriter(f'prediction/{vid_filename}/{vid_filename}.{extension}',
                      fourcc, fps, (width, height))

result_columns = ['frame num', 'person id', 'bb_xmin', 'bb_ymin', 'bb_height', 'bb_width',
                  'age_min', 'age_max', 'age_actual', 'gender']
# Rows are accumulated in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
rows = []

# Loop-invariant setup, hoisted out of the per-frame loop.
val_tran = transforms.Compose([transforms.ToTensor()])
detection_model = detection_model.to(device).eval()

i = 1  # 1-based frame counter, used in the results table and image file names

pbar = tqdm(total=frame_count)

try:
  while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
      # End of stream or read failure: `frame` is None, so stop here.
      break
    pbar.update(1)

    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = val_tran(Image.fromarray(frame)).to(device).unsqueeze(0)

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
      output = detection_model(image)

    scores = output[0]['scores'].detach().cpu().numpy()
    # Keep detections scoring above 0.5; masking does not depend on the
    # model returning its detections sorted by score.
    boxes = output[0]['boxes'].detach().cpu().numpy()[scores > 0.5]
    num_people = len(boxes)

    for j in range(num_people):
      x1, y1, x2, y2 = (int(v) for v in boxes[j][:4])
      gender_pred, age_pred = get_pred_attributes(frame, x1, y1, x2, y2)
      cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), thickness=5)
      cv2.putText(frame, f"ID: {j}", (x1, y1 - 10),
                  cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 0), thickness=2)

      # Separate names so the video-level `width` / `height` are not overwritten.
      bb_width = abs(x2 - x1)
      bb_height = abs(y2 - y1)

      # Midpoint of the predicted age range.
      exact_age = (age_pred[0] + age_pred[1])/2

      rows.append({'frame num' : i,
                   'person id' : j,
                   'bb_xmin': x1,
                   'bb_ymin': y1,
                   'bb_height': bb_height,
                   'bb_width': bb_width,
                   'age_min': age_pred[0],
                   'age_max': age_pred[1],
                   'age_actual': exact_age,
                   'gender': gender_pred})

    # Back to BGR, which is what the OpenCV writers expect.
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    if pred_video:
      out.write(frame)

    if pred_frame_by_frame:
      cv2.imwrite(f'prediction/{vid_filename}/images/FRAME{i}.jpg', frame)

    i += 1

    if cv2.waitKey(1) & 0xFF == ord('q'):
      break

    if i > frame_count:
      break
finally:
  # Always finalise the output video and free the capture, and keep whatever
  # rows were collected even if the loop was interrupted.
  cap.release()
  out.release()
  pbar.close()
  cv2.destroyAllWindows()
  df = pd.DataFrame(rows, columns=result_columns)

NameError: name 'cv2' is not defined

In [None]:
# class GenderDetector:
#     def __init__(self, yolo_model_path="yolov8n.pt"):
#         self.model = YOLO(yolo_model_path)

#     def detect_faces(self, frame):
#         results = self.model(frame)[0]
#         boxes = []
#         for r in results.boxes:
#             x1, y1, x2, y2 = map(int, r.xyxy[0])
#             boxes.append((x1, y1, x2, y2))
#         return boxes

#     def detect_gender(self, face_img):
#         try:
#             result = DeepFace.analyze(face_img, actions=['gender'], enforce_detection=False)
#             return result[0]['gender']
#         except Exception as e:
#             return "Unknown"

#     def annotate_frame(self, frame, boxes):
#         for (x1, y1, x2, y2) in boxes:
#             face = frame[y1:y2, x1:x2]
#             gender = self.detect_gender(face)

#             cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
#             cv2.putText(frame, gender, (x1, y1 - 10),
#                         cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
#         return frame

#     def process_video(self, video_path=0, output_path=None):
#         cap = cv2.VideoCapture(video_path)
#         writer = None

#         while cap.isOpened():
#             ret, frame = cap.read()
#             if not ret:
#                 break

#             boxes = self.detect_faces(frame)
#             frame = self.annotate_frame(frame, boxes)

#             cv2.imshow("Gender Detection", frame)

#             if writer:
#                 writer.write(frame)

#             if cv2.waitKey(1) & 0xFF == ord("q"):
#                 break

#         cap.release()
#         if writer:
#             writer.release()
#         cv2.destroyAllWindows()


Collecting deepface
  Downloading deepface-0.0.93-py3-none-any.whl.metadata (30 kB)
Collecting flask-cors>=4.0.1 (from deepface)
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Collecting mtcnn>=0.1.0 (from deepface)
  Downloading mtcnn-1.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting retina-face>=0.0.1 (from deepface)
  Downloading retina_face-0.0.17-py3-none-any.whl.metadata (10 kB)
Collecting fire>=0.4.0 (from deepface)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gunicorn>=20.1.0 (from deepface)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting lz4>=4.3.3 (from mtcnn>=0.1.0->deepface)
  Downloading lz4-4.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading deepface-0.0.93-py3-none-any.whl (108 kB)
[2K   [90m━━

In [None]:
pip install torch==1.13.1+cu117 torchvision==0.15.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.13.1+cu117
  Using cached https://download.pytorch.org/whl/cu117/torch-1.13.1%2Bcu117-cp311-cp311-linux_x86_64.whl (1801.8 MB)
Collecting torchvision==0.15.0+cu117
  Downloading https://download.pytorch.org/whl/cu117/torchvision-0.15.0%2Bcu117-cp311-cp311-linux_x86_64.whl (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
[31mERROR: Cannot install torch==1.13.1+cu117 and torchvision==0.15.0+cu117 because these package versions have conflicting dependencies.[0m[31m
[0m
The conflict is caused by:
    The user requested torch==1.13.1+cu117
    torchvision 0.15.0+cu117 depends on torch==2.0.0+cu117

To fix this you could try to:
1. loosen the range of package versions you've sp