In [1]:
import cv2
import numpy as np
from PIL import ImageDraw, Image, ImageFont

import torch
import torchvision

In [3]:
model_name = 'res10_300x300_ssd_iter_140000.caffemodel'
prototxt_name = 'deploy.prototxt'

# Networks already read from disk, keyed by (prototxt path, weights path), so
# the detector is loaded once instead of once per video frame.
_face_net_cache = {}


def _get_face_net():
    """Return the face-detection network, reading it from disk on first use."""
    key = (prototxt_name, model_name)
    if key not in _face_net_cache:
        _face_net_cache[key] = cv2.dnn.readNetFromCaffe(prototxt_name, model_name)
    return _face_net_cache[key]


def DetectByDnn(frame, min_confidence=0.3):
    """Detect faces in ``frame`` with the Caffe network named by the module constants.

    Args:
        frame: Image array whose first two dimensions are (height, width); it is
            resized to 300x300 before being fed to the network.
        min_confidence: Detections whose score is not strictly greater than
            this value are ignored. Defaults to 0.3.

    Returns:
        A list of ``(x, y, w, h)`` tuples of Python ints, in frame pixel
        coordinates. Boxes are clipped to the frame, and boxes that are empty
        after clipping are dropped, so ``frame[y:y+h, x:x+w]`` is never empty
        for a returned box.
    """
    height, width = frame.shape[:2]

    net = _get_face_net()
    blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
    net.setInput(blob)
    detections = net.forward()

    # Box corners are scaled by the frame size to obtain pixel coordinates.
    scale = np.array([width, height, width, height])

    faces = []
    for i in range(detections.shape[2]):
        confidence = detections[0, 0, i, 2]
        if confidence <= min_confidence:
            continue

        box = (detections[0, 0, i, 3:7] * scale).astype("int")
        startX, startY, endX, endY = (int(v) for v in box)

        # The network may report corners outside the image. A negative start
        # would make a later ``frame[y:..., x:...]`` slice wrap around.
        startX = min(max(startX, 0), width)
        endX = min(max(endX, 0), width)
        startY = min(max(startY, 0), height)
        endY = min(max(endY, 0), height)

        if endX <= startX or endY <= startY:
            continue  # nothing left inside the frame

        faces.append((startX, startY, endX - startX, endY - startY))

    return faces

In [4]:
# Build a ViT-B/16 classifier with a 2-class head and load fine-tuned weights.
model_weights = torchvision.models.ViT_B_16_Weights.DEFAULT
# Preprocessing pipeline bundled with the chosen weights; applied to each face crop.
model_transform = model_weights.transforms()
model = torchvision.models.vit_b_16(weights=model_weights)

# The notebook only runs inference, so no parameter needs gradients.
for parameter in model.parameters():
    parameter.requires_grad = False

# Replace the classification head with a 2-way linear layer. It is kept as a
# one-element Sequential so its parameters are named "heads.0.*".
model.heads = torch.nn.Sequential(
    torch.nn.Linear(in_features=768, out_features=2, bias=True),
)

# map_location="cpu": the model is built and used on the CPU here, and without
# it a checkpoint saved from CUDA tensors cannot be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
state_dict = torch.load("./model/vit.pth", map_location="cpu")
model.load_state_dict(state_dict)
model

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [5]:
# Live demo: grab webcam frames, detect faces, classify each one and show the
# annotated frame until Esc is pressed or the camera stops delivering frames.
classes = ["men", "women"]
FACE_PAD = 10       # extra pixels added to the right/bottom of each face box
LABEL_OFFSET = 35   # label is drawn this many pixels above the box

try:
    font_eng = ImageFont.truetype("arial.ttf", 30)
except OSError:
    # arial.ttf is not installed everywhere; fall back to Pillow's built-in font.
    font_eng = ImageFont.load_default()

model.eval()  # inference behaviour only needs to be selected once

cap = cv2.VideoCapture(0)
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        labels_to_draw = []
        for face in DetectByDnn(frame):
            x, y, w, h = (int(v) for v in face)

            # Clamp the crop origin: a negative start index would wrap around.
            x0, y0 = max(x, 0), max(y, 0)
            cut = frame[y0:y + h + FACE_PAD, x0:x + w + FACE_PAD]
            if cut.size == 0:
                continue  # box lies outside the frame; nothing to classify

            # OpenCV frames are BGR while PIL and the model transform work on
            # RGB, so convert the crop before classification.
            # NOTE(review): assumes the checkpoint was trained on RGB images - confirm.
            face_rgb = Image.fromarray(cv2.cvtColor(cut, cv2.COLOR_BGR2RGB))
            input_face = model_transform(face_rgb).unsqueeze(dim=0)

            with torch.inference_mode():
                # argmax of the logits equals argmax of their softmax.
                label = model(input_face).argmax(dim=1).item()

            cv2.rectangle(frame, (x, y), (x + w + FACE_PAD, y + h + FACE_PAD), (255, 0, 0), 3)
            # Keep the label on screen when the face touches the top edge.
            labels_to_draw.append(((x0, max(y - LABEL_OFFSET, 0)), f"{classes[label]}"))

        if labels_to_draw:
            # One BGR -> RGB -> BGR round trip per frame instead of one per face.
            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            draw = ImageDraw.Draw(frame_pil)
            for position, text in labels_to_draw:
                draw.text(position, text, fill=(0, 0, 255), font=font_eng)
            frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)

        cv2.imshow("Video", frame)
        if cv2.waitKey(1) & 0xff == 27:  # Esc
            break
finally:
    # Always free the camera and close the window, even if a frame fails.
    cap.release()
    cv2.destroyAllWindows()