In [1]:
import pandas as pd
import numpy as np
import cv2
import imutils
import urllib.request

## Get XML File

In [2]:
def read_xml():
    """Download OpenCV's frontal-face Haar cascade XML and open it in memory.

    Returns:
        cv2.FileStorage: storage opened in read mode over the downloaded XML
        text (FILE_STORAGE_MEMORY, so nothing is written to disk).

    Raises:
        urllib.error.URLError / TimeoutError: if the download fails or the
        server does not answer within the timeout.
    """
    url = "https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml"

    # Use a context manager so the HTTP connection is always closed, and a
    # timeout so a stalled connection cannot hang the notebook indefinitely.
    with urllib.request.urlopen(url, timeout=30) as response:
        xml_bytes = response.read()

    # Load the classifier definition from the in-memory XML string
    xml_string = xml_bytes.decode('utf-8')
    fs = cv2.FileStorage(xml_string, cv2.FILE_STORAGE_READ | cv2.FILE_STORAGE_MEMORY)
    return fs

## Detect Faces

In [9]:
def extract_faces(filepath, start_time, end_time, start_frame, end_frame, fs):
    """Detect one face per frame over a frame range of a video.

    Each frame is resized to a width of 500 px, converted to grayscale,
    histogram-equalized and blurred before the Haar cascade is applied.
    Detected coordinates are multiplied back by ``frame_width / 500``.

    Args:
        filepath: Path of the video file to open.
        start_time: Start position in seconds.
        end_time: End position in seconds, or None.
        start_frame: Start position as a frame index.
        end_frame: End position as a frame index, or None.
        fs: cv2.FileStorage whose first top-level node holds the cascade.

    Returns:
        Tuple ``(start_time, end_time, start_frame, end_frame, fps, df)``.
        ``start_frame`` is the lower of ``start_time * fps`` and the given
        ``start_frame``; ``end_frame`` is the higher of the two end bounds
        (or the video's frame count when both are None). ``df`` has one row
        per frame index in ``[start_frame, end_frame]`` with columns
        ``frame, x, y, w, h``; frames with no detection (or that could not be
        read) keep zeros. If the video ends before ``start_frame`` an empty
        DataFrame is returned.

    Raises:
        OSError: if the video cannot be opened.
        ValueError: if the reported FPS is not positive, or the resolved
            range is negative (raised by ``np.zeros``).
    """
    # Path to video file
    vid_obj = cv2.VideoCapture(filepath)

    # try/finally guarantees the capture is released on every exit path,
    # including the early return and any exception below.
    try:
        if not vid_obj.isOpened():
            raise OSError(f"Could not open video file: {filepath}")

        # FPS of the video (truncated to an integer)
        fps = int(vid_obj.get(cv2.CAP_PROP_FPS))
        if fps <= 0:
            raise ValueError(f"Video reports a non-positive FPS ({fps}): {filepath}")

        # How much we are scaling each frame by when detecting faces
        scale_factor = int(vid_obj.get(cv2.CAP_PROP_FRAME_WIDTH)) / 500

        # Load face detector
        detector = cv2.CascadeClassifier()
        detector.read(fs.getFirstTopLevelNode())

        # Get the starting frame (the lower of start_time or start_frame).
        # int() keeps the index usable for array sizing when start_time is a float.
        start_frame = int(min(start_time * fps, start_frame))

        # Get the ending frame (the higher of end_time or end_frame)
        if end_time is None:
            if end_frame is None:
                # NOTE(review): this is the frame *count*, so the final row
                # refers to an index one past the last decodable frame and
                # will stay zero - kept as-is to preserve reported metadata.
                end_frame = int(vid_obj.get(cv2.CAP_PROP_FRAME_COUNT))
            end_time = end_frame / fps
        elif end_frame is None:
            end_frame = int(end_time * fps)
        else:
            end_frame = max(end_frame, int(end_time * fps))
            end_time = end_frame / fps

        # One row of (x, y, w, h) per frame in the requested range
        face_coords = np.zeros((end_frame - start_frame + 1, 4))

        # Skip frames before start_frame
        frame_count = 0
        while frame_count < start_frame:
            if not vid_obj.grab():
                return start_time, end_time, start_frame, end_frame, fps, pd.DataFrame([])
            frame_count += 1

        while frame_count <= end_frame:
            # Read the next frame; stop when the video runs out
            success, image = vid_obj.read()
            if not success:
                break

            # Resize image and convert it to grayscale
            image = imutils.resize(image, width=500)
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Apply histogram equalization to enhance contrast
            gray = cv2.equalizeHist(gray)

            # Apply Gaussian blur to reduce noise
            gray = cv2.GaussianBlur(gray, (5, 5), 0)

            # Detect faces in image using the haar cascade face detector
            rects = detector.detectMultiScale(image=gray,
                                              scaleFactor=1.1,
                                              minNeighbors=9,
                                              flags=cv2.CASCADE_SCALE_IMAGE)

            # The output has exactly one row per frame, so keep a single
            # detection (the largest box) and write it to this frame's row.
            if len(rects) > 0:
                x, y, w, h = max(rects, key=lambda r: r[2] * r[3])
                face_coords[frame_count - start_frame, :] = [
                    round(x * scale_factor, 3),
                    round(y * scale_factor, 3),
                    round(w * scale_factor, 3),
                    round(h * scale_factor, 3),
                ]

            # Advance by exactly one frame regardless of how many faces were
            # found, so row indices always line up with frame indices.
            frame_count += 1

        # Convert numpy array to pandas dataframe
        rows = pd.DataFrame(face_coords, columns=['x', 'y', 'w', 'h'])
        rows.insert(0, 'frame', np.arange(start_frame, end_frame + 1, dtype=np.int64))

        return start_time, end_time, start_frame, end_frame, fps, rows
    finally:
        vid_obj.release()

## JSON Converter

In [10]:
from datetime import datetime

def json_converter(filename, video_metadata):
    """Arrange detection results into a nested dictionary for export.

    Args:
        filename: Label stored under "VideoInformation".
        video_metadata: 6-tuple ``(start_time, end_time, start_frame,
            end_frame, fps, video_df)`` where ``video_df`` has the columns
            ``frame``, ``x``, ``y``, ``w`` and ``h``.

    Returns:
        dict with "VideoInformation", "CreationDate" (today, YYYY-MM-DD) and
        "VideoMetadata"; the latter's "FrameData" maps "Frame<N>" to a list
        of the four coordinates rendered as strings.
    """
    start_time, end_time, start_frame, end_frame, fps, video_df = video_metadata

    # Build "Frame<N>" -> [x, y, w, h] (as strings), one entry per row
    coord_columns = ('x', 'y', 'w', 'h')
    frame_data = {
        "Frame" + str(int(record['frame'])): [str(record[col]) for col in coord_columns]
        for _, record in video_df.iterrows()
    }

    metadata_section = {
        "StartTime": start_time,
        "EndTime": end_time,
        "StartFrame": start_frame,
        "EndFrame": end_frame,
        "Fps": fps,
        "FrameData": frame_data,
    }

    return {
        "VideoInformation": filename,
        # Stamp the record with today's date
        "CreationDate": datetime.now().strftime("%Y-%m-%d"),
        "VideoMetadata": metadata_section,
    }

## Main Function 

In [12]:
# time should be in seconds
def face_detector(filename, filepath, start_time=0, end_time=None, start_frame=0, end_frame=None):
    """Run the whole pipeline: fetch the cascade, detect faces, build the report.

    Args:
        filename: Label passed through to the report.
        filepath: Path of the video to analyse.
        start_time: Start bound in seconds (default 0).
        end_time: End bound in seconds, or None.
        start_frame: Start bound as a frame index (default 0).
        end_frame: End bound as a frame index, or None.

    Returns:
        The dictionary produced by ``json_converter``.
    """
    cascade_storage = read_xml()
    detection_result = extract_faces(
        filepath,
        start_time,
        end_time,
        start_frame,
        end_frame,
        cascade_storage,
    )
    report = json_converter(filename, detection_result)
    return report

In [13]:
# Run the full pipeline on the sample clip with the default bounds
# (start at 0, no explicit end), labelling the report 'test_file'.
json_format = face_detector('test_file', 'ML0001_1.mp4')

In [14]:
# Show the resulting report dictionary as the cell output
json_format

{'VideoInformation': 'test_file',
 'CreationDate': '2023-04-15',
 'VideoMetadata': {'StartTime': 0,
  'EndTime': 184.0,
  'StartFrame': 0,
  'EndFrame': 5520,
  'Fps': 30,
  'FrameData': {'Frame0': ['416.88', '762.48', '324.0', '324.0'],
   'Frame1': ['419.04', '762.48', '324.0', '324.0'],
   'Frame2': ['414.72', '762.48', '326.16', '326.16'],
   'Frame3': ['410.4', '760.32', '328.32', '328.32'],
   'Frame4': ['421.2', '764.64', '319.68', '319.68'],
   'Frame5': ['423.36', '771.12', '313.2', '313.2'],
   'Frame6': ['414.72', '762.48', '330.48', '330.48'],
   'Frame7': ['408.24', '760.32', '334.8', '334.8'],
   'Frame8': ['408.24', '762.48', '334.8', '334.8'],
   'Frame9': ['408.24', '760.32', '334.8', '334.8'],
   'Frame10': ['408.24', '762.48', '336.96', '336.96'],
   'Frame11': ['410.4', '766.8', '330.48', '330.48'],
   'Frame12': ['414.72', '766.8', '330.48', '330.48'],
   'Frame13': ['419.04', '768.96', '321.84', '321.84'],
   'Frame14': ['423.36', '768.96', '317.52', '317.52'],
  

## Visualizer

In [None]:
# Replay the video with the detected bounding box drawn on each frame.
video_path = 'ML0001_1.mp4'

# Run detection first so a failure here cannot leave a capture handle open
fs = read_xml()
start_time, end_time, start_frame, end_frame, fps, video_df = extract_faces(video_path, 0, None, 0, None, fs)

video_obj = cv2.VideoCapture(video_path)

# try/finally guarantees the capture and the preview window are cleaned up
# even if decoding or drawing raises part-way through.
try:
    for index, row in video_df.iterrows():
        # Seek to and read the frame this row describes
        video_obj.set(cv2.CAP_PROP_POS_FRAMES, row['frame'])
        ret, frame = video_obj.read()

        # Rows can reference frames that cannot be decoded (e.g. an index at
        # or past the end of the video); drawing on a missing frame would crash.
        if not ret:
            continue

        x, y, w, h = int(row['x']), int(row['y']), int(row['w']), int(row['h'])

        # An all-zero row means no face was recorded for this frame
        if w > 0 and h > 0:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

        cv2.imshow("Frame with bounding box", frame)
        cv2.waitKey(1)
finally:
    # Release the video object and close all windows
    video_obj.release()
    cv2.destroyAllWindows()