In [2]:
!pip install mtcnn
import pandas as pd
import numpy as np
import cv2
import imutils
import urllib.request
from mtcnn import MTCNN

Collecting mtcnn
  Downloading mtcnn-0.1.1-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: mtcnn
Successfully installed mtcnn-0.1.1


## Get Haar Cascade XML File

In [2]:
def read_xml():
    """Download OpenCV's frontal-face Haar cascade XML and open it in memory.

    Returns:
        A ``cv2.FileStorage`` created from the downloaded XML text with the
        ``FILE_STORAGE_READ | FILE_STORAGE_MEMORY`` flags, i.e. the XML is
        parsed from the string itself rather than from a file on disk.

    Raises:
        urllib.error.URLError: If the download fails.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = "https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml"

    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, even if read() raises.
    with urllib.request.urlopen(url) as response:
        xml_bytes = response.read()

    # Load the classifier definition from the in-memory XML text
    xml_string = xml_bytes.decode('utf-8')
    return cv2.FileStorage(xml_string, cv2.FILE_STORAGE_READ | cv2.FILE_STORAGE_MEMORY)

## Detect Faces

In [7]:
def extract_faces(filepath, start_time, end_time, start_frame, end_frame):
    """Run MTCNN face detection over a range of frames of a video.

    Every frame is resized to a width of 500 px before detection and the
    detected box is scaled back to the original resolution.  Exactly one row
    is produced per frame in ``[start_frame, end_frame]``:

    * frames with no detection (or that could not be read) keep an all-zero box;
    * when several faces are detected in a frame, the one with the highest
      ``confidence`` is kept, so a frame never spills into its neighbours' rows.

    The range starts at the lower of ``start_time * fps`` and ``start_frame``
    and ends at the higher of ``end_time * fps`` and ``end_frame``.  If neither
    end bound is given, the video's reported frame count is used as the end.

    Args:
        filepath: Path handed to ``cv2.VideoCapture``.
        start_time: Start position in seconds (``None`` is treated as 0).
        end_time: End position in seconds, or ``None``.
        start_frame: Start position as a frame index (``None`` is treated as 0).
        end_frame: End position as a frame index, or ``None``.

    Returns:
        Tuple ``(start_time, end_time, start_frame, end_frame, fps, frames)``
        where ``frames`` is a DataFrame with columns ``frame, x, y, w, h``.
        If the video ends before ``start_frame`` is reached, ``frames`` is
        empty (but still has those columns).

    Raises:
        IOError: If the video cannot be opened.
        ValueError: If the video reports a non-positive frame rate or the
            resolved end frame lies before the start frame.
    """
    detect_width = 500  # width (px) frames are resized to before detection
    box_columns = ['x', 'y', 'w', 'h']

    vid_obj = cv2.VideoCapture(filepath)
    try:
        if not vid_obj.isOpened():
            raise IOError(f"Could not open video file: {filepath}")

        # FPS of the video (truncated to an integer, as reported to callers)
        fps = int(vid_obj.get(cv2.CAP_PROP_FPS))
        if fps <= 0:
            raise ValueError(f"Video reports a non-positive frame rate: {filepath}")

        # Factor that maps coordinates in the resized frame back to the original
        scale_factor = int(vid_obj.get(cv2.CAP_PROP_FRAME_WIDTH)) / detect_width

        # Starting frame: the lower of start_time and start_frame.  Cast to int
        # so a fractional start_time cannot produce a float frame index.
        start_time = 0 if start_time is None else start_time
        start_frame = 0 if start_frame is None else start_frame
        start_frame = max(0, int(min(start_time * fps, start_frame)))

        # Ending frame: the higher of end_time and end_frame
        if end_time is None:
            if end_frame is None:
                end_frame = int(vid_obj.get(cv2.CAP_PROP_FRAME_COUNT))
            end_time = end_frame / fps
        elif end_frame is None:
            end_frame = int(end_time * fps)
        else:
            end_frame = max(end_frame, int(end_time * fps))
            end_time = end_frame / fps

        if end_frame < start_frame:
            raise ValueError(
                f"end frame ({end_frame}) is before start frame ({start_frame})"
            )

        # One row of (x, y, w, h) per frame; rows stay zero when nothing is found
        face_coords = np.zeros((end_frame - start_frame + 1, 4))

        # Skip frames before the start frame without decoding them
        for _ in range(start_frame):
            if not vid_obj.grab():
                empty = pd.DataFrame(columns=['frame'] + box_columns)
                return start_time, end_time, start_frame, end_frame, fps, empty

        detector = MTCNN()

        # Advance exactly one row per decoded frame, regardless of how many
        # faces the detector reports for it.
        for offset in range(len(face_coords)):
            success, image = vid_obj.read()
            if not success:
                break

            # Resize image and convert it to RGB
            image = cv2.cvtColor(imutils.resize(image, width=detect_width), cv2.COLOR_BGR2RGB)

            results = detector.detect_faces(image)
            if not results:
                continue

            best = max(results, key=lambda face: face.get('confidence', 0.0))
            face_coords[offset] = [round(value * scale_factor, 3) for value in best['box']]
    finally:
        # Always free the capture, including on early return or error
        vid_obj.release()

    rows = pd.DataFrame(face_coords, columns=box_columns)
    rows.insert(0, 'frame', list(range(start_frame, end_frame + 1)))
    return start_time, end_time, start_frame, end_frame, fps, rows


## JSON Converter

In [32]:
from datetime import datetime

def json_converter(filename, video_metadata):
    """Package detection results into a JSON-style nested dict.

    Args:
        filename: Value stored under ``"VideoInformation"``.
        video_metadata: Tuple ``(start_time, end_time, start_frame,
            end_frame, fps, video_df)`` where ``video_df`` has the columns
            ``frame, x, y, w, h``.

    Returns:
        Dict with ``"VideoInformation"``, ``"CreationDate"`` (today's date as
        ``YYYY-MM-DD``) and ``"VideoMetadata"``.  The latter carries the
        time/frame bounds, the fps and ``"FrameData"``: a mapping from
        ``"Frame<n>"`` to the four box values rendered as strings.
    """
    start_time, end_time, start_frame, end_frame, fps, video_df = video_metadata

    # One "Frame<n>" entry per row; every coordinate is stored as a string
    frame_data = {
        "Frame" + str(int(record['frame'])): [str(record[axis]) for axis in ('x', 'y', 'w', 'h')]
        for _, record in video_df.iterrows()
    }

    metadata = {
        "StartTime": start_time,
        "EndTime": round(end_time, 3),
        "StartFrame": start_frame,
        "EndFrame": end_frame,
        "Fps": fps,
        "FrameData": frame_data,
    }

    return {
        "VideoInformation": filename,
        # Stamp the export with today's date
        "CreationDate": datetime.now().strftime("%Y-%m-%d"),
        "VideoMetadata": metadata,
    }

## Main Function 

In [9]:
# start_time and end_time are in seconds; start_frame and end_frame are frame indices
def face_detector(filename, filepath, start_time=0, end_time=None, start_frame=0, end_frame=None):
    """Detect faces in a video and return the result as a JSON-style dict.

    Forwards ``filepath`` and the time/frame bounds to ``extract_faces`` and
    hands its result, together with ``filename``, to ``json_converter``.
    Times are given in seconds.
    """
    return json_converter(
        filename,
        extract_faces(filepath, start_time, end_time, start_frame, end_frame),
    )

In [10]:
# Run the whole pipeline on the sample clip using the default bounds
# (start at 0, no explicit end). 'dummy.mp4' is a relative path.
json_format = face_detector('test_file', 'dummy.mp4')



2023-04-15 08:08:07.527108: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [11]:
# Display the resulting dict; 'FrameData' holds one "Frame<n>" entry per row,
# so the rendered output is long.
json_format

{'VideoInformation': 'test_file',
 'CreationDate': '2023-04-15',
 'VideoMetadata': {'StartTime': 0,
  'EndTime': 134.58333333333334,
  'StartFrame': 0,
  'EndFrame': 1615,
  'Fps': 12,
  'FrameData': {'Frame0': ['311.808', '99.84', '109.056', '152.064'],
   'Frame1': ['311.808', '99.84', '109.056', '152.064'],
   'Frame2': ['311.808', '99.84', '109.056', '152.064'],
   'Frame3': ['317.952', '101.376', '104.448', '147.456'],
   'Frame4': ['311.808', '101.376', '109.056', '150.528'],
   'Frame5': ['311.808', '101.376', '109.056', '150.528'],
   'Frame6': ['311.808', '99.84', '109.056', '152.064'],
   'Frame7': ['311.808', '99.84', '110.592', '153.6'],
   'Frame8': ['311.808', '110.592', '112.128', '145.92'],
   'Frame9': ['310.272', '109.056', '115.2', '147.456'],
   'Frame10': ['310.272', '101.376', '112.128', '152.064'],
   'Frame11': ['310.272', '109.056', '112.128', '147.456'],
   'Frame12': ['310.272', '109.056', '112.128', '147.456'],
   'Frame13': ['308.736', '99.84', '112.128', '

## Visualizer

In [29]:
# convert json to dataframe
video_df = pd.DataFrame.from_dict(json_format['VideoMetadata']['FrameData']).T
video_df.head(5)

Unnamed: 0,0,1,2,3
Frame0,311.808,99.84,109.056,152.064
Frame1,311.808,99.84,109.056,152.064
Frame2,311.808,99.84,109.056,152.064
Frame3,317.952,101.376,104.448,147.456
Frame4,311.808,101.376,109.056,150.528


In [33]:
video_path = 'dummy.mp4'
video_obj = cv2.VideoCapture(video_path)

# Check if the video file can be opened. Raise instead of calling exit():
# exit() is not a reliable way to stop a notebook cell, whereas an exception
# halts this cell (and a Run All) with a clear message.
if not video_obj.isOpened():
    video_obj.release()
    raise IOError("Error opening video file")

try:
    # Walk the rows without popping them off video_df, so the DataFrame is left
    # intact and this cell can be re-run without rebuilding it.
    for row in video_df.itertuples(index=False, name=None):
        ret, frame = video_obj.read()

        # Check if a frame was read successfully
        if not ret:
            print("Done")
            break

        # Values are stored as strings, so go through float before int
        x, y, w, h = (int(float(value)) for value in row[:4])

        # A zero-sized box carries no detection; skip drawing it
        if w > 0 and h > 0:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

        cv2.imshow("Frame with bounding box", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the video object and close all windows, even on error/interrupt
    video_obj.release()
    cv2.destroyAllWindows()
