<a href="https://colab.research.google.com/github/khaqanashraf/video-scraping/blob/master/video_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:

import urllib
from bs4 import BeautifulSoup
# from pytube import YouTube
import cv2
import os
import glob
import matplotlib.pyplot as plt
import numpy as np
from os import listdir
from os.path import isfile, join


In [15]:
# Mount Google Drive at /content/gdrive (Colab only); the input and output
# paths configured further down all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Extract frames from a video, one frame every `delay` seconds.
def get_frames_from_video(video_path , delay):
    """
        Sample one frame from a video every `delay` seconds.

        video_path: path to the video file,
        e.g. '/content/gdrive/My Drive/videos/video.mp4'

        delay: time in seconds between two sampled frames (must be > 0),
        e.g. 5 to fetch one frame every 5 seconds

        return: a numpy array holding the sampled frames in playback order,
        starting with the first frame of the video. The array is empty when
        no frame could be read (for example when the file cannot be opened).

        raises: ValueError if `delay` is not positive.
    """
    if delay <= 0:
        # A non-positive delay would never advance the read position.
        raise ValueError('delay must be a positive number of seconds')

    vidcap = cv2.VideoCapture(video_path)
    frames = list()
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # Number of frames to skip between samples. Keep the fractional FPS
        # (e.g. 29.97) until after the multiplication, and never step by less
        # than one frame so the loop always makes progress, even when the
        # backend reports an FPS of 0.
        step = max(1, int(round(delay * fps))) if fps > 0 else 1

        position = 0
        i = 0
        while True:
            # Seek first, then read, so every sampled frame is exactly the
            # one at `position`.
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, position)
            success, image = vidcap.read()
            print(f'Read a new frame{i}: ', success)
            if not success:
                # A failed read is treated as the end of the video.
                break
            frames.append(image)
            position += step
            i += 1
    finally:
        # Always free the underlying file/decoder handle.
        vidcap.release()
    return np.array(frames)


def scrap_faces_from_image(image, cascade_path):
    """
          Detect faces in a single image with a Haar cascade classifier.

          image: numpy array with the pixel values of the image; it is
          converted from BGR to gray scale before detection.

          cascade_path: full path to the cascade XML file holding the
          pre-trained model, e.g. haarcascade_frontalface_default.xml or any
          other variant. The pre-trained files can be found at
          'https://github.com/opencv/opencv/tree/master/data/haarcascades'

          return: the boxes of the detected faces, i.e. one (x, y, w, h) entry
          per face, where (x, y) is the top-left corner of the box, 'w' its
          width and 'h' its height. There can be more than one face.
    """
    detector = cv2.CascadeClassifier(cascade_path)
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Detection parameters: image pyramid scale step and the number of
    # neighbouring detections required to keep a candidate box.
    return detector.detectMultiScale(grayscale, scaleFactor=1.1, minNeighbors=4)

def crop_faces_from_image(faces, img):
    """
          Cut the detected faces out of an image.

          faces: iterable of face boxes found in the image; each box is an
          (x, y, w, h) tuple where (x, y) is the top-left corner.

          img: the image (numpy array) containing the faces. As a side effect
          a blue rectangle is drawn on `img` around every box.

          return: a numpy array containing one cropped face per box, in the
          same order as `faces`. When all crops have the same shape they are
          stacked into one regular array; otherwise a 1-D object array with
          one crop per element is returned. Either way, iterating over the
          result yields the individual face images.
    """
    # Plain ints keep the slicing and drawing calls independent of the
    # integer type used by the detector output.
    boxes = [(int(x), int(y), int(w), int(h)) for (x, y, w, h) in faces]

    # Copy every crop BEFORE drawing anything: a slice is only a view of
    # `img`, so rectangles drawn afterwards would otherwise show up inside
    # the returned faces.
    face_images = [img[y:y + h, x:x + w].copy() for (x, y, w, h) in boxes]

    for (x, y, w, h) in boxes:
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)

    if len({crop.shape for crop in face_images}) <= 1:
        # Zero crops or crops of one common shape: a regular array works.
        return np.array(face_images)

    # Crops of different sizes cannot be stacked; np.array() on such a list
    # raises on recent NumPy versions, so build the object array explicitly.
    ragged = np.empty(len(face_images), dtype=object)
    for idx, crop in enumerate(face_images):
        ragged[idx] = crop
    return ragged



def fetch_files_from_directory(directory):
    """
        List the regular files that sit directly inside a directory.

        directory: the source directory to scan

        return: a list with the names (not full paths) of the entries of
        `directory` that are files; sub-directories are left out.
    """
    file_names = []
    for entry in listdir(directory):
        # Keep only entries that are files when resolved inside `directory`.
        if isfile(join(directory, entry)):
            file_names.append(entry)
    return file_names



def scrap_faces_from_videos(src_dir, des_dir, cascade_path, gray_scale = True, delay=5):
    """
          Extract the faces found in every video of a directory and save them
          as JPEG images.

          src_dir: source directory where the video files are placed; every
          file directly inside it is processed.

          des_dir: destination directory where the scraped faces are saved.
          It is created if it does not exist yet. Faces are written as
          'face0.jpg', 'face1.jpg', ...; the numbering starts at 0 on every
          call, so files with the same names from an earlier run are
          overwritten.

          cascade_path: full path to the cascade XML file holding the
          pre-trained model, e.g. haarcascade_frontalface_default.xml or any
          other variant. The pre-trained files can be found at
          'https://github.com/opencv/opencv/tree/master/data/haarcascades'

          gray_scale: boolean that determines whether the faces are saved in
          gray scale or not. Default is True.

          delay: time in seconds between two frames sampled from a video,
          e.g. 5 to look at one frame every 5 seconds.
    """
    import warnings

    # cv2.imwrite only reports a missing directory through its return value,
    # so make sure the destination exists up front.
    os.makedirs(des_dir, exist_ok=True)

    face_index = 0
    for file_name in fetch_files_from_directory(src_dir):
        video_path = os.path.join(src_dir, file_name)
        frames = get_frames_from_video(video_path, delay)

        for frame in frames:
            faces = scrap_faces_from_image(frame, cascade_path)
            # Handle the crops one by one: faces usually differ in size, so
            # they cannot be packed into a single regular numpy array.
            for face in crop_faces_from_image(faces, frame):
                if gray_scale:
                    face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
                out_path = os.path.join(des_dir, f'face{face_index}.jpg')
                if not cv2.imwrite(out_path, face):
                    # Best effort: report the failure and keep going.
                    warnings.warn(f'could not write {out_path}')
                face_index += 1



In [0]:
# Directory (on the mounted Drive) containing the mp4 video files to scrape.
src_dir = '/content/gdrive/My Drive/video_scraping/videos'
# Directory where the faces scraped from the videos will be saved.
des_dir = '/content/gdrive/My Drive/video_scraping/faces'
# Cascade XML file of the pre-trained model used to detect faces.
cascade_path = '/content/gdrive/My Drive/video_scraping/haarcascade_frontalface_default.xml'

In [21]:
# Run the full pipeline with the defaults (gray_scale=True, delay=5):
# sample frames from every file in src_dir and save the detected faces to des_dir.
scrap_faces_from_videos(src_dir, des_dir, cascade_path)

Read a new frame0:  True
Read a new frame1:  True
Read a new frame2:  True
Read a new frame3:  True
Read a new frame4:  True
Read a new frame5:  True
Read a new frame6:  True
Read a new frame7:  True
Read a new frame8:  True
Read a new frame9:  True
Read a new frame10:  True
Read a new frame11:  True
Read a new frame12:  True
Read a new frame13:  True
Read a new frame14:  True
Read a new frame15:  True
Read a new frame16:  True
Read a new frame17:  True
Read a new frame18:  True
Read a new frame19:  True
Read a new frame20:  True
Read a new frame21:  True
Read a new frame22:  True
Read a new frame23:  True
Read a new frame24:  True
Read a new frame25:  True
Read a new frame26:  False
Read a new frame0:  True
Read a new frame1:  True
Read a new frame2:  True
Read a new frame3:  True
Read a new frame4:  True
Read a new frame5:  True
Read a new frame6:  True
Read a new frame7:  True
Read a new frame8:  True
Read a new frame9:  True
Read a new frame10:  True
Read a new frame11:  True
Read 