In [None]:
import pandas as pd
import numpy as np
import boto3
import csv
import cv2 as cv
import os
# import time
import random 
import json
from joblib import dump, load
import math
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.python import keras
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import *# Dense, Flatten, Conv2D, Dropout, Activation, BatchNormalization, MaxPooling2D
from keras.utils import to_categorical
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import load_model
from mtcnn.mtcnn import MTCNN
face_detector = MTCNN()
#sensitive variables in config.py file that is on .gitignore
from config import key_, secret_, s3_bucket, kaggle_cookie

#from functions_for_testing import *

In [None]:
#read meta.json -- a dictionary keyed by video file name
#(the cells below look up each video's 'label' and, for fakes, its 'original' video)
with open('../meta.json') as meta_file:
    meta = json.loads(meta_file.read())

In [None]:
#read the csv that lists the videos present in the S3 bucket,
#then pull the 'video_names' column out as a plain python list
video_df = pd.read_csv('../video_information.csv')
video_list = video_df.loc[:, 'video_names'].tolist()

In [None]:
def get_faces_from_video(video_link,
                         video_link_real=None,
                         skipped_frames=(4, 9),
                         new_max_size=750,
                         face_confidence=0.75,
                         face_dim=(146, 225)):
    '''
    Samples two frames from a video, detects the faces in each frame, and labels every
    face as real (True) or altered (False).

    inputs:
        video_link: path or url of the video to process (passed to cv.VideoCapture)
        video_link_real: optional path or url of the corresponding original video. When given,
            the same face boxes are cropped from the same frames of this video, and a face is
            labeled True only if every pixel matches. When None, every face is labeled True.
        skipped_frames: (a, b) -- a frames are skipped before the first sampled frame, and the
            second sampled frame is the one at (0-based) position b. The default (4, 9)
            samples the 5th and 10th frames.
        new_max_size: length in pixels of the longest edge of the frame after resizing
            (the aspect ratio is kept, up to rounding)
        face_confidence: detections with a confidence at or below this value are ignored
        face_dim: (width, height) every cropped face is resized to
    returns:
        faces1_resized: list of flattened face arrays (length face_dim[0] * face_dim[1] * 3)
            from the first sampled frame
        faces2_resized: same as above, for the second sampled frame
        face1_match: list of booleans, one per face in faces1_resized (True means real)
        face2_match: list of booleans, one per face in faces2_resized (True means real)
    raises:
        ValueError: if one of the sampled frames cannot be read from a video
    '''
    #number of frames to discard between the two sampled frames
    num_skip_frames = skipped_frames[1] - skipped_frames[0] - 1
    #length of a flattened (3 channel) face array
    flat_len = face_dim[0] * face_dim[1] * 3

    def read_resized_frame(video, link):
        '''
        reads the next frame of the open video, converts it to RGB, and resizes it so that the
        longest edge is new_max_size pixels
        '''
        video.grab()
        retrieved, frame = video.retrieve()
        #fail with a clear message instead of an opaque cv error further down
        if not retrieved or frame is None:
            raise ValueError('could not read a frame from video: {}'.format(link))
        img = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
        original_height, original_width = frame.shape[0], frame.shape[1]
        #aspect ratio -- want to maintain this
        img_size_ratio = original_height / original_width
        if original_height > original_width:
            new_height = new_max_size
            new_width = new_height / img_size_ratio
        else:
            new_width = new_max_size
            new_height = new_width * img_size_ratio
        #the aspect ratio will not match exactly due to rounding, but will be close
        new_dim = (int(new_width), int(new_height))
        return cv.resize(img, new_dim, interpolation=cv.INTER_AREA)

    def sample_frames(link):
        '''
        opens the video and returns the two resized frames selected by skipped_frames
        '''
        video = cv.VideoCapture(link)
        #try/finally so the capture is released even when a frame cannot be read
        try:
            for _ in range(skipped_frames[0]):
                video.grab()
            first_image = read_resized_frame(video, link)
            for _ in range(num_skip_frames):
                video.grab()
            second_image = read_resized_frame(video, link)
        finally:
            video.release()
        return first_image, second_image

    def crop_faces(face_dictionaries, image):
        '''
        crops every sufficiently confident detection out of image and resizes it to face_dim
        inputs:
            face_dictionaries: list of dictionaries that contain the 'box' and 'confidence'
                of detected faces
            image: the (resized) frame to crop the faces from
        returns:
            list of face images, in the same order as the kept detections
        '''
        image_height, image_width = image.shape[0], image.shape[1]
        faces = []
        for detection in face_dictionaries:
            if detection['confidence'] <= face_confidence:
                continue
            #the 'box' of the face is a list of pixel values as: '[x, y, width, height]'
            x, y, width, height = detection['box']
            #clamp the box to the image: a negative start would otherwise be read by numpy
            #as an index from the end of the axis and give an empty/wrong crop
            start_x, start_y = max(x, 0), max(y, 0)
            end_x, end_y = min(x + width, image_width), min(y + height, image_height)
            #nothing of the box is inside the image -- cv.resize would fail on an empty crop
            if end_x <= start_x or end_y <= start_y:
                continue
            face_image = image[start_y:end_y, start_x:end_x]
            faces.append(cv.resize(face_image, face_dim, interpolation=cv.INTER_AREA))
        return faces

    def flatten_and_label(found_faces, real_faces):
        '''
        flattens every found face and labels it True (real) or False (altered);
        when real_faces is None, every face is labeled True
        '''
        flattened = []
        labels = []
        for idx, found_face in enumerate(found_faces):
            flattened.append(found_face.reshape(flat_len,))
            if real_faces is None:
                labels.append(True)
            else:
                #real only if every pixel matches the corresponding crop of the real video
                labels.append(bool((real_faces[idx] == found_face).all()))
        return flattened, labels

    image1, image2 = sample_frames(video_link)
    #face locations are always detected on the video being labeled
    face_box1 = face_detector.detect_faces(image1)
    face_box2 = face_detector.detect_faces(image2)
    found_faces_1 = crop_faces(face_box1, image1)
    found_faces_2 = crop_faces(face_box2, image2)

    if video_link_real is not None:
        #no face detection is needed on the real video: the boxes found above are reused
        real_image1, real_image2 = sample_frames(video_link_real)
        real_faces1 = crop_faces(face_box1, real_image1)
        real_faces2 = crop_faces(face_box2, real_image2)
    else:
        #no corresponding real video was passed in, so all faces are labeled as real
        real_faces1 = None
        real_faces2 = None

    faces1_resized, face1_match = flatten_and_label(found_faces_1, real_faces1)
    faces2_resized, face2_match = flatten_and_label(found_faces_2, real_faces2)
    return faces1_resized, faces2_resized, face1_match, face2_match
                
        

In [None]:
def get_video_link(video_name, aws_key=key_, aws_secret=secret_, bucket=s3_bucket,
                   expires_in=6000, region_name='us-east-2'):
    '''
    ##Intended for use when not using Sagemaker##
    Builds a presigned 'get_object' url for a video stored in the s3 bucket.
    Nothing is downloaded here -- the returned url is what gets passed on to the video reader.

    inputs:
        video_name: key (file name) of the video within the bucket
        aws_key: aws access key id
        aws_secret: aws secret access key
        bucket: name of the s3 bucket that holds the video
        expires_in: number of seconds the url stays valid for
        region_name: aws region used for the s3 client
            (the default region is hardcoded - this is not a security risk to keep public)
    returns:
        video_url: presigned url (string) for the video
    '''
    s3 = boto3.client('s3',
                      aws_access_key_id=aws_key,
                      aws_secret_access_key=aws_secret,
                      region_name=region_name,
                      #the sig version needs to be s3v4 or the url will error
                      config=boto3.session.Config(signature_version='s3v4'))
    video_url = s3.generate_presigned_url('get_object',
                                          Params={'Bucket': bucket,
                                                  'Key': video_name},
                                          ExpiresIn=expires_in)
    return video_url

In [None]:
#collect the flattened face arrays, their real/fake labels, and the video each face came from
face_arrays = []
face_labels = []
video_names = []
#only the first 80 videos are processed for now
#slicing (rather than indexing over range(80)) also works when fewer than 80 videos are listed
for video_name in video_list[:80]:
    video_link = get_video_link(video_name)
    fake_or_real = meta[video_name]['label']
    if fake_or_real == 'FAKE':
        #fake videos are compared against the original video they were made from
        real_video = meta[video_name]['original']
        real_link = get_video_link(real_video)
    else:
        real_link = None
    try:
        faces_5thframe, faces_10thframe, real_faces_5th, real_faces_10th = get_faces_from_video(video_link, real_link)
    except Exception as error:
        #skip videos that cannot be processed; without the 'continue' the faces of the
        #previous video would be appended again under this video's name
        #problem video -- gilzdnbpep.mp4 - investigate
        print('skipping {}: {!r}'.format(video_name, error))
        continue
    #faces from the 5th frame first, then faces from the 10th frame
    for frame_faces, frame_labels in ((faces_5thframe, real_faces_5th),
                                      (faces_10thframe, real_faces_10th)):
        face_arrays.extend(frame_faces)
        face_labels.extend(frame_labels)
        video_names.extend([video_name] * len(frame_faces))

#save the face arrays to an .npz file, and the labels / video names to a csv
npz_filepath = 'face_arrays.npz'
np.savez(npz_filepath, *face_arrays)
output_path_csv = 'labels_and_video_names_test.csv'
df = pd.DataFrame({'face_labels': face_labels, 'video_names': video_names})
df.to_csv(output_path_csv, index=False)

In [None]:
#upload both output files to the s3 bucket, using each local file name as its object key
s3 = boto3.client('s3', aws_access_key_id=key_, aws_secret_access_key=secret_)
for local_path in (npz_filepath, output_path_csv):
    s3.upload_file(local_path, s3_bucket, local_path)