In [None]:
import pandas as pd
import numpy as np
import boto3
from splinter import Browser
from zipfile import ZipFile 
import os
import time
import requests
import random 
import json
# import io - don't think will use this library
import skvideo.io as sk
from sklearn.model_selection import train_test_split
from skimage.transform import resize
from tensorflow.python import keras
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Flatten, Conv2D, Dropout, Activation
from keras.utils import to_categorical
#sensitive variables in config.py file that is on .gitignore
from config import key_, secret_, s3_bucket, kaggle_cookie


In [None]:
#sagemaker dependencies
# NOTE(review): get_execution_role() is a SageMaker SDK call, so this cell presumably only runs
# inside a SageMaker environment - confirm before running the notebook on a local machine.
from sagemaker import get_execution_role
role = get_execution_role()

# region configured for the default boto3 session (None when no region is configured)
region = boto3.Session().region_name

# NOTE(review): this rebinds the s3_bucket name imported from config in the first cell.
# download_video_from_s3_bucket reads s3_bucket as a default argument, which is bound when its
# `def` cell executes, so running this cell first makes that function use this value.
s3_bucket='myBucket' # Replace with your s3 bucket name


In [None]:
#explore meta.json file
# meta maps each video file name to a metadata record that contains at least a 'label' key
with open('meta.json') as m:
    meta = json.load(m)

# shallow copy of the full metadata, keyed by video file name
video_and_labels = dict(meta)
# video file name -> label string only
video_label_only = {video: record['label'] for video, record in meta.items()}

In [None]:
#may want to consider changing the array to a video file name/path and incorporate opening the video in this function
def preprocess_video(video_array, max_size=315):
    '''
    converts a decoded video into one flattened row of frame-difference features per sampled frame

    the video is first cropped to a centred square whose side is the shorter of the two spatial
    dimensions. every 10th frame starting at index 9 (9, 19, 29, ...) is used as an anchor, as long
    as the frame 5 positions after it exists. for each anchor the function resizes the anchor and
    the frames 3 and 5 positions before and after it to max_size x max_size, and stores the absolute
    differences in the order [back 3, back 5, forward 3, forward 5]

    video_array: 4-dimensional array indexed as (frame, pixel axis 1, pixel axis 2, channel)
    max_size:    side length in pixels of the resized square frames (default 315)

    returns a 2-dimensional array with one row per anchor frame and
    4 * max_size * max_size * channels columns; a video with no usable anchor frame
    (fewer than 15 frames) returns an array with 0 rows and the same number of columns
    '''
    num_frames, x_pixel, y_pixel, num_channels = video_array.shape
    #centre crop to a square whose side is the shorter spatial dimension
    min_pixel = min(x_pixel, y_pixel)
    x_pixel_min = (x_pixel - min_pixel) // 2
    x_pixel_max = (x_pixel + min_pixel) // 2
    y_pixel_min = (y_pixel - min_pixel) // 2
    y_pixel_max = (y_pixel + min_pixel) // 2
    video_box_shape = video_array[:, x_pixel_min:x_pixel_max, y_pixel_min:y_pixel_max, :]

    #offsets relative to the anchor frame, kept in the original feature order
    offsets = (-3, -5, 3, 5)
    frame_list = []
    #stopping at num_frames - 5 guarantees that anchor + 5 is always a valid frame index,
    #the smallest anchor is 9 so anchor - 5 can never wrap around to the end of the video
    for anchor in range(9, num_frames - 5, 10):
        frame_sized = resize(video_box_shape[anchor], [max_size, max_size])
        differences = [
            np.abs(frame_sized - resize(video_box_shape[anchor + offset], [max_size, max_size]))
            for offset in offsets
        ]
        frame_list.append(differences)

    if not frame_list:
        #keep the column count consistent so callers can still stack or feed the result
        return np.empty((0, len(offsets) * max_size * max_size * num_channels))

    frame_list = np.array(frame_list)
    #reshape: flatten (difference, height, width, channel) into a single feature axis
    frame_list_ = frame_list.reshape(frame_list.shape[0], -1)
    return frame_list_

In [None]:
#consider returning a list of arrays, eg process x number of videos at a time
def download_video_from_s3_bucket(video_name, aws_key=key_, aws_secret=secret_, bucket=s3_bucket):
    '''
    ##Intended for use when not using Sagemaker##
    downloads one video from the s3 bucket and returns it decoded as an array

    video_name: object key in the bucket; the same string is used as the temporary local file name
    aws_key:    AWS access key id (defaults to key_ from config)
    aws_secret: AWS secret access key (defaults to secret_ from config)
    bucket:     bucket name (defaults to the value of s3_bucket when this cell was executed)

    returns the array produced by skvideo.io.vread for the downloaded file
    the local copy is always deleted, including when decoding the video fails
    '''
    s3 = boto3.client('s3', aws_access_key_id=aws_key, aws_secret_access_key=aws_secret)
    s3.download_file(bucket, video_name, video_name)
    try:
        return sk.vread(video_name)
    finally:
        #runs on success and on failure so a broken video never leaves a stray file behind
        os.remove(video_name)
    

In [None]:
def get_video(video, computer=True):
    '''
    downloads and preprocesses one video and builds a one-hot label for each of its feature rows

    video:    video file name; must be a key of the notebook-level meta dictionary
    computer: True to download from the s3 bucket with download_video_from_s3_bucket
              False is reserved for a Sagemaker specific loader that does not exist yet

    returns (x_values, y_values_)
        x_values:  output of preprocess_video for the video, one row per sampled frame
        y_values_: array with one row per row of x_values and 2 columns, every row is
                   [1, 0] when the video label is 'FAKE' and [0, 1] otherwise

    raises NotImplementedError when computer is False
    raises KeyError when video is not present in meta
    '''
    if not computer:
        #todo - create function to obtain video via sagemaker notebook instance
        #once created, call that function here instead of raising
        raise NotImplementedError('loading videos through a sagemaker notebook instance is not implemented yet')

    #look the label up before downloading so an unknown video name fails without any network traffic
    label_index = 0 if meta[video]['label'] == 'FAKE' else 1

    response = download_video_from_s3_bucket(video)
    x_values = preprocess_video(response)
    #every sampled frame inherits the label of the whole video
    y_values = np.full(len(x_values), label_index, dtype=int)
    y_values_ = to_categorical(y_values, num_classes=2)
    return x_values, y_values_

In [None]:
def generator(video_dictionary, batch_size=1, train=True):
    '''
    endless generator of (features, labels) batches for model.fit

    video_dictionary: dictionary keyed by video name, or any iterable of video names
    batch_size:       number of videos whose frame rows are stacked into one yielded batch
    train:            True yields from the 90% training split, False from the 10% test split

    the names are split with a fixed random_state, so the training and test generators always
    see disjoint videos. the chosen split is shuffled once up front and again after every full
    pass. each yielded item is (x, y) where x stacks the rows returned by get_video for
    batch_size consecutive videos and y stacks the matching one-hot labels

    raises ValueError on the first next() when batch_size is smaller than 1
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    video_list = list(video_dictionary)
    #split dataset into training and testing sets
    train_videos, test_videos = train_test_split(video_list, test_size=.1, random_state=3)
    #if not training, set the video list to the test set, otherwise set it to the training set
    video_list_ = train_videos if train else test_videos
    random.shuffle(video_list_)

    count = 0
    while True:
        x_batch = []
        y_batch = []
        for _ in range(batch_size):
            if count == len(video_list_):
                #a full pass is finished: start over in a fresh random order
                count = 0
                random.shuffle(video_list_)
            x_video, y_video = get_video(video_list_[count])
            #advance to the next video, otherwise the same video would be yielded forever
            count += 1
            x_batch.append(x_video)
            y_batch.append(y_video)
        yield np.concatenate(x_batch), np.concatenate(y_batch)


In [None]:
#consider eventually hardcoding the x shape
# x, y = get_video('vpmyeepbep.mp4')

In [None]:
# y.shape

In [None]:
# x.shape

In [None]:
# fully connected classifier over the flattened frame-difference rows from preprocess_video
model = Sequential()
# input_dim has to equal the row length produced by preprocess_video with max_size=315:
# 4 difference images * 315 * 315 pixels * 3 channels = 1190700 (assumes 3-channel video - confirm)
model.add(Dense(100, activation='relu', input_dim=4 * 315 * 315 * 3))
# the Dense layer already applies relu, so no separate Activation('relu') layer is needed after it
model.add(Dense(100, activation='relu'))
#output layer: two softmax units matching the one-hot labels built in get_video
model.add(Dense(2, activation='softmax'))
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# generator() never stops yielding, so Keras has to be told how many batches make up one epoch
# and one validation pass; the counts mirror the test_size=.1 split performed inside generator()
n_validation_videos = int(np.ceil(len(meta) * .1))
n_training_videos = len(meta) - n_validation_videos
model.fit(x=generator(meta, 1),
          steps_per_epoch=n_training_videos,
          validation_data=generator(meta, 1, train=False),
          validation_steps=n_validation_videos)

In [None]:
# display labels for the scalar values Keras reports for this model (the loss plus the compiled metrics)
model.metrics_names

In [None]:
# spot check on a single video noted as fake: get_video returns (features, labels), the labels are discarded
x, _ = get_video('xpzfhhwkwb.mp4') # fake video
# one softmax row per sampled frame; column 0 corresponds to 'FAKE' and column 1 to every other label,
# following the 0/1 encoding used in get_video
model.predict(x)

In [None]:
# spot check on a single video noted as real: get_video returns (features, labels), the labels are discarded
x, _ = get_video('xmkwsnuzyq.mp4')
# one softmax row per sampled frame; column 0 corresponds to 'FAKE' and column 1 to every other label,
# following the 0/1 encoding used in get_video
model.predict(x) #real video

In [None]:
# print the layer-by-layer overview of the model (layer types, output shapes, parameter counts)
model.summary()

In [None]:
# serialise the model architecture to a JSON string, shown as the cell output; the result is not stored
model.to_json()

In [None]:
# model.save_weights('sample_model.m5')
# collect every video name (the keys of meta) in a list; list(meta) avoids a notebook-level loop
# variable, so the global x that holds the last prediction input is no longer overwritten
video_list = list(meta)