In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import storage
from google.oauth2 import service_account
import os

In [2]:
from google.cloud import storage
# Authenticate against Google Cloud.
# On Colab the interactive user login is used, so no explicit credentials
# object is needed (credentials stays None). Anywhere else the import of
# google.colab fails and a service-account key file is loaded instead.
try:
    from google.colab import auth
    auth.authenticate_user()
    credentials = None
except ModuleNotFoundError:
    # Location of the GCS private key. The GOOGLE_APPLICATION_CREDENTIALS
    # environment variable takes precedence so the notebook is not tied to
    # one machine; the original hard-coded path is kept as the fallback.
    key_file_path = os.environ.get(
        'GOOGLE_APPLICATION_CREDENTIALS',
        '/Users/jeremiahherberg/Downloads/deepfake-research-a6b5e4f02da6.json')
    credentials = service_account.Credentials.from_service_account_file(key_file_path)

In [3]:
# Names of the GCS buckets used by this notebook.
video_frame_bucket_name = 'deepfake-dataset'  # bucket containing test data
model_bucket_name = 'jh-gan-testing'  # bucket containing model

# A single storage client is used to build a handle for each bucket.
client = storage.Client(credentials=credentials, project='deepfake-research')
video_frame_bucket = client.bucket(bucket_name=video_frame_bucket_name)
model_bucket = client.bucket(bucket_name=model_bucket_name)

In [4]:
# Fetch the saved model file from the model bucket into the working
# directory (under the same file name), then load it with Keras.
model_name = 'disc_4.h5'
model_blob = model_bucket.blob(blob_name=model_name)
model_blob.download_to_filename(filename=model_name)
model = tf.keras.models.load_model(model_name)





In [5]:
def make_single_prediction(frame,
                          frame_shape=(225, 146, 3),
                          model_input_shape=(192, 128, 3),
                          model=model):
    '''
    Predict whether one frame of the test dataset is real or fake.

    inputs:
        frame - array that can be reshaped to frame_shape, i.e. holding
        frame_shape[0] * frame_shape[1] * frame_shape[2] values

        frame_shape - tuple (height, width, channels) of the stored image,
        defaults to (225, 146, 3); channels must be 3 (RGB)

        model_input_shape - tuple (height, width, channels) the model expects,
        defaults to (192, 128, 3); channels must be 3 (RGB)

        model - tensorflow model used for the prediction, defaults to the
        notebook-level 'model' (bound when this function is defined)
    returns:
        prediction - bool, True when raw_prediction is greater than 0
        (frame predicted real), False otherwise (frame predicted fake)

        raw_prediction - the model output for the frame; a negative value
        means "fake", a positive value means "real", and values closer to 0
        indicate a less confident model
    '''
    frame_channels = frame_shape[2]
    model_channels = model_input_shape[2]
    assert(frame_channels == model_channels == 3), 'the last dim must equal 3 in the model and frame shapes'

    target_height, target_width = model_input_shape[0], model_input_shape[1]

    # flat array -> (H, W, RGB), then resize (with padding) to the model's input size
    image = frame.reshape(frame_shape)
    image = tf.image.resize_with_pad(image, target_height, target_width)

    # add the batch axis: (H, W, RGB) -> (1, H, W, RGB)
    batch = tf.expand_dims(image, 0)
    # rescale pixel values from 0..255 to -1..1 (this is how the model was trained)
    batch = (tf.cast(batch, tf.float32) - 127.5) / 127.5

    # the batch holds one frame, so take the first value of the first row
    raw_prediction = model.predict(batch)[0][0]
    prediction = True if raw_prediction > 0 else False

    return prediction, raw_prediction
    

In [6]:
def prediction_batch(idx,
                    bucket=video_frame_bucket,
                    test_ds_file_name='face_arrays',
                    ground_file_name='labels_and_video_names'):
    '''
    function to make a batch of predictions
    test dataset is split up into 5 files

    inputs:
        idx - int with a value between 1 to 5 that represents which test ds file is being used for predictions

        bucket - GCS bucket variable that contains files with test data, default is video_frame_bucket

        test_ds_file_name - str - beginning of file name of files containing test ds, default is face_arrays

        ground_file_name - str - beginning of file name of files containing the ground truth of each frame,
        default is labels_and_video_names

    returns:
        correct_real - int - the number of real frames correctly predicted

        total_real - int - total number of real frames

        correct_fake - int - the number of fake frames correctly predicted

        total_fake - int - total number of fake frames

        prediction_df - DataFrame with the following columns:
            raw_prediction - float, range of -inf to inf, negative number indicates the prediction
            is that the frame is fake and positive indicates the prediction is that the frame is real;
            the closer the value is to 0, the less confident the model is regarding the prediction

            accurate - Bool - reflects if the model accurately predicted if the frame is real or fake

            ground_truth - Bool - reflects if the frame is real or fake

    side effects:
        downloads '<test_ds_file_name><idx>.npz' and '<ground_file_name><idx>.csv' from the
        'aaa_frames/' prefix of bucket into the working directory, and prints running totals
        after every 1000 frames
    '''
    # download files (dataset first, then the ground truth labels):
    ds_file_name = '{}{}.npz'.format(test_ds_file_name, idx)
    labels_file_name = '{}{}.csv'.format(ground_file_name, idx)
    for file_name in (ds_file_name, labels_file_name):
        bucket.blob('aaa_frames/' + file_name).download_to_filename(file_name)

    # np.load on an .npz returns an NpzFile that keeps the archive open;
    # the context manager closes it once every array has been read out.
    with np.load(ds_file_name) as data_file:
        data = [data_file[key] for key in data_file]
    ground_list = pd.read_csv(labels_file_name)['face_labels'].to_list()
    assert(len(ground_list) == len(data)), 'The len of the DS and ground truth do not match'

    total_real, total_fake, correct_real, correct_fake = 0, 0, 0, 0
    predictions = []
    ground_truths = []
    accurate_predictions = []

    for frame, (frame_data, real) in enumerate(zip(data, ground_list)):
        prediction, raw_prediction = make_single_prediction(frame_data)
        accurate_prediction = (real == prediction)
        if real:
            total_real += 1
            if accurate_prediction:
                correct_real += 1
        else:
            total_fake += 1
            if accurate_prediction:
                correct_fake += 1
        predictions.append(raw_prediction)
        ground_truths.append(real)
        accurate_predictions.append(accurate_prediction)
        # progress report after every 1000th frame
        if frame % 1000 == 999:
            print('real frames:')
            print('accurate: {}, total: {}'.format(correct_real, total_real))
            print('fake frames:')
            print('accurate: {}, total: {} '.format(correct_fake, total_fake))

    prediction_df = pd.DataFrame({
        'raw_prediction': predictions,
        'accurate': accurate_predictions,
        'ground_truth': ground_truths
    })

    return correct_real, total_real, correct_fake, total_fake, prediction_df
    
    
    

In [7]:
def make_predictions(start, end):
    '''
    function to make predictions on entire dataset

    Calls prediction_batch once for every file number from start to end
    (both inclusive) and accumulates the per-file results.

    inputs:
        start - int, starting file number in test dataset

        end - int, ending file number in test dataset, must be greater than or equal to start

    returns (in this order - note the totals come first, which differs from
    the order returned by prediction_batch):
        total_real - int - total number of real frames

        total_fake - int - total number of fake frames

        correct_real - int - the number of real frames correctly predicted

        correct_fake - int - the number of fake frames correctly predicted

        prediction_df - DataFrame (the per-file frames concatenated in file order,
        each keeping its own row index) with the following columns:
            raw_prediction - float, range of -inf to inf, negative number indicates the prediction
            is that the frame is fake and positive indicates the prediction is that the frame is real;
            the closer the value is to 0, the less confident the model is regarding the prediction

            accurate - Bool - reflects if the model accurately predicted if the frame is real or fake

            ground_truth - Bool - reflects if the frame is real or fake
    '''

    assert(start <= end), 'start must be less than or equal to end'

    total_real, total_fake, correct_real, correct_fake = 0, 0, 0, 0
    prediction_dfs = []
    for file in range(start, end + 1):
        # prediction_batch returns (correct_real, total_real, correct_fake, total_fake, df)
        cor_real_batch, real_batch, cor_fake_batch, fake_batch, df_batch = prediction_batch(file)
        total_real += real_batch
        total_fake += fake_batch
        correct_real += cor_real_batch
        correct_fake += cor_fake_batch
        prediction_dfs.append(df_batch)

    prediction_df = pd.concat(prediction_dfs)

    return total_real, total_fake, correct_real, correct_fake, prediction_df

In [8]:
# Run the evaluation over test files 1 through 5 with ops placed on the first GPU.
with tf.device('/device:GPU:0'):
    evaluation = make_predictions(start=1, end=5)
total_real, total_fake, correct_real, correct_fake, prediction_df = evaluation

real frames:
accurate: 44, total: 52
fake frames:
accurate: 190, total: 948 
real frames:
accurate: 112, total: 127
fake frames:
accurate: 362, total: 1873 
real frames:
accurate: 181, total: 206
fake frames:
accurate: 607, total: 2794 
real frames:
accurate: 244, total: 281
fake frames:
accurate: 911, total: 3719 
real frames:
accurate: 287, total: 333
fake frames:
accurate: 1230, total: 4667 
real frames:
accurate: 320, total: 373
fake frames:
accurate: 1573, total: 5627 
real frames:
accurate: 374, total: 443
fake frames:
accurate: 1732, total: 6557 
real frames:
accurate: 436, total: 513
fake frames:
accurate: 1861, total: 7487 
real frames:
accurate: 495, total: 579
fake frames:
accurate: 1995, total: 8421 
real frames:
accurate: 557, total: 674
fake frames:
accurate: 2143, total: 9326 
real frames:
accurate: 764, total: 946
fake frames:
accurate: 2221, total: 10054 
real frames:
accurate: 841, total: 1024
fake frames:
accurate: 2238, total: 10976 
real frames:
accurate: 944, tota

In [9]:
# Report the correct/total counts for real frames, then for fake frames.
summary_template = '{}/{} real frames predicted correctly, and {}/{} fake frames predicted correctly'
print(summary_template.format(correct_real, total_real, correct_fake, total_fake))

32182/42036 real frames predicted correctly, and 44337/222156 fake frames predicted correctly


In [10]:
# Write the per-frame prediction table to a CSV file in the working directory.
prediction_df.to_csv(path_or_buf='predictions1.csv')