In [101]:
import os
import sys
import subprocess

import pandas as pd
import numpy as np
import math
import functools

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio

In [102]:
# Pick the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('the device being used is: ', device)

the device being used is:  cuda


In [103]:
# Checkpoint file that a later cell reads with torch.load to restore the model state dict.
saved_model_path = './state_lr-0.003_2020-03-24-14-08-18.tar'
# Directory of audio clips; in this notebook it is only referenced by the
# commented-out evaluation loop.
# NOTE(review): absolute, machine-specific path — it will not resolve on another machine.
test_audio_dir = '/home/laurence/Documents/machine-learning/deepfake/soundwaves/data/audio_dfdc_train_part_49/'
# Directory in which extract_audio writes the .wav file it produces.
output_dir = './output/'
# Directory in which extract_audio looks up the video by name.
input_dir = './input/'

In [104]:
def create_windowed_tensor(input, window_size):
    """Split a 2-D signal into fixed-size, zero-padded windows along its last axis.

    Args:
        input: 2-D tensor of shape (channels, samples).
        window_size: number of samples per window; must be positive.

    Returns:
        A new tensor of shape (num_windows, channels, window_size), where
        num_windows = ceil(samples / window_size). The tail of the last window
        is filled with zeros when `samples` is not a multiple of `window_size`.
        The result has the same dtype and device as `input`. An input with zero
        samples yields an empty (0, channels, window_size) tensor.

    Raises:
        ValueError: if `window_size` is not positive.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")

    channels, samples = input.shape[0], input.shape[1]
    windows_num = math.ceil(samples / window_size)

    # Allocate through new_zeros so the padding inherits the dtype and device of
    # `input`; a plain torch.zeros would always be a default-dtype CPU tensor.
    padded = input.new_zeros((channels, windows_num * window_size))
    padded[:, :samples] = input

    # (channels, windows * size) -> (channels, windows, size) -> (windows, channels, size)
    windows = padded.reshape(channels, windows_num, window_size)
    return windows.permute(1, 0, 2).contiguous()

In [105]:
def feed_input(video, window_size=32000):
    """Load an audio file and cut it into fixed-size windows.

    Args:
        video: path of the audio file to load. Despite the name, the value is
            handed straight to torchaudio.load.
        window_size: number of samples per window. Defaults to 32000, the
            value that used to be hard-coded here.

    Returns:
        The tensor returned by create_windowed_tensor for the loaded waveform.
    """
    # NOTE(review): `out` and `normalization` look like the legacy
    # torchaudio.load signature — confirm against the installed torchaudio version.
    waveform, _sample_rate = torchaudio.load(
        video, out=None, normalization=True, channels_first=False
    )
    # The file is loaded with channels_first=False (presumably (samples, channels));
    # swap the two axes so the windowing runs along the second axis.
    waveform = waveform.permute(1, 0)
    return create_windowed_tensor(waveform, window_size)

In [106]:
class Net(nn.Module):
    """1-D convolutional classifier that outputs two-class log-probabilities.

    Five Conv1d -> BatchNorm1d -> ReLU stages (max-pooled after stages 1, 2
    and 5) feed three linear layers. The first linear layer expects exactly
    256 * 2 features, so the input length must reduce to two time steps after
    the last pooling layer; 32000-sample windows satisfy this.

    Attribute names are part of the checkpoint format (state-dict keys) and
    must not be renamed.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16, 64, stride=2)
        self.bn1 = nn.BatchNorm1d(16)
        self.pool1 = nn.MaxPool1d(8, stride=8)
        self.conv2 = nn.Conv1d(16, 32, 32, stride=2)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool2 = nn.MaxPool1d(8, stride=8)
        self.conv3 = nn.Conv1d(32, 64, 16, stride=2)
        self.bn3 = nn.BatchNorm1d(64)
        self.conv4 = nn.Conv1d(64, 128, 8, stride=2)
        self.bn4 = nn.BatchNorm1d(128)
        self.conv5 = nn.Conv1d(128, 256, 4, stride=2)
        self.bn5 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4, stride=4)
        self.dropout = nn.Dropout(p=0.25)
        # 256 channels x 2 remaining time steps after pool3.
        self.input_linear = 256*2
        self.fc1 = nn.Linear(self.input_linear, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        """Run the network on a batch of single-channel windows.

        Args:
            x: tensor of shape (batch, 1, samples).

        Returns:
            Tensor of shape (batch, 2) holding log-probabilities over the two
            classes (log-softmax along dim 1).

        Raises:
            RuntimeError: if the flattened feature size differs from
                `self.input_linear` (input length not supported).
        """
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.pool3(F.relu(self.bn5(self.conv5(x))))

        # Flatten per sample rather than view(-1, input_linear): a wrong input
        # length now fails in fc1 instead of silently folding extra time steps
        # into the batch dimension.
        x = torch.flatten(x, start_dim=1)

        # Dropout is applied to the fc outputs; there is no non-linearity
        # between the linear layers. Left unchanged so existing checkpoints
        # keep producing the same outputs.
        x = self.dropout(self.fc1(x))
        x = self.dropout(self.fc2(x))
        x = self.fc3(x)

        # Explicit dim: the implicit-dimension form is deprecated and warns.
        return F.log_softmax(x, dim=1)

# Build the network on the selected device and keep a handle on its
# parameter iterator; Module.to returns the module itself.
model = Net().to(device)
params = model.parameters()
print(model)

Net(
  (conv1): Conv1d(1, 16, kernel_size=(64,), stride=(2,))
  (bn1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(16, 32, kernel_size=(32,), stride=(2,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(16,), stride=(2,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv1d(64, 128, kernel_size=(8,), stride=(2,))
  (bn4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv5): Conv1d(128, 256, kernel_size=(4,), stride=(2,))
  (bn5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)


In [107]:
# Restore the saved model state from the checkpoint file.
# map_location=device remaps the stored tensors onto the device selected for
# this session, so a checkpoint written on a GPU still loads on a CPU-only machine.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint = torch.load(saved_model_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

# Switch to evaluation mode (affects the Dropout and BatchNorm layers). As the
# last expression of the cell, the returned module is also displayed.
model.eval()

Net(
  (conv1): Conv1d(1, 16, kernel_size=(64,), stride=(2,))
  (bn1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(16, 32, kernel_size=(32,), stride=(2,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(16,), stride=(2,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv1d(64, 128, kernel_size=(8,), stride=(2,))
  (bn4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv5): Conv1d(128, 256, kernel_size=(4,), stride=(2,))
  (bn5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)


In [108]:
# labels = pd.read_pickle('/home/laurence/Documents/machine-learning/deepfake/soundwaves/data/dfdc_train_part_49/output/uniques_pickle.pkl')
# correct = 0
# correct_finals = [0,0,0]
# count_windows = 0
# count = 0
# for filename in os.listdir(test_audio_dir):
#     count += 1
#     no_ext = filename[:-4]
#     for label in labels:
#         if label[0] == no_ext:
#             labelino = label[1]
#             # print(f'the label for {no_ext} is {label[1]}')
#     data = feed_input(test_audio_dir + filename)
#     data = data.to(device)
#     output = model(data)
#     window_predictions = []
#     for window in output:
#         count_windows += 1
#         prediction = torch.argmax(window)
#         window_predictions.append(prediction)
#         if labelino == prediction.item():
#             correct += 1
#     fake_window_num = window_predictions.count(1)
#     final_prediction = [0,0,0]
#     if fake_window_num == 1:
#         final_prediction[0] += 1
#     elif fake_window_num == 2:
#         final_prediction[0] += 1
#         final_prediction[1] += 1
#     elif fake_window_num > 2:
#         final_prediction[0] += 1
#         final_prediction[1] += 1
#         final_prediction[2] += 1

#     for final_pred_idx in range(0, 3):
#         if final_prediction[final_pred_idx] == labelino:
#             for correct_idx in range (0, final_pred_idx+1):
#                 correct_finals[correct_idx] += 1

    
# print(f'there have been {correct} correct window classifications over the whole {count_windows} range')
# for cf in correct_finals:
#     print(f'there have been {cf} correct classifications over the whole {count} range')

In [109]:
def extract_audio(video_name):
    """Extract the audio track of a video in `input_dir` to a .wav in `output_dir`.

    Args:
        video_name: file name of the video, relative to `input_dir`.

    Returns:
        Path of the .wav file: `output_dir` joined with the video name with its
        extension replaced by ".wav". The path is returned even when no audio
        was detected or the extraction did not produce a file, so callers
        should not assume the file exists.

    Raises:
        FileNotFoundError: if the ffprobe (or ffmpeg) executable cannot be started.
    """
    video_path = os.path.join(input_dir, video_name)
    # splitext instead of slicing off four characters, so extensions of any
    # length (or a missing extension) are handled.
    stem, _extension = os.path.splitext(video_name)
    output_wav_path = os.path.join(output_dir, stem + ".wav")

    # Argument list, no shell: file names containing spaces or shell
    # metacharacters are passed through verbatim and cannot inject commands.
    probe = subprocess.run(
        ['ffprobe', '-show_streams', '-print_format', 'json', video_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    # stdout and stderr are merged and searched for "Audio" — the same
    # condition the former `| grep -o 'Audio'` shell pipeline evaluated.
    probe_output = probe.stdout.decode('utf-8', errors='replace')

    if "Audio" in probe_output:
        if not os.path.exists(output_wav_path):
            print("extracting audio from video")
            # ffmpeg does not create missing output directories.
            os.makedirs(os.path.dirname(output_wav_path) or '.', exist_ok=True)
            subprocess.call(['ffmpeg', '-i', video_path, '-max_muxing_queue_size', '9999', output_wav_path])
    return output_wav_path

In [110]:
def windows_prediction_aggregator(window_predictions):
    """Collapse per-window class predictions into one overall prediction.

    Args:
        window_predictions: iterable of per-window predictions, each
            comparable to the integer 1.

    Returns:
        1 if at least one window prediction equals 1, otherwise 0.
    """
    # Every element is compared (no early exit), as in a plain loop.
    flagged = [1 for single_pred in window_predictions if single_pred == 1]
    return 1 if flagged else 0

In [116]:
def predict(video_name):
    """Classify a video as real or fake from its audio track and print the verdict.

    Args:
        video_name: file name of the video, as accepted by extract_audio.

    Returns:
        The aggregated prediction: 0 (reported as "real") or 1 (reported as "fake").
    """
    # Create the audio file (if needed) and get its path.
    audio_path = extract_audio(video_name)
    # Load the audio as a tensor split into windows and move it to the model's device.
    data = feed_input(audio_path).to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        window_output = model(data)
    # The model returns one row of two log-probabilities per window; reduce each
    # row to its class index before aggregating. Passing the raw rows made the
    # aggregator's `== 1` comparison ambiguous on 2-element tensors.
    window_predictions = torch.argmax(window_output, dim=1).tolist()
    # If any window is predicted to be fake, the whole video is considered fake.
    prediction = windows_prediction_aggregator(window_predictions)
    print(f'The video {video_name} is predicted to be {"real" if prediction == 0 else "fake"}')
    return prediction

In [118]:
# Run the whole pipeline on one video; the name is resolved relative to input_dir.
predict('1.mp4')
# NOTE: the call below passes a path that already contains the input directory;
# extract_audio joins it onto the directories again, which is how the recorded
# output './output/./input/1.wav' came about.
# extract_audio('./input/1.mp4')

'./output/./input/1.wav'

extracting audio from video
