In [25]:
import os
import subprocess
import sys

import cv2
import librosa
import numpy as np
import torch
import torch.nn as nn
from IPython.display import Video
from PIL import Image
from scipy.io import wavfile

# the model code lives in a sibling directory; make it importable first
sys.path.append("upmixing-final")

from unet import UpmixResnet18Scratch

In [26]:
# Demo configuration: the clip to process and the folder that receives every generated file.
VIDEO_PATH = "leftrightdemo3.mp4"
SAVE_DIR = "demo"

# create the output folder on first run; nothing happens when it is already there
os.makedirs(SAVE_DIR, exist_ok=True)

In [27]:
# preprocess video using ffmpeg: re-encode the input at a constant 30 fps
video_input_path = os.path.splitext(os.path.basename(VIDEO_PATH))[0] + "_30fps"
video_input_path = os.path.join(SAVE_DIR, video_input_path + ".mp4")
# Arguments are passed as a list (no shell) so paths containing spaces or shell
# metacharacters are handled correctly.
# -y overwrites a stale output instead of prompting, so the cell can be re-run.
cmd = ["ffmpeg", "-y", "-i", VIDEO_PATH, "-filter:v", "fps=fps=30", "-strict", "-2", video_input_path]
print("Running:", " ".join(cmd))
# check=True raises CalledProcessError if ffmpeg fails, instead of letting later
# cells run on a missing file; the return code (0) is still shown as cell output.
subprocess.run(cmd, check=True).returncode

Running in shell: ffmpeg -i leftrightdemo3.mp4 -filter:v fps=fps=30 -strict -2 demo/leftrightdemo3_30fps.mp4


0

In [28]:
# extract audio using ffmpeg: 192 kbit/s MP3 of the 30 fps video's audio track (-vn drops the video)
audio_input_path = os.path.splitext(os.path.basename(video_input_path))[0]
audio_input_path = os.path.join(SAVE_DIR, audio_input_path + ".mp3")
# Arguments are passed as a list (no shell) so paths containing spaces or shell
# metacharacters are handled correctly.
# -y overwrites a stale output instead of prompting, so the cell can be re-run.
cmd = ["ffmpeg", "-y", "-i", video_input_path, "-f", "mp3", "-ab", "192000", "-vn", audio_input_path]
print("Running:", " ".join(cmd))
# check=True raises CalledProcessError if ffmpeg fails; the return code (0) is
# still shown as cell output on success.
subprocess.run(cmd, check=True).returncode

Running in shell: ffmpeg -i demo/leftrightdemo3_30fps.mp4 -f mp3 -ab 192000 -vn demo/leftrightdemo3_30fps.mp3


0

In [29]:
# check video: show the 30 fps re-encoded clip (video_input_path) inline as this cell's output
Video(video_input_path)

In [30]:
# initialize video clip loader
    
def load_video(videofile):
    """Decode every frame of ``videofile`` into one normalised array.

    Each decoded frame is resized to 224x224 and the stacked result is divided
    by 255.

    NOTE(review): frames are used in the channel order cv2 returns them (no
    colour conversion is applied here) -- confirm this matches what the model
    was trained on.

    Args:
        videofile: path of the video to read.

    Returns:
        ``(frames, n_frames)`` where ``frames`` has shape
        ``(n_frames, 224, 224, channels)`` and ``n_frames`` is the number of
        frames that were actually decoded.

    Raises:
        ValueError: if no frame could be decoded (missing/unreadable file).
    """
    capture = cv2.VideoCapture(videofile)
    frames = []
    try:
        # Read until the decoder runs dry rather than trusting
        # CAP_PROP_FRAME_COUNT, which is container metadata and can disagree
        # with the number of frames that really decode.
        while True:
            ret, frame = capture.read()
            if not ret:
                break
            # resize immediately so full-resolution frames are not all held in memory
            frames.append(Image.fromarray(np.uint8(frame)).resize((224, 224)))
    finally:
        # always hand the file/decoder back, even if decoding raised
        capture.release()

    if not frames:
        raise ValueError("no frames could be decoded from %r" % (videofile,))

    v_tensor = np.stack(frames) / 255

    # report the decoded count so callers index within the returned array
    return v_tensor, len(frames)

def load_audio(audiofile):
    """Load ``audiofile`` as peak-normalised, two-row audio at 16 kHz.

    The file is read through ``librosa.load`` with ``sr=16000, mono=True`` and
    scaled so that its largest absolute sample is 1. The single channel is then
    duplicated so the result always has shape ``(2, n_samples)``.

    Args:
        audiofile: path of the audio file to read.

    Returns:
        Array of shape ``(2, n_samples)`` whose two rows are identical.
    """
    audio, _ = librosa.load(audiofile, sr=16000, mono=True)

    # Skip normalisation for silent (or empty) audio: dividing by a zero peak
    # would fill the signal with NaNs that then propagate through the STFT.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak

    if audio.ndim == 1:  # one-channel input
        audio = np.stack((audio, audio), axis=0)
    return audio

class ClipGenerator(object):
    """Slice a video/audio pair into fixed-length, overlapping clips.

    Every clip is a dict (see ``get_clip``) holding the sub-sampled video
    frames, the raw two-row audio segment and the STFTs of the channel sum and
    channel difference as torch tensors.

    Args:
        video_fps: frame rate the video is assumed to have.
        video_downsample_factor: keep every n-th frame inside a clip.
        audio_sr: sample rate the audio is assumed to have.
        clip_length: clip duration in seconds.
        hop_length: time between the starts of consecutive clips, in seconds.
    """

    # STFT settings shared by analysis (_get_stft) and synthesis (stft_to_waveform)
    N_FFT = 512
    STFT_HOP_LENGTH = 160
    STFT_WIN_LENGTH = 400

    def __init__(self, video_fps=30, video_downsample_factor=5, audio_sr=16000, clip_length=2.87, hop_length=2):
        self.video_fps = video_fps
        self.video_downsample_factor = video_downsample_factor
        self.audio_sr = audio_sr
        self.clip_length = clip_length
        self.hop_length = hop_length

        # clip size expressed in video frames / audio samples
        self.n_video_frames = int(video_fps*clip_length)
        self.n_audio_samples = int(audio_sr*clip_length)

        # distance between consecutive clip starts, in video frames
        self.n_video_frames_hop = int(video_fps*hop_length)

    def generator(self, videofile, audiofile):
        """Yield clips covering the whole file.

        Clips start every ``n_video_frames_hop`` frames; a final clip aligned
        with the end of the video (and of the audio) is always yielded so the
        tail is covered.

        Raises:
            ValueError: if the video or the audio is shorter than one clip.
        """
        video, total_frames = load_video(videofile)
        audio = load_audio(audiofile)

        # never index past the frames that were actually decoded
        total_frames = min(total_frames, len(video))

        last_start = total_frames - self.n_video_frames
        if last_start < 0 or audio.shape[1] < self.n_audio_samples:
            # a negative start would silently wrap around and produce a
            # truncated, misaligned clip
            raise ValueError(
                "input is shorter than one clip: need at least %d video frames and %d audio samples, got %d and %d"
                % (self.n_video_frames, self.n_audio_samples, total_frames, audio.shape[1]))

        start_idx = 0
        while start_idx < last_start:
            yield self.get_clip(video, audio, start_idx)
            start_idx += self.n_video_frames_hop

        yield self.get_clip(video, audio, last_start, True)

    def get_clip(self, video, audio, start_idx, last_clip=False):
        """Build the clip dict for the window starting at video frame ``start_idx``.

        Args:
            video: frame array indexed as ``(frame, height, width, channel)``.
            audio: array of shape ``(2, n_samples)``.
            start_idx: first video frame of the clip.
            last_clip: when True the audio window is aligned with the end of
                the audio instead of being derived from ``start_idx``.

        Returns:
            dict with the frame range (``start_frame``/``end_frame``), the
            audio sample range (``start_audio_frame``/``end_audio_frame``), the
            batched ``video`` tensor, the raw ``audio`` segment and the batched
            ``audio_sum_spec`` / ``audio_diff_spec`` tensors.
        """
        videoclip = video[start_idx : start_idx+self.n_video_frames : self.video_downsample_factor]
        videoclip = torch.from_numpy(videoclip).float()
        # (frame, height, width, channel) -> (channel, frame, height, width)
        videoclip = videoclip.permute(3,0,1,2)

        # latest start that still leaves a full-length audio window
        latest_audio_start = max(audio.shape[1]-self.n_audio_samples, 0)
        if last_clip:
            audio_start_idx = latest_audio_start
        else:
            # clamp so a track slightly shorter than the video still yields a
            # full-length window instead of a truncated one
            audio_start_idx = min(int(start_idx*self.audio_sr/self.video_fps), latest_audio_start)

        audioclip = audio[:, audio_start_idx : audio_start_idx+self.n_audio_samples]

        audio_sum = audioclip[0] + audioclip[1]
        audio_sum_spec = self._get_stft(audio_sum)
        # (real/imag, freq, time) -> (real/imag, time, freq)
        audio_sum_spec = torch.from_numpy(audio_sum_spec).float().permute(0,2,1)

        audio_diff = audioclip[0] - audioclip[1]
        audio_diff_spec = self._get_stft(audio_diff)
        audio_diff_spec = torch.from_numpy(audio_diff_spec).float().permute(0,2,1)

        return {'start_frame': start_idx, 'end_frame': start_idx+self.n_video_frames, 
                'start_audio_frame': audio_start_idx, 'end_audio_frame': audio_start_idx+self.n_audio_samples,
                'video': videoclip.unsqueeze(0), 'audio': audioclip, 
                'audio_sum_spec': audio_sum_spec.unsqueeze(0), 'audio_diff_spec': audio_diff_spec.unsqueeze(0)}

    def _get_stft(self, raw):
        """Return the STFT of ``raw`` as a real array ``(2, N_FFT // 2, n_frames)``.

        Row 0 holds the real part and row 1 the imaginary part; the highest
        (Nyquist) frequency bin is dropped.
        """
        # n_fft is passed by keyword: recent librosa releases reject it as a
        # positional argument, while older ones accept either form.
        stft = librosa.stft(np.ascontiguousarray(raw), n_fft=self.N_FFT, hop_length=self.STFT_HOP_LENGTH,
                            win_length=self.STFT_WIN_LENGTH, center=True)
        return np.stack((np.real(stft), np.imag(stft)))[:,:-1,:]

    def stft_to_waveform(self, stft):
        """Invert a ``(2, freq, time)`` real/imaginary spectrogram to a waveform."""
        stft = stft[0,:,:] + (1j * stft[1,:,:])
        # _get_stft removed the Nyquist bin. istft infers the FFT size from the
        # number of bins, so without restoring it the inverse would run with
        # n_fft = 510 instead of the 512 used for analysis. Put back an empty bin.
        if stft.shape[0] == self.N_FFT // 2:
            stft = np.pad(stft, ((0, 1), (0, 0)), mode="constant")
        raw = librosa.istft(stft, hop_length=self.STFT_HOP_LENGTH, win_length=self.STFT_WIN_LENGTH, center=True)
        return raw
    
# shared instance built with the constructor defaults; later cells call its generator() and stft_to_waveform()
clip_generator = ClipGenerator()

In [31]:
# load model
# Run on the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also works on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = UpmixResnet18Scratch()
pretrained = ("models/upmixing-final-exp-1-flip-checkpoint-best.pth.tar")
# The weights are loaded through a DataParallel wrapper and the bare module is
# unwrapped afterwards -- presumably because the checkpoint was saved from a
# DataParallel model and its keys carry the "module." prefix.
avnet = nn.DataParallel(model)
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
# map_location remaps tensors that were saved on a device this machine does not have.
checkpoint = torch.load(pretrained, map_location=device)
avnet.load_state_dict(checkpoint['state_dict'])
print("loaded pretrained model from", pretrained)
model = avnet.module
model.to(device)
# eval() returns the model, so its structure is shown as the cell output
model.eval()

Loaded 2D Resnet18 - training weights from scratch
loaded pretrained model from models/upmixing-final-exp-1-flip-checkpoint-best.pth.tar


UpmixResnet18Scratch(
  (videonet): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_runni

In [32]:
### upmix audio

# load mono audio; load_audio returns two identical rows, so their sum is the mixture signal
audio = load_audio(audio_input_path)
audio_sum = audio[0] + audio[1]

# generate L/R channel difference (filled in clip by clip below)
audio_diff = np.zeros((len(audio_sum),))

# gain applied to the predicted difference signal; 1.0 leaves the prediction unchanged
DIFF_GAIN = 1.0

# inputs handed to the model: the names it declares in model.keys plus the difference spectrogram
keys = model.keys + ['audio_diff_spec']
# feed the inputs to whichever device the model's weights live on
device = next(model.parameters()).device

loader = clip_generator.generator(video_input_path, audio_input_path)
# inference only: no_grad avoids building autograd graphs for every clip
with torch.no_grad():
    for sample in loader:
        inputs = {k: sample[k].to(device) for k in keys}
        out = model(inputs)
        start_audio_frame = sample['start_audio_frame']
        end_audio_frame = sample['end_audio_frame']

        # drop the batch axis and swap the last two axes back to the
        # (real/imag, freq, time) layout stft_to_waveform expects
        diff = out['pred'].squeeze(0).permute(0,2,1).cpu().numpy()
        diff = clip_generator.stft_to_waveform(diff)
        # consecutive clips overlap, so a later clip overwrites the overlapping
        # tail written by the previous one
        audio_diff[start_audio_frame:end_audio_frame] = diff

# adjust magnitude of audio diff
audio_diff = audio_diff*DIFF_GAIN

# row 0 = (sum + diff)/2, row 1 = (sum - diff)/2
audio_pred = np.stack(((audio_sum + audio_diff)/2, (audio_sum - audio_diff)/2))

audio_output_path = os.path.splitext(os.path.basename(audio_input_path))[0]
audio_output_path = os.path.join(SAVE_DIR, audio_output_path + "_pred.wav")
# librosa.output.write_wav was removed in librosa 0.8; scipy's writer takes
# (n_samples, n_channels) data, and float32 input produces a floating-point WAV.
# 16000 matches the rate load_audio resamples to.
wavfile.write(audio_output_path, 16000, audio_pred.T.astype(np.float32))



In [33]:
# combine audios with video: copy the video stream of the 30 fps clip and mux it
# with the predicted audio, encoded as AAC
video_output_path = os.path.splitext(os.path.basename(audio_output_path))[0]
video_output_path = os.path.join(SAVE_DIR, video_output_path + ".mp4")
# Arguments are passed as a list (no shell) so paths containing spaces or shell
# metacharacters are handled correctly.
# -y overwrites a stale output instead of prompting, so the cell can be re-run.
cmd = ["ffmpeg", "-y", "-i", video_input_path, "-i", audio_output_path,
       "-c:v", "copy", "-c:a", "aac", "-strict", "experimental",
       "-map", "0:v:0", "-map", "1:a:0", video_output_path]
# check=True raises CalledProcessError if ffmpeg fails; the return code (0) is
# still shown as cell output on success.
subprocess.run(cmd, check=True).returncode

0

In [35]:
# check video output: show the final clip (the 30 fps video muxed with the predicted audio track) inline
Video(video_output_path)