In [1]:
#!sudo pip install pytube

In [2]:
#from pytube import YouTube
def download(url):
    youtubeObject = YouTube(url)
    youtubeObject = youtubeObject.streams.get_lowest_resolution()
    try:
        youtubeObject.download()
    except:
        print("An error has occurred")
    print("Download is completed successfully")

#url = "https://www.youtube.com/watch?v=FJZ-BHBKyos" # car chase
#url = "https://www.youtube.com/watch?v=1gwglom4FeA" # 8 hr nature
#url = "https://www.youtube.com/watch?v=8W1qF7l2A1c" # 6 hr nature
#download(url)

In [3]:

!pip install scikit-video

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import cv2
import numpy as np
import skvideo.io  
import matplotlib.pyplot as plt

In [5]:
'''
##NOTE: use vreader and then create a downsampled np array
## try 36x64
resized_vid_arr = []
videodata = skvideo.io.vread("car_chase.mp4")  
for v in videodata:
  roi = cv2.resize(v, (36, 64))
  roi = roi.astype("float") / 255.0
  # roi = img_to_array(roi)
  # roi = np.expand_dims(roi, axis=0)
  resized_vid_arr.append(roi) 
resized_vid_arr = np.array(resized_vid_arr)
'''
resized_vid_arr = np.load("resized_vid_arr.npy")

In [6]:
resized_vid_arr.sum()

39977099.83529407

In [7]:
resized_vid_arr.shape

(18547, 64, 36, 3)

In [8]:
from moviepy.video.io.VideoFileClip import AudioFileClip
audioclip = AudioFileClip('car_chase.mp4', fps=9000)
# audioclip = videoclip.audio
audio_array = audioclip.to_soundarray()

In [9]:
video_and_audio_arr = []
aud_per_frame = audio_array.shape[0]// resized_vid_arr.shape[0]
for i in range(len(resized_vid_arr)):
  aud_in = audio_array[i*aud_per_frame : (i+1)*aud_per_frame+1]
  vec_in = [resized_vid_arr[i], aud_in]
  video_and_audio_arr.append(vec_in)

In [10]:
fps = 30
def make_frame(t):
  t = int(t * fps)
  # print(t)
  return video_and_audio_arr[t][0]

from moviepy.video.io.VideoFileClip import VideoClip
myclip = VideoClip(make_frame, duration = 20)

In [11]:
myclip.write_videofile('test.mp4', fps = 30)

Moviepy - Building video test.mp4.
Moviepy - Writing video test.mp4



                                                    

Moviepy - Done !
Moviepy - video ready test.mp4




In [12]:
## concatonate the audio
audio = video_and_audio_arr[0][1]
for i in range(20*40):
  next_aud = video_and_audio_arr[i+1][1]
  audio = np.vstack((audio, next_aud))
print(audio.shape)

(241101, 2)


In [13]:
!pip install pydub

Defaulting to user installation because normal site-packages is not writeable


In [14]:
import pydub 

def write(f, sr, x, normalized=True):
    """numpy array to MP3"""
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:  # normalized array - each item should be a float in [-1, 1)
        y = np.int16(x * 2 ** 15)
    else:
        y = np.int16(x)
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    song.export(f, format="mp3", bitrate="320k")

# sr = (audio_array.shape[0]// resized_vid_arr.shape[0])*fps
#write('test_audio_2.mp3', sr, audio)

*training loop from cs189 hw*

In [15]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
import gc

In [16]:
class VideoAudioDataset(Dataset):
    def __init__(self, video_frames, audio_frames, num_frames):
        self.video_frames = torch.tensor(video_frames, dtype=torch.float32).permute(0,3,2,1) # Permute to (N, C, H, W)
        self.aud_per_frame = audio_frames.shape[0]// (video_frames.shape[0])
        clip_amount = audio_frames.shape[0] % self.aud_per_frame
        self.audio_frames = torch.tensor(audio_frames[:-clip_amount], dtype=torch.float32).reshape(-1, self.aud_per_frame,audio_frames.shape[1]).permute(0,2,1) # Permute to (N, C, A)
        print(self.audio_frames.shape)
        self.num_frames = num_frames

    def __len__(self):
        return len(self.video_frames)

    def __getitem__(self, idx):
        idx += 1
        if idx < self.num_frames:
          num_zeros_needed = self.num_frames - idx
          vid_zeros = torch.zeros(num_zeros_needed, *self.video_frames[0].shape)
          aud_zeros = torch.zeros(num_zeros_needed, *self.audio_frames[0].shape)
          vid = torch.vstack((vid_zeros, self.video_frames[0:idx])).transpose(0,1)
          aud = torch.vstack((aud_zeros, self.audio_frames[0:idx])).transpose(0,1).reshape(2,-1).transpose(0,1)
          #print("aud_shape:", aud.shape, "reshaped:", torch.vstack((aud_zeros, self.audio_frames[0:idx])).transpose(0,1).shape)
          return (vid, aud)
        #vid shape example torch.Size([32, 3, 10, 36, 64])
        # aud shape example torch.Size([32, 2, 14710])
        vid = self.video_frames[idx-self.num_frames:idx].transpose(0,1)
        aud = self.audio_frames[idx-self.num_frames: (idx)].transpose(0,1).reshape(2,-1).transpose(0,1)
        #print("aud_shape:", aud.shape)
        # print('idx = ', idx, ' aud.size = ', aud.shape)
        return (vid, aud)

https://github.com/antecessor/Wavenet << source!

In [17]:
class AudConvEmbedding(nn.Module):
  def __init__(self, in_channels, output_dim):
    super(AudConvEmbedding, self).__init__()
    self.conv_layers = nn.Sequential(
        nn.Conv1d(in_channels = 2, out_channels = 2, kernel_size = 5, stride = 2),
        nn.LayerNorm(7353),
        nn.Conv1d(in_channels = 2, out_channels = 2, kernel_size = 5, stride = 2),
        nn.LayerNorm(3675),
        nn.Conv1d(in_channels = 2, out_channels = 2, kernel_size = 5, stride = 2),
        nn.LayerNorm(1836),
        nn.Conv1d(in_channels = 2, out_channels = 2, kernel_size = 5, stride = 2),
        nn.LayerNorm(916),
        nn.Conv1d(in_channels = 2, out_channels = 2, kernel_size = 5, stride = 2),
        nn.LayerNorm(456),
        nn.Linear(456, output_dim)
    )
  
  def forward(self, x):
    return self.conv_layers(x)

In [18]:
class MultiheadAttention(nn.Module):

    def __init__(self, input_dim, embed_dim, num_heads, dropout):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be 0 modulo number of heads."

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.dropout = nn.Dropout(dropout)

        # Stack all weight matrices 1...h together for efficiency
        # Note that in many implementations you see "bias=False" which is optional
        self.qkv_proj = nn.Linear(input_dim, 3*embed_dim)
        self.o_proj = nn.Linear(embed_dim, input_dim)

        self._reset_parameters()

    def _reset_parameters(self):
        # Original Transformer initialization, see PyTorch documentation
        nn.init.xavier_uniform_(self.qkv_proj.weight)
        self.qkv_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.o_proj.weight)
        self.o_proj.bias.data.fill_(0)

    def scaled_dot_product(self, q, k, v, mask=None):
        d_k = q.size()[-1]
        attn_logits = torch.matmul(q, k.transpose(-2, -1))
        attn_logits = attn_logits / np.sqrt(d_k)
        if mask is not None:
            attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
        attention = F.softmax(attn_logits, dim=-1)
        attention = self.dropout(attention)
        values = torch.matmul(attention, v)
        return values, attention

    def forward(self, x, mask=None, return_attention=False):
        batch_size, seq_length, _ = x.size()
        qkv = self.qkv_proj(x)

        # Separate Q, K, V from linear output
        qkv = qkv.reshape(batch_size, seq_length, self.num_heads, 3*self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3) # [Batch, Head, SeqLen, Dims]
        q, k, v = qkv.chunk(3, dim=-1)
        #print(q.shape, k.shape, v.shape)
        # Determine value outputs
        values, attention = self.scaled_dot_product(q, k, v, mask=mask)
        values = values.permute(0, 2, 1, 3) # [Batch, SeqLen, Head, Dims]
        #print(values.shape)
        values = values.reshape(batch_size, seq_length, self.embed_dim)
        o = self.o_proj(values)

        if return_attention:
            return o, attention
        else:
            return o

In [19]:
class AddPosEncoding(nn.Module):
  def __init__(self, d_model = 256, input_dropout = 0.1, timing_dropout = 0.1, max_len = 512):
    super(AddPosEncoding, self).__init__()
    self.d_model = d_model
    self.input_dropout = input_dropout
    self.timing_dropout = timing_dropout
    self.max_len = max_len

    self.timing_table = nn.Parameter(torch.FloatTensor(max_len, d_model))
    nn.init.normal_(self.timing_table)
    self.input_dropout = nn.Dropout(input_dropout)
    self.timing_dropout = nn.Dropout(self.timing_dropout)

  
  def forward(self,x):
    x = self.input_dropout(x)
    timing = self.timing_table[None, :x.shape[1], :]
    timing = self.timing_dropout(timing)
    return x + timing 



In [20]:
class GLUTanh(nn.Module):
    def __init__(self, input_size, output_size):
        super(GLUTanh, self).__init__()
        self.glu = nn.GLU(dim=-1)
        self.linear = nn.Linear(input_size // 2, output_size)

    def forward(self, x):
        x = self.glu(x)
        x = self.linear(x)
        x = torch.tanh(x)
        return x

In [21]:
class AudTransformer(nn.Module):
  def __init__(self, in_channels, out_channels, hidden_dims = 256, seq_len = 100, dim_ff = 1024, n_layers = 6, n_head = 8, d_qkv = 64, dropout = 0.1):
        super(AudTransformer, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_dims = hidden_dims
        self.dim_ff = dim_ff
        self.n_layers = n_layers
        self.n_head = n_head
        self.d_qkv = d_qkv
        self.dropout = dropout

        self.mha_list = nn.ModuleList()
        self.mha_norms = nn.ModuleList()
        self.pff_list = nn.ModuleList()
        self.pff_norms = nn.ModuleList()

        #self.output_norm = nn.LayerNorm(hidden_dims)
        #self.linear = nn.Linear(self.hidden_dims, 2)
        #self.tanh = nn.Tanh()

        self.add_timing = AddPosEncoding(hidden_dims, max_len = seq_len)

        #self.embedding = AudConvEmbedding(2, self.hidden_dims)
        self.embedding2 = nn.Linear(2, self.hidden_dims)
        #self.output_proj = nn.Conv1d(in_channels = seq_len, out_channels = 1, kernel_size = 1, stride = 1)

        self.output_proj = nn.Sequential(
            nn.LayerNorm(hidden_dims),
            nn.Conv1d(in_channels = seq_len, out_channels = 1, kernel_size = 1, stride = 1),
            nn.Linear(self.hidden_dims, 1024),
            nn.GLU(),
            nn.Linear(512, 4)
        )
        

        for _ in range(n_layers):
          self.mha_list.append(MultiheadAttention(self.hidden_dims, self.d_qkv, self.n_head, self.dropout))
          self.mha_norms.append(nn.LayerNorm(hidden_dims))
          self.pff_list.append(nn.Sequential(
                              nn.Linear(hidden_dims, dim_ff),
                              nn.ReLU(),
                              nn.Linear(dim_ff, hidden_dims),
                              nn.Dropout(self.dropout)
                              ))
          self.pff_norms.append(nn.LayerNorm(hidden_dims))

  def forward(self, x):
    #print("tf:", x.shape)
    #x = self.embedding(x).transpose(2,1)
    x = self.embedding2(x)
    x = x + self.add_timing(x)
    #print("tf emb:", x.shape)
    for i in range(self.n_layers):
      x_norm = self.mha_norms[i](x)
      #print('x_norm.shape', x_norm.shape)
      att_out = self.mha_list[i](x)
      #print(att_out.shape)
      x = x_norm + att_out
      x_norm = self.pff_norms[i](x)
      ff_out = self.pff_list[i](x_norm)
      x = x_norm + ff_out
      #print("tf layer:", i, x.shape)
    
    #x = self.output_norm(x)
    #print("tf:", x.shape)
    #x = self.linear(x)
    x = self.output_proj(x)
    #print("tf:", x.shape)
    #print("tf:", x.shape)
    return x


In [22]:
class VidToAudFusion(nn.Module):
    def __init__(self, audio_dim, video_dims, audio_channels, vid_channels, dropout_prob=0.1, num_frames=10):
        super(VidToAudFusion, self).__init__()
        self.dim_proj1 = nn.Linear(video_dims[0] * video_dims[1], 200)
        self.dim_proj2 = nn.Linear(200 * num_frames, (audio_dim) - 1)
        self.audio_dim = audio_dim
        self.video_dims = video_dims
        self.vid_channels = vid_channels

        if audio_channels == vid_channels:
            self.channel_projection = nn.Identity()
        else:
            self.channel_projection = nn.Sequential(
                nn.Conv2d(vid_channels, audio_channels, kernel_size=1, stride=1, bias=True),
            )
        self.channel_projection2 = nn.Sequential(
                nn.Conv1d(num_frames, 2, kernel_size=1, stride=1, bias=True),
            )
            
    def forward(self, x):
      #x = x.reshape(x.shape[0], x.shape[1], x.shape[2], -1)
      #print("vid to aud:", x.shape)
      audio_proj = torch.flatten(x, start_dim=3)
      #print("vid to aud:", x.shape)
      #print(audio_proj.shape)  
      audio_proj = self.dim_proj1(audio_proj)
      #print("vid to aud:", audio_proj.shape)
      #print(audio_proj.shape)
      audio_proj = self.channel_projection(audio_proj).squeeze(2)
      #print("vid to aud:", audio_proj.shape)
      #audio_proj = self.channel_projection2(audio_proj)
      audio_proj = audio_proj.reshape(audio_proj.shape[0], -1)
      #print("vid to aud:", audio_proj.shape)
      audio_proj = self.dim_proj2(audio_proj)
      #print("vid to aud:", audio_proj.shape)
      #print(audio_proj.shape)
      return audio_proj

In [23]:
class ConvResBlock3D(nn.Module):
    def __init__(self, in_channels, out_channels, dropout_prob=0.1, stride=1, kernel_size=3, padding=1):
        super(ConvResBlock3D, self).__init__()
        self.conv1 = nn.Conv3d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=True)
        self.norm1 = nn.BatchNorm3d(out_channels)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_prob)
        self.conv2 = nn.Conv3d(out_channels, out_channels, kernel_size=kernel_size, stride=1, padding=padding, bias=True)
        self.norm2 = nn.BatchNorm3d(out_channels)
        self.stride = stride
        
        if in_channels == out_channels:
            self.skip_connection = nn.Identity()
        else:
            self.skip_connection = nn.Sequential(
                nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=stride, bias=True),
                nn.BatchNorm3d(out_channels)
            )
            
    def forward(self, x):
        residual = self.skip_connection(x)
        
        out = self.conv1(x)
        out = self.norm1(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.conv2(out)
        out = self.norm2(out)
        out = self.dropout(out)
        
        out = self.relu(out)
        out = out + residual

        return out

class ConvResBlock2D(nn.Module):
    def __init__(self, in_channels, out_channels, dropout_prob=0.1, stride=1, kernel_size=3, padding=1):
        super(ConvResBlock2D, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=True)
        self.norm1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_prob)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=kernel_size, stride=1, padding=padding, bias=True)
        self.norm2 = nn.BatchNorm2d(out_channels)
        self.stride = stride
        
        if in_channels == out_channels:
            self.skip_connection = nn.Identity()
        else:
            self.skip_connection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=True),
                nn.BatchNorm2d(out_channels)
            )
            
    def forward(self, x):
        residual = self.skip_connection(x)
        
        out = self.conv1(x)
        out = self.norm1(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.conv2(out)
        out = self.norm2(out)
        out = self.dropout(out)
        
        out = self.relu(out)
        out = out + residual

        return out

In [24]:
### combines video into audio input vector and then passes through wavenet

class AttAudVideoNet(nn.Module):
    def __init__(self,audio_input_shape, video_input_shape,in_channels=2,out_channels=2, seq_len=1470, num_frames=5): ## NEED TO DEBUG THESE hyperperams 4.29
        super().__init__()
        self.transformer=AudTransformer(in_channels, out_channels, hidden_dims = 256, seq_len = seq_len, dim_ff = 2048, n_layers = 8, n_head = 6, d_qkv = 60, dropout = 0.1) 
        self.activation = GLUTanh(4,2)
        self.vid_convs = nn.ModuleList([
            ConvResBlock3D(3, 128),
            ConvResBlock3D(128, 256),
            ConvResBlock3D(256, 64),
            ConvResBlock3D(64, 8)
        ])
        self.audio_input_shape = audio_input_shape
        self.video_input_shape = video_input_shape
        vid_dims_list = self.get_video_dims(video_input_shape)
        self.vid_to_aud = VidToAudFusion(self.audio_input_shape[-1], vid_dims_list[2], 1, 8, num_frames=num_frames)
        
        #self.output_lin = nn.Linear(256*in_channels, out_channels)

    def get_video_dims(self, video_input_shape):
        shape_list = []
        test_data = torch.ones(video_input_shape)
        out = self.vid_convs[0](test_data)
        shape_list.append((out.shape[-2],out.shape[-1]))
        for layer in self.vid_convs[1:]:
            out = layer(out)
            shape_list.append((out.shape[-2],out.shape[-1]))
        return shape_list

    def forward(self,vid, aud):
        #print(vid.shape, aud.shape)
        for i in range(len(self.vid_convs)):
          vid = self.vid_convs[i](vid)
        #print(vid.shape)
        vid_to_aud = self.vid_to_aud(vid)
        #print(vid_to_aud.shape, aud.shape)
        #print(vid_to_aud.shape, aud.shape)
        aud = aud + vid_to_aud.unsqueeze(2)
        aud=self.transformer(aud)
        #print(aud.shape)
        #aud=aud.reshape(aud.shape[0], -1)
        #aud = self.output_lin(aud)
        #print('output shape = ', aud.shape)
        return self.activation(aud)



In [25]:
print(resized_vid_arr.shape, resized_vid_arr.size)
print(audio_array.size)

(18547, 64, 36, 3) 128196864
11139480


In [26]:
num_frames = 5
train_data = VideoAudioDataset(resized_vid_arr[:int(len(resized_vid_arr)*0.8)], audio_array[:int(len(audio_array)*0.8)], num_frames = num_frames)
valid_data = VideoAudioDataset(resized_vid_arr[int(len(resized_vid_arr)*0.8):], audio_array[int(len(audio_array)*0.8):], num_frames = num_frames)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=32, shuffle=True)

torch.Size([14852, 2, 300])
torch.Size([3713, 2, 300])


In [27]:
for x in train_loader:
  print(x[0].shape)
  print(x[1].shape)
  break
  
count = 0
# for x in valid_loader:
#   count += 1
#   if count == 10:
#     print(x[0].shape)
#     print(x[1].shape)
#     break

torch.Size([16, 3, 5, 36, 64])
torch.Size([16, 1500, 2])


In [28]:
from tqdm.notebook import tqdm, trange

# wavenet = WaveNet(in_channels=2,out_channels=2,kernel_size=2,stack_size=23,layer_size=6).cuda().train()
#audio_test_data = torch.ones((1,2,1471))
seq_len = 300 * num_frames
audio_test_data = torch.ones((1,2,seq_len))
vid_test_data = torch.ones((1,3,num_frames,36,64))

wavenet = AttAudVideoNet(audio_input_shape=audio_test_data.shape, video_input_shape=vid_test_data.shape,in_channels=2,out_channels=2, seq_len=seq_len-1, num_frames=num_frames).cuda().train()
#load_path = 'transformer_vid_model3_5.7.pt'
#wavenet.load_state_dict(torch.load(load_path))

lr = 4e-5
epochs= 50
globalStep=1000

optimizer=torch.optim.AdamW(wavenet.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
  optimizer,
  lr,
  epochs=epochs,
  steps_per_epoch=len(train_loader),
  pct_start=0.03,  # Warm up for 3% of the total training time
  )
lossFunction = torch.nn.CrossEntropyLoss()

def calc_accuracy(Out,Y):
    max_vals, max_indices = torch.max(Out,1)
    train_acc = (max_indices == Y).sum().item()/max_indices.size()[0]
    return train_acc
  


for epoch in range(epochs):
    for step, (vid_frames,aud_frames) in tqdm(enumerate(train_loader),desc="Training"):
         #vid_frames = vid_frames.cuda()
         target = aud_frames[:,-1,].cuda()
         aud_frames = aud_frames[:,:-1,:].cuda()
         vid_frames = vid_frames.cuda()
         #print(target.shape)
         #print(aud_frames.shape)
         output = wavenet(vid_frames, aud_frames).squeeze()
         #print('ttt', output.shape, target.shape)
         #print(output[0].detach().cpu().numpy(), target[0].cpu().numpy())
         #print(output.shape)
         #print(output)
         #print(output.dtype, target.dtype)

         loss = lossFunction(output,target)
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()
         scheduler.step()
         if step%globalStep==0:
            # scheduler.step()
            # print(output.detach().numpy())
            # print(y_train.numpy())
            with torch.no_grad():
                accuracy=0
                val_loss=0
                for stepTest, (vid_frames,aud_frames) in tqdm(enumerate(valid_loader),desc="Validation"):
                    vid_frames = vid_frames.cuda()
                    target = aud_frames[:,-1,].cuda()
                    aud_frames = aud_frames[:,:-1,:].cuda()
                    output = wavenet(vid_frames, aud_frames).squeeze()
                    if stepTest==0:
                        print(target[:3])
                        print(output[:3])
                        #print(lossFunction(output[0],target[0]).item())
                    #accuracy+=calc_accuracy(output,target)*100
                    val_loss+= lossFunction(output,target).item()
                    if stepTest>200:
                        print(output)
                        break
            print(f"loss for step {step} : {val_loss/stepTest}")

         
    print(f"epoch {epoch}")

    save_path = 'transformer_vid_model3_5.10.pt'
    torch.save(wavenet.state_dict(), save_path)

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[ 0.0836, -0.2140],
        [-0.3243, -0.8008],
        [-0.2607, -0.6306]], device='cuda:0')
tensor([[-0.0332,  0.5647],
        [-0.0334,  0.5652],
        [-0.0328,  0.5682]], device='cuda:0')
loss for step 0 : 0.00533019143116215
epoch 0


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[ 0.0416, -0.2072],
        [ 0.3305,  0.5553],
        [ 0.3204,  0.0612]], device='cuda:0')
tensor([[-0.0362,  0.5600],
        [-0.0090,  0.6139],
        [-0.0077,  0.6150]], device='cuda:0')
loss for step 0 : 0.005243558922539587
epoch 1


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[ 0.2596,  0.5251],
        [ 0.3622,  0.1263],
        [-0.2284, -0.4983]], device='cuda:0')
tensor([[-0.0172,  0.5921],
        [-0.0235,  0.5819],
        [-0.0282,  0.5754]], device='cuda:0')
loss for step 0 : 0.005344857364569021
epoch 2


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.4168, -0.3827],
        [-0.4896, -0.7022],
        [ 0.2686,  0.2097]], device='cuda:0')
tensor([[0.0367, 0.6463],
        [0.0298, 0.6442],
        [0.0306, 0.6440]], device='cuda:0')
loss for step 0 : 0.005428738875881485
epoch 3


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.0954, -0.0420],
        [-0.0052, -0.0504],
        [ 0.3346,  0.2849]], device='cuda:0')
tensor([[0.0171, 0.6193],
        [0.0202, 0.6224],
        [0.0204, 0.6224]], device='cuda:0')
loss for step 0 : 0.005378820726890927
epoch 4


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.2821, -0.3802],
        [ 0.1873,  0.1178],
        [ 0.0103,  0.4026]], device='cuda:0')
tensor([[0.0068, 0.6077],
        [0.0035, 0.6044],
        [0.0059, 0.6070]], device='cuda:0')
loss for step 0 : 0.005369510679789211
epoch 5


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.0668,  0.0955],
        [-0.0309,  0.0524],
        [-0.3716, -0.2607]], device='cuda:0')
tensor([[-9.7192e-04,  5.9859e-01],
        [ 1.3735e-04,  5.9972e-01],
        [ 5.5144e-04,  6.0013e-01]], device='cuda:0')
loss for step 0 : 0.005359386516046589
epoch 6


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.3311,  0.0676],
        [-0.3234, -0.4106],
        [-0.1933, -0.1794]], device='cuda:0')
tensor([[-0.0056,  0.5940],
        [-0.0052,  0.5944],
        [-0.0045,  0.5952]], device='cuda:0')
loss for step 0 : 0.005305743804606407
epoch 7


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[ 0.0262, -0.2697],
        [-0.1903,  0.0046],
        [ 0.1716,  0.3402]], device='cuda:0')
tensor([[-1.3402e-03,  5.9857e-01],
        [-1.9684e-03,  5.9795e-01],
        [-5.7463e-04,  5.9935e-01]], device='cuda:0')
loss for step 0 : 0.005325451059995786
epoch 8


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.1607,  0.0799],
        [ 0.1143,  0.1971],
        [ 0.1465,  0.2224]], device='cuda:0')
tensor([[-0.0196,  0.5804],
        [-0.0195,  0.5805],
        [-0.0193,  0.5806]], device='cuda:0')
loss for step 0 : 0.005393363470616548
epoch 9


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.0156,  0.1667],
        [-0.1223, -0.1744],
        [ 0.0780,  0.0391]], device='cuda:0')
tensor([[-0.0329,  0.5667],
        [-0.0328,  0.5667],
        [-0.0328,  0.5668]], device='cuda:0')
loss for step 0 : 0.005383393990442804
epoch 10


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.6973, -0.2820],
        [ 0.3960,  0.5219],
        [ 0.2460,  0.2317]], device='cuda:0')
tensor([[-0.0336,  0.5660],
        [-0.0335,  0.5660],
        [-0.0337,  0.5659]], device='cuda:0')
loss for step 0 : 0.005412422987106054
epoch 11


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[ 0.4523,  0.5114],
        [ 0.0421, -0.1011],
        [-0.5169,  0.2387]], device='cuda:0')
tensor([[-0.0352,  0.5648],
        [-0.0352,  0.5647],
        [-0.0352,  0.5648]], device='cuda:0')
loss for step 0 : 0.005413105650602475
epoch 12


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[ 0.0791, -0.4767],
        [ 0.7174,  0.4148],
        [-0.1507,  0.1315]], device='cuda:0')
tensor([[-0.0355,  0.5646],
        [-0.0355,  0.5646],
        [-0.0355,  0.5646]], device='cuda:0')
loss for step 0 : 0.005374142151002003
epoch 13


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.0723, -0.0804],
        [-0.3672, -0.2056],
        [ 0.2580, -0.1913]], device='cuda:0')
tensor([[-0.0358,  0.5645],
        [-0.0358,  0.5645],
        [-0.0358,  0.5645]], device='cuda:0')
loss for step 0 : 0.0053249548148849735
epoch 14


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[-0.4247,  0.2318],
        [ 0.0030,  0.2096],
        [ 0.3585,  0.1858]], device='cuda:0')
tensor([[-0.0361,  0.5645],
        [-0.0361,  0.5645],
        [-0.0361,  0.5645]], device='cuda:0')
loss for step 0 : 0.005392076431409172
epoch 15


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor([[ 0.1944,  0.2551],
        [-0.0170, -0.0097],
        [-0.0659, -0.2724]], device='cuda:0')
tensor([[-0.0364,  0.5645],
        [-0.0364,  0.5645],
        [-0.0364,  0.5645]], device='cuda:0')
loss for step 0 : 0.005295283080119154


KeyboardInterrupt: 

In [None]:
#save_path = 'transformer_vid_model2_5.7.pt'
#torch.save(wavenet.state_dict(), save_path)
load_path = 'transformer_vid_model3_5.10.pt'
wavenet.load_state_dict(torch.load(load_path))

In [None]:
class VideoOnlyDataset(Dataset):
    def __init__(self, video_frames, num_frames):
        self.video_frames = torch.tensor(video_frames, dtype=torch.float32).permute(0,3,2,1) # Permute to (N, C, H, W)
        self.num_frames = num_frames

    def __len__(self):
        return len(self.video_frames)

    def __getitem__(self, idx):
        idx += 1
        if idx < self.num_frames:
          num_zeros_needed = self.num_frames - idx
          vid_zeros = torch.zeros(num_zeros_needed, *self.video_frames[0].shape)
          vid = torch.vstack((vid_zeros, self.video_frames[0:idx])).transpose(0,1)
          return vid
        #vid shape example torch.Size([32, 3, 10, 36, 64])
        # aud shape example torch.Size([32, 2, 14710])
        vid = self.video_frames[idx-self.num_frames:idx].transpose(0,1)
        # print('idx = ', idx, ' aud.size = ', aud.shape)
        return vid

In [None]:
### generate audio for video from model
num_frames = 1

start_index = int(len(resized_vid_arr)*0.8) + num_frames
audio_list = []
vid_test_shape = (1,3,10,36,64)
zero_frame = torch.zeros((vid_test_shape[3], vid_test_shape[4]))

aud_per_vid_frame = 1500
audio_start_index = int(len(audio_array)*0.8)
print(audio_start_index)
aud_input_arr = torch.tensor(audio_array[audio_start_index:audio_start_index+(num_frames * aud_per_vid_frame) - 1], dtype=torch.float32).cuda().unsqueeze(0)
print(aud_input_arr.shape)

#aud_input_arr = torch.zeros((1,2,(num_frames * aud_per_vid_frame) - 1)).cuda()
print(aud_input_arr.shape)

input_vid = resized_vid_arr[start_index:]
gen_data = VideoOnlyDataset(input_vid, num_frames = num_frames)
gen_loader = DataLoader(gen_data, batch_size=1, shuffle=False)


#NOTE: GEN NEEDS TO BE FIXED TO ACCOUNT FOR MULTIPLE AUDIO RUNS PER FRAME
wavenet.eval()
for i,vid in enumerate(gen_loader):
    vid = vid.cuda()
    print(vid.shape)
    for j in trange(aud_per_vid_frame):
        with torch.no_grad():
            #print(aud_input_arr.shape)
            audio_output = wavenet(vid.cuda(), aud_input_arr.cuda()).cpu()
            #print(audio_output, audio_output.shape)
            audio_list.append(audio_output.squeeze().cpu().numpy())
            #print(aud_input_arr)
            aud_input_arr = aud_input_arr[:, 1:, :]
            #print(aud_input_arr.shape, audio_output.shape)
            #print(aud_input_arr)
            
            aud_input_arr = torch.cat((aud_input_arr, audio_output.cuda()), 1)
            #print(aud_input_arr)
    if i > 15:
        break
np.array(audio_list).shape

In [None]:
print(len(audio_list))

In [None]:
print(np.array(audio_list).shape)
audio_np = np.array(audio_list).transpose(1,0)
audio_np.shape

In [None]:
'''
audio_cpu = audio_output.cpu()
audio_np = audio_cpu.detach().numpy()
audio_np.shape
'''

In [None]:
path = 'test_transformer_model3_out_5.10_1.raw'
np.save(path, audio_np)

In [None]:
#print(sr)

In [None]:
fps = 30
sr = (audio_array.shape[0]// resized_vid_arr.shape[0])*fps
write('test_transformer_model3_out_5.10_1.mp3', sr, audio_np)