In [11]:
import torch
import torch.nn as nn
# from torch.nn import init
from torch.autograd import Variable
# import torchvision
# import torchvision.transforms as T
import torch.optim as optim
import torchvision
from torchvision import models

# from torch.utils.data import DataLoader
# from torch.utils.data import sampler
# import torchvision.datasets as dset

import numpy as np


Based on the code at https://github.com/InnovArul/first-impressions/blob/master/src/2_model_LSTMSpatial.lua

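Note about the architecture of this network

For GPU efficiency, each batch holds 16 videos x 10 periods (partitions) per video, flattened into a single leading dimension of 160 samples. The audio and video branches process all 160 samples at once; the model then reshapes the features back to (videos, partitions, features) before feeding them to the LSTM.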

In [23]:
class AudioVisualLSTM(nn.Module):
    """Fuse per-partition audio and video features with an LSTM.

    Each video is split into ``NUM_PARTITIONS`` partitions. For every
    partition the audio branch (a linear layer) and the video branch (a
    ResNet-50 with a replaced final layer) each produce a feature vector.
    The two vectors are concatenated, the resulting sequence is run through
    a single-layer LSTM, and every time step is mapped to ``NUM_CLASS``
    sigmoid outputs.

    Inputs are expected "flattened": the first dimension of both inputs is
    ``N * NUM_PARTITIONS`` where ``N`` is the number of videos in the batch.
    """

    # Width of the feature vector the video branch emits per partition.
    NUM_VID_FEATURES = 128
    # Width of the feature vector the audio branch emits per partition.
    NUM_AUDIO_FEATURES = 32
    # Length of each raw audio vector fed to the audio branch.
    NUM_AUDIO_INPUTS = 68
    # Hidden state size of the LSTM.
    NUM_LSTM_HIDDEN = 128
    # Number of partitions (time steps) per video.
    NUM_PARTITIONS = 10
    # Number of sigmoid outputs produced per time step.
    NUM_CLASS = 5

    def __init__(self):
        super(AudioVisualLSTM, self).__init__()
        # Kept inside nn.Sequential so parameter names (audioBranch.0.*) stay stable.
        self.audioBranch = nn.Sequential(
            nn.Linear(self.NUM_AUDIO_INPUTS, self.NUM_AUDIO_FEATURES)
        )
        self.videoBranch = self._createVideoBranch()
        self.lstm = nn.LSTM(
            input_size=(self.NUM_VID_FEATURES + self.NUM_AUDIO_FEATURES),
            hidden_size=self.NUM_LSTM_HIDDEN,
            num_layers=1,
            bias=True,
            batch_first=True  # input and output tensors provided as (batch, seq, feature)
            # can add dropout later
        )
        self.fc = nn.Linear(self.NUM_LSTM_HIDDEN, self.NUM_CLASS)
        self.sigmoid = nn.Sigmoid()

    def _createVideoBranch(self):
        """Build the video feature extractor.

        Loads a pretrained ResNet-50, freezes all of its existing parameters,
        and replaces its final fully connected layer with a new linear layer
        that outputs ``NUM_VID_FEATURES`` values.

        Returns:
            The modified ResNet-50 module.
        """
        model_pretrained = torchvision.models.resnet50(pretrained=True)
        # Freeze every pretrained parameter. The replacement ``fc`` layer is
        # created afterwards, so its parameters remain trainable.
        for param in model_pretrained.parameters():
            param.requires_grad = False
        model_pretrained.fc = nn.Linear(model_pretrained.fc.in_features, self.NUM_VID_FEATURES)
        return model_pretrained

    def forward(self, audioData, videoData):
        """Run the audio-visual model on a flattened batch.

        Args:
            audioData: tensor whose first dimension is ``N * NUM_PARTITIONS``
                and whose last dimension is accepted by the audio branch
                (``NUM_AUDIO_INPUTS`` for the default branch).
            videoData: tensor with the same first dimension as ``audioData``;
                presumably image batches of shape (N * P, 3, H, W) as expected
                by ResNet-50 (TODO confirm against the data loader).

        Returns:
            Tensor of shape ``(N, NUM_PARTITIONS, NUM_CLASS)`` holding sigmoid
            activations for every partition of every video.

        Raises:
            ValueError: if the first dimension of ``audioData`` is not a
                multiple of ``NUM_PARTITIONS`` or differs from that of
                ``videoData``.
        """
        P = self.NUM_PARTITIONS
        total = audioData.size(0)
        if total % P != 0:
            raise ValueError(
                "first dimension of audioData (%d) must be a multiple of NUM_PARTITIONS (%d)"
                % (total, P)
            )
        if videoData.size(0) != total:
            raise ValueError(
                "audioData and videoData must share their first dimension (got %d and %d)"
                % (total, videoData.size(0))
            )
        # Integer division: view() requires integer dimensions.
        N = total // P

        audioProcessed = self.audioBranch(audioData)  # (N * P, NUM_AUDIO_FEATURES)
        videoProcessed = self.videoBranch(videoData)  # (N * P, NUM_VID_FEATURES)

        # Un-flatten to (videos, partitions, features) so the LSTM sees one sequence per video.
        videoFeatures = videoProcessed.view(N, P, self.NUM_VID_FEATURES)
        audioFeatures = audioProcessed.view(N, P, self.NUM_AUDIO_FEATURES)

        x = torch.cat((videoFeatures, audioFeatures), dim=2)

        # No explicit (h0, c0): nn.LSTM defaults to a zero initial state created
        # on the same device and dtype as the input, so this also works on GPU.
        x, _ = self.lstm(x)
        x = self.fc(x)
        x = self.sigmoid(x)

        return x

In [24]:
# Instantiate the model with no arguments; as the cell's last expression, its module structure is displayed below.
AudioVisualLSTM()

AudioVisualLSTM (
  (audioBranch): Sequential (
    (0): Linear (68 -> 32)
  )
  (videoBranch): ResNet (
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
    (relu): ReLU (inplace)
    (maxpool): MaxPool2d (size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
    (layer1): Sequential (
      (0): Bottleneck (
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
        (relu): ReLU (inplace)
        (downsample): Sequential (
          (0): Conv2d(64, 256, kernel_size=(1, 