In [1]:
# --- Experiment configuration -------------------------------------------------
# All dataset locations hang off a single root directory.
data_root = '/home/noa_glaser/dataBig'

# Extracted video frames, one sub-directory per split.
train_dataset_dir = data_root + '/train-frames/train'
val_dataset_dir = data_root + '/train-frames/val'
# Per-video audio feature files.
audio_dataset_dir = data_root + '/train-audio'
# Directory that holds the pickled annotation file (trailing slash kept on purpose).
label_dataset_dir = data_root + '/'

# Tag used to name checkpoint directories and log files.
exp_name = 'avepool_dropout_L2loss_ResNet34_LSTM_experiment' # roughly 3K videos (author's note)

num_classes = 5     # number of regression targets per video
num_partition = 10  # number of frames sampled per video
batch_size = 8      # videos per mini-batch

In [2]:
# -*- coding: utf-8 -*- 

from __future__ import print_function, division

import pickle
import sys

import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import torch.utils.data as data
import matplotlib.pyplot as plt
%matplotlib inline


from PIL import Image
import time
import copy
import shutil

## first, get the Images (path) and their labels (five personality traits)

In [3]:
def _can_open(path):
    """Return True if `path` can be opened for reading; the handle is closed right away."""
    try:
        with open(path, 'rb'):
            return True
    except (IOError, OSError):
        return False


def get_img_audio_label(dataset_dir,audio_dataset_dir,label_dataset_dir):
    """Collect frame paths, audio-feature paths and trait labels for every video.

    Each sub-directory of `dataset_dir` whose name contains '_50uniform' is
    treated as one video. The first `num_partition` entries (module-level
    constant) of its sorted listing are used as that video's frames. A video is
    kept only if all of those frames and its audio feature file can be opened.

    Args:
        dataset_dir: directory holding one frame folder per video.
        audio_dataset_dir: directory holding '<movie>.mp4.wav.csv' feature files.
        label_dataset_dir: directory holding 'annotation_training.pkl'.

    Returns:
        Tuple of four numpy arrays, aligned by video:
        img_paths (n_videos x num_partition frame paths), audio_paths,
        labels (extraversion, neuroticism, agreeableness, conscientiousness,
        openness) and raw_movienames.

    Raises:
        KeyError: if a video has no entry in the annotation file.
        AssertionError: if a video folder holds fewer than `num_partition` entries.
    """
    print("processing dataset: "+ dataset_dir)
    file_ending = '_50uniform' #TODO: figure out how to make more general
    traits = ['extraversion', 'neuroticism', 'agreeableness',
              'conscientiousness', 'openness']  # 'interview' is intentionally left out

    annotation_filename = os.path.join(label_dataset_dir, "annotation_training.pkl")
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted annotation files.
    with open(annotation_filename, 'rb') as f:
        label_dicts = pickle.load(f, encoding='latin1')

    img_paths = []
    audio_paths = []
    raw_movienames = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the dataset
    # (and the frames chosen per video) reproducible between runs.
    for movie in sorted(os.listdir(dataset_dir)):
        if file_ending not in movie:
            continue  # skip non-movie files
        raw_moviename = movie.replace(file_ending, '.mp4')
        big_five = [label_dicts[trait][raw_moviename] for trait in traits]

        movie_path = os.path.join(dataset_dir, movie)
        # Lexicographic order; this equals temporal order only if frame names
        # are zero-padded -- TODO confirm against the frame extraction step.
        all_imgs = sorted(os.listdir(movie_path))
        assert len(all_imgs) >= num_partition
        mv_partitions = [os.path.join(movie_path, name)
                         for name in all_imgs[:num_partition]]

        opened = True
        for path in mv_partitions:
            if not _can_open(path):
                print('image failed to open', path)
                opened = False

        audiofeat_path = os.path.join(audio_dataset_dir, raw_moviename + '.wav.csv')
        if not _can_open(audiofeat_path):
            # Report the audio path itself (the old message printed the last image path).
            print('audio failed to open', audiofeat_path)
            opened = False

        if opened:
            img_paths.append(mv_partitions)
            audio_paths.append(audiofeat_path)
            raw_movienames.append(raw_moviename)
            labels.append(big_five)

    return np.array(img_paths),np.array(audio_paths),\
                np.array(labels), np.array(raw_movienames)



## Use this if we have separate train/val datasets

In [4]:
# Build aligned (frame paths, audio paths, labels, movie names) arrays per split.
(train_img_paths, train_audio_paths,
 train_labels, train_movienames) = get_img_audio_label(
    train_dataset_dir, audio_dataset_dir, label_dataset_dir)

(val_img_paths, val_audio_paths,
 val_labels, val_movienames) = get_img_audio_label(
    val_dataset_dir, audio_dataset_dir, label_dataset_dir)

processing dataset: /home/noa_glaser/dataBig/train-frames/train
processing dataset: /home/noa_glaser/dataBig/train-frames/val


## Data Loader. Data is normalized before feeding into model (as required by the pretrained model)

In [5]:
def default_img_loader(img_paths,transform):
    """Load every frame in `img_paths` as RGB and concatenate them along dim 0.

    Args:
        img_paths: iterable of image file paths for one video.
        transform: callable applied to each RGB image, or None to skip it.

    Returns:
        The result of `torch.cat` over the per-frame results, in input order.
    """
    def _load_frame(frame_path):
        frame = Image.open(frame_path).convert('RGB')
        return frame if transform is None else transform(frame)

    return torch.cat([_load_frame(frame_path) for frame_path in img_paths])
        

def default_audio_loader(path):
    """Read the comma-separated numeric file at `path` into a NumPy array."""
    audio_features = np.loadtxt(path, delimiter=',')
    return audio_features

class VisualAudio(data.Dataset):
    """Dataset yielding (stacked frames, audio features, label) for one video.

    The constructor only stores its arguments; files are read lazily in
    `__getitem__` through the configurable `img_loader` / `audio_loader`.
    """

    def __init__(self,split,img_paths,audio_paths, movie_names,labels,transform=None,
                 img_loader=default_img_loader,audio_loader=default_audio_loader):
        # Split tag and the per-video, index-aligned collections.
        self.split = split
        self.img_paths, self.audio_paths = img_paths, audio_paths
        self.movie_names, self.labels = movie_names, labels
        # Per-frame transform and the two loader callables.
        self.transform = transform
        self.img_loader, self.audio_loader = img_loader, audio_loader

    def __getitem__(self, index):
        """Return (frames, first 10 rows of the audio features, label) for `index`."""
        frame_paths = self.img_paths[index]
        audio_path = self.audio_paths[index]
        target = self.labels[index]

        frames = self.img_loader(frame_paths, self.transform)
        audio_feats = self.audio_loader(audio_path)

        # The stacked frames must be exactly 30 x 256 x 256.
        assert(frames.size() == (30,256,256))

        return frames, audio_feats[:10,:], target

    def __len__(self):
        """Number of videos, taken from the frame-path collection."""
        return len(self.img_paths)
    

In [6]:
from torchvision import  transforms


def _make_transform(crop):
    """Scale to 256, apply `crop`, convert to tensor and normalise per channel."""
    return transforms.Compose([
        transforms.Scale(256),
        crop,
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])


# Training uses a random 256 crop (horizontal flipping stays disabled);
# validation uses a deterministic centre crop.
data_transforms = {
    'train': _make_transform(transforms.RandomCrop(256)),
    'val': _make_transform(transforms.CenterCrop(256)),
}

# One dataset per split, each wired to its matching transform.
dsets = {
    'train': VisualAudio('train', train_img_paths, train_audio_paths,
                         train_movienames, train_labels,
                         transform=data_transforms['train']),
    'val': VisualAudio('val', val_img_paths, val_audio_paths,
                       val_movienames, val_labels,
                       transform=data_transforms['val']),
}

# Both loaders shuffle and use a single worker process.
dset_loaders = {
    split: torch.utils.data.DataLoader(dsets[split], batch_size=batch_size,
                                       shuffle=True, num_workers=1)
    for split in ['train', 'val']
}
dset_sizes = {split: len(dsets[split]) for split in ['train', 'val']}
dset_sizes

{'train': 4800, 'val': 1200}

##  Some dataset examples (each batch holds `batch_size` = 8 videos)

In [7]:
def imshow(inp, title=None):
    """Display a normalised (3, H, W) image tensor with matplotlib.

    The tensor is moved to (H, W, 3) layout and the per-channel normalisation
    (mean [0.485, 0.456, 0.406], std [0.229, 0.224, 0.225]) is undone.

    Args:
        inp: tensor exposing `.numpy()`, laid out channel-first.
        title: optional title drawn above the image.
    """
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    # De-normalised values can fall slightly outside [0, 1]; matplotlib expects
    # float RGB data inside that range, so clip explicitly.
    inp = np.clip(inp, 0, 1)

    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated


#train_imgsamples,train_audiosamle,train_labelsample = next(iter(dset_loaders['train']))

#train_unflattened_sample = train_imgsamples.view(-1,3,256,256)
# Make a grid from batch
#plt.figure( figsize=(10, 16))
#out = torchvision.utils.make_grid(train_unflattened_sample,nrow=10)
#imshow(out, title='trainning sample')
#plt.savefig('train_exp.png', bbox_inches='tight')

In [8]:
#val_imgsamples,val_audiosamle,val_labelsample = next(iter(dset_loaders['val']))
#val_unflattened_sample = val_imgsamples.view(-1,3,256,256)
### Make a grid from val batch
#plt.figure( figsize=(10, 16))
#out2 = torchvision.utils.make_grid(val_unflattened_sample,nrow=10)
#imshow(out2, title='validation sample')
#plt.savefig('val_exp.png', bbox_inches='tight')

#  MODEL

In [9]:
class AudioVisualLSTM(nn.Module):
    """Two-branch (video + audio) model with an LSTM over the frame sequence.

    Each frame goes through a ResNet-34 whose final layer is replaced by a new
    linear layer; each audio row goes through a single linear layer. The two
    feature vectors are concatenated per time step, fed to an LSTM, mapped to
    NUM_CLASS sigmoid outputs per step and averaged over the steps.
    """
    NUM_AUDIO_INPUT = 68      # audio features per time step
    NUM_VID_FEATURES = 128    # output width of the video branch
    NUM_AUDIO_FEATURES = 32   # output width of the audio branch
    NUM_LSTM_HIDDEN = 128     # LSTM hidden size
    NUM_PARTITIONS = 10       # time steps (frames) per video
    NUM_CLASS = 5             # outputs per video
    NUM_IMG_SIZE = 256        # frame height and width
    NUM_CHANNEL = 3           # channels per frame

    def __init__(self):
        super(AudioVisualLSTM, self).__init__()
        self.audioBranch =  nn.Sequential(nn.Linear(self.NUM_AUDIO_INPUT,self.NUM_AUDIO_FEATURES))
        self.videoBranch = self._createVideoBranch()
        self.video_dropout = nn.Dropout(p=0.2)

        self.lstm = nn.LSTM(
            input_size=(self.NUM_VID_FEATURES+self.NUM_AUDIO_FEATURES),
            hidden_size=self.NUM_LSTM_HIDDEN,
            num_layers=1,
            bias=True,
            batch_first=True # input and output tensors provided as (batch, seq, feature)
            # can add dropout later
            )
        self.fc = nn.Linear(self.NUM_LSTM_HIDDEN,self.NUM_CLASS)
        self.sigmoid = nn.Sigmoid()
        # Averages the per-step outputs over all NUM_PARTITIONS steps.
        self.avg = nn.AvgPool1d(self.NUM_PARTITIONS,self.NUM_PARTITIONS)

    def _createVideoBranch(self):
        """Return a pretrained ResNet-34 with frozen weights and a new trainable fc layer."""
        model_pretrained = torchvision.models.resnet34(pretrained=True)
        # Freeze every pretrained parameter; the fc layer created below is new,
        # so its parameters remain trainable.
        for param in model_pretrained.parameters():
            param.requires_grad = False
        model_pretrained.fc = nn.Linear(model_pretrained.fc.in_features, self.NUM_VID_FEATURES)
        return model_pretrained

    def forward(self, x):
        """Compute per-video outputs.

        Args:
            x: pair (video, audio). `video` must be viewable as
               (-1, NUM_CHANNEL, NUM_IMG_SIZE, NUM_IMG_SIZE) and `audio` as
               (-1, NUM_AUDIO_INPUT), with NUM_PARTITIONS steps per video.

        Returns:
            Tensor of shape (N, NUM_CLASS) with values in (0, 1).

        Note: relies on the module-level `gpu_dtype` for tensor type casts.
        """
        videoData = x[0].view(-1,self.NUM_CHANNEL,self.NUM_IMG_SIZE,self.NUM_IMG_SIZE)
        audioData = x[1].view(-1,self.NUM_AUDIO_INPUT)
        # video branch: (N*P, 128) regrouped to (N, P, 128)
        videoProcessed = self.videoBranch(videoData).view(-1,self.NUM_PARTITIONS,self.NUM_VID_FEATURES)
        videoProcessed = self.video_dropout(videoProcessed)
        # audio branch: (N*P, 32) regrouped to (N, P, 32)
        audioProcessed = self.audioBranch(audioData).view(-1,self.NUM_PARTITIONS,self.NUM_AUDIO_FEATURES)
        x = torch.cat((videoProcessed, audioProcessed), 2).type(gpu_dtype) #(N,P,160)
        h0 = Variable(torch.zeros(1, x.size()[0], self.NUM_LSTM_HIDDEN)).type(gpu_dtype)
        c0 = Variable(torch.zeros(1, x.size()[0], self.NUM_LSTM_HIDDEN)).type(gpu_dtype)
        x, _ = self.lstm(x, (h0, c0)) #(N,P,128)
        x = x.contiguous().view(-1,self.NUM_LSTM_HIDDEN)
        x = self.fc(x)
        x = self.sigmoid(x) #(N*P,5)
        x = self.avg(x.view(-1,self.NUM_PARTITIONS,self.NUM_CLASS).transpose(1,2)) #(N,5,1)
        # Squeeze only the pooled axis: a bare squeeze() would also drop the
        # batch axis when N == 1 and return shape (5,) instead of (1,5).
        return x.squeeze(2) #(N,5)

### some helper functions

In [10]:
# Device configuration: use the CUDA float tensor type when a GPU is available
# and fall back to the CPU float tensor type otherwise, so the notebook can
# also run (slowly) on a CPU-only machine. The name `gpu_dtype` is kept
# because the rest of the notebook refers to it.
use_gpu = torch.cuda.is_available()
gpu_dtype = torch.cuda.FloatTensor if use_gpu else torch.FloatTensor

def get_learnable_params(m,verbose = 0):
    """Return the parameters of `m` whose `requires_grad` flag is set.

    Args:
        m: object exposing `.parameters()`.
        verbose: 1 prints each kept parameter's size, 2 prints the parameter
            itself; any other value prints nothing.

    Returns:
        List of trainable parameters in iteration order.
    """
    trainable = []
    for param in m.parameters():
        if not param.requires_grad:
            continue
        trainable.append(param)
        if verbose == 1:
            print (param.size())
        if verbose == 2:
            print (param)
    return trainable

class AverageMeter(object):
    """Keeps the latest value together with a weighted running sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the mean, the sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value, weighted by `n`, and refresh the mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Write `state` under resnet_for_rnn/<exp_name>/ and optionally mark it as best.

    Args:
        state: object passed to `torch.save`.
        is_best: when truthy, the saved file is also copied to 'model_best.pth.tar'
            in the same directory.
        filename: file name of the checkpoint inside the experiment directory.
    """
    directory = "resnet_for_rnn/%s/"%(exp_name)
    if not os.path.exists(directory):
        os.makedirs(directory)

    checkpoint_path = directory + filename
    torch.save(state, checkpoint_path)

    if not is_best:
        return
    shutil.copyfile(checkpoint_path, directory + 'model_best.pth.tar')

def log_value(to_log, log_path = './log_'+ exp_name + '.txt'):
    """Append the string `to_log` to the text file at `log_path`.

    The default path is built from `exp_name` once, when the function is defined.
    """
    # The context manager closes the file even if write() raises.
    with open(log_path, 'a+') as log_file:
        log_file.write(to_log)
    

## Train The Model

## train and eval functions

In [11]:
log_freq = 10  # log every `log_freq`-th batch

def train(train_loader, model, criterion, optimizer, epoch) :
    """Run one training epoch.

    Args:
        train_loader: iterable yielding (images, audio, target) batches.
        model: called with `[images, audio]`.
        criterion: loss callable taking (output, target).
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in log messages.

    Returns:
        The sample-weighted average loss over the epoch.

    NOTE(review): `loss.data[0]` assumes a PyTorch version where the loss is an
    indexable one-element tensor -- confirm against the installed version.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()

    model.train()  # switch to train mode

    tick = time.time()
    for step, batch in enumerate(train_loader):
        frames, audio, target = batch
        frames_var = Variable(frames.type(gpu_dtype))
        audio_var = Variable(audio.type(gpu_dtype))
        target_var = Variable(target.type(gpu_dtype))

        # forward pass and loss
        output = model([frames_var, audio_var])
        loss = criterion(output, target_var)
        losses.update(loss.data[0], frames.size(0))

        # backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # time spent on this batch
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % log_freq != 0:
            continue
        to_log = 'Epoch: [{0}][{1}/{2}]\t Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t Loss {loss.val:f} ({loss.avg:f})\n'.format(
            epoch, step, len(train_loader), batch_time=batch_time, loss=losses)
        log_value(to_log)
        print(to_log)

    return losses.avg


def validate(val_loader, model, criterion, epoch):
    """Evaluate `model` on every batch of `val_loader`.

    Args:
        val_loader: iterable yielding (images, audio, target) batches.
        model: called with `[images, audio]`.
        criterion: loss callable taking (output, target).
        epoch: accepted for symmetry with `train`; not used here.

    Returns:
        Tuple (average loss, output of the last batch, images of the last
        batch, targets of the last batch).
    """
    batch_time = AverageMeter()
    losses = AverageMeter()

    model.eval()  # switch to evaluate mode

    tick = time.time()
    for step, batch in enumerate(val_loader):
        input_image, input_audio, target = batch
        image_var = Variable(input_image.type(gpu_dtype))
        audio_var = Variable(input_audio.type(gpu_dtype))
        target_var = Variable(target.type(gpu_dtype))

        # forward pass and loss
        output = model([image_var, audio_var])
        loss = criterion(output, target_var)
        losses.update(loss.data[0], input_image.size(0))

        # time spent on this batch
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % log_freq != 0:
            continue
        to_log = 'Val/Test: [{0}/{1}]\t Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t Loss {loss.val:f} ({loss.avg:f})\n'.format(
            step, len(val_loader), batch_time=batch_time, loss=losses)
        log_value(to_log)
        print(to_log)

    return losses.avg, output,input_image,target

# Train Script

In [12]:
# TODO: for larger dataset, consider a step function or exponential decay
def lr_scheduler(optimizer, epoch):
    """Placeholder schedule: hands back `optimizer` untouched and ignores `epoch`."""
    unchanged_optimizer = optimizer
    return unchanged_optimizer

In [14]:

# can retrieve these latest val results for plotting
val_preds,val_imgs,val_targets = None,None,None



def train_model(startModel=None, startEpoch=0, numEpochs=10):
    """Train the audio-visual model, validating and checkpointing every epoch.

    Args:
        startModel: optional path to a checkpoint to resume from.
        startEpoch: first epoch number; overridden by the checkpoint's 'epoch'
            entry when `startModel` is given.
        numEpochs: number of epochs to run.

    Side effects:
        Appends to the log files, writes checkpoints via `save_checkpoint`, and
        stores the last validation batch in the module-level `val_preds`,
        `val_imgs` and `val_targets`.
    """
    # Without this declaration the assignment in the epoch loop would create
    # locals and the module-level variables would stay None.
    global val_preds, val_imgs, val_targets

    model = AudioVisualLSTM().type(gpu_dtype)

    # MSE (L2) loss, matching the "L2loss" tag in exp_name
    criterion = nn.MSELoss().type(gpu_dtype)

    # only the parameters with requires_grad set are optimized; the frozen
    # pretrained weights are left out
    optimizer  = optim.SGD(get_learnable_params(model),lr=5e-5, momentum=0.9,weight_decay=5e-4)

    best_loss = 1000 # will get overwritten

    if startModel is not None:
        print("=> loading checkpoint '{}'".format(startModel))
        checkpoint = torch.load(startModel)
        startEpoch = checkpoint['epoch']
        best_loss = checkpoint['best_loss'] # for now because old loss is stale
        model.load_state_dict(checkpoint['state_dict'])
        # todo - figure out why not working
        #optimizer.load_state_dict(checkpoint['optimizer'])
        # print(optimizer.param_groups)

        print("=> loaded checkpoint '{}' (epoch {})"
              .format(startModel, checkpoint['epoch']))
    #else:
        # benchmark the model
        #best_loss = validate(dset_loaders['val'], model, criterion, startEpoch)

    for epoch in range(startEpoch,startEpoch+numEpochs):
        # train for one epoch
        train_loss = train(dset_loaders['train'], model, criterion, optimizer, epoch)

        # evaluate on validation set
        val_loss,val_preds,val_imgs,val_targets = validate(dset_loaders['val'], model, criterion, epoch)

        # log the epoch summary to file and to the cell output
        epoch_summary = 'Epoch: [{0}]\t Train Loss: {train_loss:f}  \t Val Loss: {val_loss:f}\n'.format(
            epoch, train_loss=train_loss, val_loss=val_loss)
        log_value(epoch_summary, './%s_epoch_log.txt'%exp_name)
        print(epoch_summary)

        # remember best loss and save checkpoint
        is_best = val_loss <= best_loss
        best_loss = min(val_loss, best_loss)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': 'resnet34_only',
            'state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer' : optimizer.state_dict(),
        }, is_best)
        print (is_best, best_loss)

    print ('Best Loss: ', best_loss)

## To start training a new model, run this cell

In [18]:
# train_model(startModel=None, startEpoch=0, numEpochs=1)

KeyboardInterrupt: 

## To train from the latest model checkpoint, run this cell and specify the start epoch

In [None]:
# Resume from the most recent checkpoint of this experiment.
startModel = (
    './resnet_for_rnn/'
    'avepool_dropout_L2loss_ResNet34_LSTM_experiment/'
    'checkpoint.pth.tar'
)
train_model(startModel=startModel, startEpoch=1, numEpochs=10)

=> loading checkpoint './resnet_for_rnn/avepool_dropout_ResNet34_LSTM_experiment/checkpoint.pth.tar'
=> loaded checkpoint './resnet_for_rnn/avepool_dropout_ResNet34_LSTM_experiment/checkpoint.pth.tar' (epoch 1)
Epoch: [1][0/600]	 Time 6.619 (6.619)	 Loss 0.020043 (0.020043)

Epoch: [1][10/600]	 Time 0.803 (1.247)	 Loss 0.008296 (0.017423)


# visualize the pred and groundtruth and images of the latest val batch

In [None]:
    
# Everything below needs the outputs of the most recent validation batch.
assert val_preds is not None, \
    'val_preds is empty: validation outputs must be stored in val_preds/val_imgs/val_targets before plotting'

val_pred = (val_preds.data).cpu().numpy()
val_gt = (Variable(val_targets).data).cpu().numpy()
# select a video in the batch to see; clamped so a short batch cannot raise IndexError
idx = min(7, len(val_pred) - 1)

trait_ticks = ['e','n','a','c','o']
for panel, (scores, panel_title) in enumerate(
        [(val_gt, 'ground truth'), (val_pred, 'prediction')], start=1):
    plt.subplot(1,2,panel)
    plt.bar(np.arange(5),scores[idx])
    plt.xticks(np.arange(5),trait_ticks)
    plt.gca().set_ylim([0.0,1.0])
    plt.title(panel_title)
plt.show()

val_unflattened_sample = val_imgs.view(-1,3,256,256)
# Make a grid from the num_partition frames of the selected video
plt.figure( figsize=(30, 3))
first_frame = idx*num_partition
out = torchvision.utils.make_grid(
    val_unflattened_sample[first_frame:first_frame+num_partition],nrow=num_partition)
# These frames come from the validation batch, so label them accordingly.
imshow(out, title='validation sample')
#plt.savefig('train_exp.png', bbox_inches='tight')