In [None]:
import os
import numpy as np
import torch 
print(torch.__version__)
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision
from torch.autograd import Variable
import matplotlib.pyplot as plt
from functions import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
import pickle
from pathlib import Path

In [None]:
import pandas as pd

# IMAGE MODEL

In [None]:
# ---- filesystem locations -------------------------------------------------
training_data_path = "../data/image_data/training_data/"
validation_data_path = "../data/image_data/validation_data/"
test_data_path = "../data/image_data/test_data/"
save_model_path = "./ResNetCRNN_ckpt/"   # checkpoints of the fused model are written here

# ---- EncoderCNN architecture ----------------------------------------------
CNN_fc_hidden1 = 1024
CNN_fc_hidden2 = 768
CNN_embed_dim = 512    # size of the per-frame embedding produced by the 2D CNN
res_size = 224         # frames are resized to res_size x res_size
dropout_p = 0.25       # dropout probability shared by all sub-networks

# ---- DecoderRNN architecture ----------------------------------------------
RNN_hidden_layers, RNN_hidden_nodes, RNN_FC_dim = 3, 512, 128

# ---- training parameters --------------------------------------------------
k = 6                  # number of regression targets (passed as num_classes)
epochs = 50
batch_size = 16
learning_rate = 3e-4
l_decay = 5e-4         # NOTE(review): not referenced anywhere in the visible notebook
log_interval = 100     # print training progress every `log_interval` batches

# ---- frame selection (passed to np.arange, so end_frame is exclusive) -----
begin_frame = 1
end_frame = 10
skip_frame = 1

In [None]:
def train(log_interval, model, device, train_loader, optimizer, epoch):
    """Run one training epoch of the fused video + audio regressor.

    Args:
        log_interval: print a progress line every `log_interval` batches.
        model: sequence of four modules, in order
            (cnn_encoder, rnn_decoder, audio_model, final_net).
        device: device every batch tensor is moved to.
        train_loader: iterable yielding (X, X_audio, y) batches.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index, used only for the progress message.

    Returns:
        (losses, scores): `losses` holds the L1 loss of every batch as a float;
        `scores` is always an empty list (no accuracy is computed).
    """
    cnn_encoder, rnn_decoder, audio_model, final_net = model
    for net in (cnn_encoder, rnn_decoder, audio_model, final_net):
        net.train()

    criterion = nn.L1Loss()
    losses, scores = [], []
    N_count = 0   # samples processed so far in this epoch
    for batch_idx, batch in enumerate(train_loader):
        X, X_audio, y = (tensor.to(device) for tensor in batch)
        N_count += X.size(0)

        optimizer.zero_grad()
        # Video branch and audio branch are concatenated along the feature axis
        # before the final fusion network.
        fused = torch.cat((rnn_decoder(cnn_encoder(X)), audio_model(X_audio)), 1)
        loss = criterion(final_net(fused), y)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

        batches_done = batch_idx + 1
        if batches_done % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch + 1, N_count, len(train_loader.dataset),
                100. * batches_done / len(train_loader), loss.item()))

    return losses, scores

In [None]:
def validation(model, device, optimizer, test_loader, epoch=None, save_dir=None):
    """Evaluate the fused model on `test_loader` and checkpoint every sub-network.

    Args:
        model: sequence of four modules, in order
            (cnn_encoder, rnn_decoder, audio_model, final_net).
        device: device every batch tensor is moved to.
        optimizer: optimizer whose state is saved next to the model weights.
        test_loader: iterable yielding (X, X_audio, y) batches.
        epoch: zero-based epoch index used in the checkpoint file names.
            When omitted, the notebook-level variable `epoch` is used, which
            keeps existing call sites working.
        save_dir: directory for the checkpoints. Defaults to the notebook-level
            `save_model_path`. Created if it does not exist.

    Returns:
        Mean absolute error averaged over all evaluated samples.

    Raises:
        NameError: if `epoch` is omitted and no notebook-level `epoch` exists.
    """
    cnn_encoder, rnn_decoder, audio_model, final_net = model
    for net in (cnn_encoder, rnn_decoder, audio_model, final_net):
        net.eval()

    if epoch is None:
        # Backward compatibility: older callers rely on the global loop variable.
        try:
            epoch = globals()["epoch"]
        except KeyError:
            raise NameError("name 'epoch' is not defined; pass epoch=... explicitly") from None
    if save_dir is None:
        save_dir = save_model_path

    criterion = nn.L1Loss()
    abs_err_sum = 0.0   # sum over samples of the per-sample mean absolute error
    n_samples = 0
    with torch.no_grad():
        for X, X_audio, y in test_loader:
            X, X_audio, y = X.to(device), X_audio.to(device), y.to(device)

            video_output = rnn_decoder(cnn_encoder(X))
            audio_output = audio_model(X_audio)
            output = final_net(torch.cat((video_output, audio_output), 1))

            # nn.L1Loss averages over the batch, so weight each batch by its size;
            # summing the raw batch means and dividing by the sample count would
            # under-report the loss by roughly a factor of the batch size.
            batch_n = y.size(0)
            abs_err_sum += criterion(output, y).item() * batch_n
            n_samples += batch_n

    test_loss = abs_err_sum / n_samples if n_samples else 0.0

    print('\nTest set ({:d} samples): Average loss: {:.4f}, \n'.format(n_samples, test_loss))

    # Checkpoint every sub-network plus the optimizer (done on every call,
    # not only for the best epoch).
    os.makedirs(save_dir, exist_ok=True)
    torch.save(cnn_encoder.state_dict(), os.path.join(save_dir, 'cnn_encoder_epoch{}.pth'.format(epoch + 1)))
    torch.save(rnn_decoder.state_dict(), os.path.join(save_dir, 'rnn_decoder_epoch{}.pth'.format(epoch + 1)))
    torch.save(audio_model.state_dict(), os.path.join(save_dir, 'audio_model_epoch{}.pth'.format(epoch + 1)))
    torch.save(final_net.state_dict(), os.path.join(save_dir, 'final_net_epoch{}.pth'.format(epoch + 1)))
    torch.save(optimizer.state_dict(), os.path.join(save_dir, 'optimizer_epoch{}.pth'.format(epoch + 1)))
    print("Epoch {} model saved!".format(epoch + 1))

    return test_loss


In [None]:
def CRNN_final_prediction(model, device, loader):
    """Run the fused model over `loader` and collect the raw outputs.

    Args:
        model: sequence of four modules, in order
            (cnn_encoder, rnn_decoder, audio_model, final_net).
        device: device the inputs are moved to.
        loader: iterable yielding (X, X_audio, y) batches; `y` is ignored.

    Returns:
        List with one output tensor per batch, left on `device`.
    """
    cnn_encoder, rnn_decoder, audio_model, final_net = model
    for net in (cnn_encoder, rnn_decoder, audio_model, final_net):
        net.eval()

    all_y_pred = []
    with torch.no_grad():
        for X, X_audio, _ in tqdm(loader):
            X = X.to(device)
            X_audio = X_audio.to(device)
            # Concatenate the video and audio branch outputs along the feature axis
            fused = torch.cat((rnn_decoder(cnn_encoder(X)), audio_model(X_audio)), 1)
            all_y_pred.append(final_net(fused))

    return all_y_pred

In [None]:
# Pick the compute device: CUDA when a GPU is available, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(device)

## Image data preparation

In [None]:
# DataLoader settings: `params` shuffles, `params2` keeps the dataset order.
# Batch size and shuffling must apply on every device; only memory pinning
# depends on CUDA. (Falling back to an empty dict on CPU would silently use
# DataLoader's defaults: batch_size=1 and shuffle=False.)
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 0, 'pin_memory': use_cuda}
params2 = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 0, 'pin_memory': use_cuda}

In [None]:
# Per-frame preprocessing: resize to a square, convert to a tensor, then
# normalise each channel with the fixed mean / std below.
_preprocess_steps = [
    transforms.Resize([res_size, res_size]),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(_preprocess_steps)

# Frame indices to read from each video; np.arange excludes `end_frame` itself
selected_frames = np.arange(start=begin_frame, stop=end_frame, step=skip_frame).tolist()

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # pickle.load executes arbitrary code from the file; only use trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Labels and video ids for each split
train_label = _load_pickle('../data/image_data/pickle_files/y_5d_training_all.pkl')
train_list = _load_pickle('../data/image_data/pickle_files/vid_ids_5d_training.pkl')

val_label = _load_pickle('../data/image_data/pickle_files/y_5d_validation_all.pkl')
val_list = _load_pickle('../data/image_data/pickle_files/vid_ids_5d_validation.pkl')

test_label = _load_pickle('../data/image_data/pickle_files/y_5d_test_all.pkl')
test_list = _load_pickle('../data/image_data/pickle_files/vid_ids_5d_test.pkl')

## Audio data preparation

In [None]:
# Audio feature tables, one CSV per split
training_df, validation_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/audio_data/pickle_files/training_df_all.csv",
        "../data/audio_data/pickle_files/validation_df_all.csv",
        "../data/audio_data/pickle_files/test_df_all.csv",
    )
)
# Directory for checkpoints of the stand-alone audio model
save_audio_model_path = "./audio_ckpt/"

In [None]:
# Target columns; everything except these and the id column is a feature
label_columns = ['interview_score', 'extraversion', 'agreeableness', 'conscientiousness', 'neuroticism', 'openness']
non_feature_columns = label_columns + ['video_id']

X_train = training_df.drop(columns=non_feature_columns)
X_val = validation_df.drop(columns=non_feature_columns)
X_test = test_df.drop(columns=non_feature_columns)

Y_train = training_df[label_columns]
Y_val = validation_df[label_columns]
Y_test = test_df[label_columns]

In [None]:
# Replace each DataFrame with its underlying NumPy array
X_train, X_val, X_test = (frame.values for frame in (X_train, X_val, X_test))
Y_train, Y_val, Y_test = (frame.values for frame in (Y_train, Y_val, Y_test))

In [None]:
# Stack train / validation / test features into one array. The hard-coded
# shape doubles as a sanity check: a split with unexpected dimensions fails here.
tot_data = np.zeros((10000, 42))

tot_data[0:6000] = X_train
tot_data[6000:8000] = X_val
tot_data[8000:10000] = X_test

# Min-max scale the first six feature columns to [0, 1].
# NOTE(review): only 6 of the 42 columns are scaled, and the min / max are
# computed over all three splits (including test) — confirm both are intended.
for i in range(6):
    max_val = np.max(tot_data[:, i])
    min_val = np.min(tot_data[:, i])
    tot_data[:, i] -= min_val
    value_range = max_val - min_val
    # A constant column has zero range; dividing would fill it with NaN,
    # so leave it at zero instead.
    if value_range != 0:
        tot_data[:, i] /= value_range

In [None]:
# Cut the scaled array back into the three splits (same row ranges as when stacking)
X_train, X_val, X_test = tot_data[:6000], tot_data[6000:8000], tot_data[8000:10000]

In [None]:
# float32 tensors of the audio features and of the targets
X_audio_train = torch.tensor(X_train, dtype=torch.float32)
X_audio_val = torch.tensor(X_val, dtype=torch.float32)
X_audio_test = torch.tensor(X_test, dtype=torch.float32)

Y_audio_train = torch.tensor(Y_train, dtype=torch.float32)
Y_audio_val = torch.tensor(Y_val, dtype=torch.float32)
Y_audio_test = torch.tensor(Y_test, dtype=torch.float32)

In [None]:
# Audio branch of the fused model, created with the shared dropout probability
audio_model = Audio_Model(drop_p=dropout_p)
audio_model = audio_model.to(device)

In [None]:
# Parameters of the audio model's fc1 and fc2 layers
# NOTE(review): not used by the fused-model optimizer in the visible notebook
audio_params = [*audio_model.fc1.parameters(), *audio_model.fc2.parameters()]

In [None]:
# Datasets combining the video frames with the matching audio features
train_set = Dataset_CRNN(training_data_path, train_list, train_label, selected_frames, X_audio_train,
                         transform=transform)
valid_set = Dataset_CRNN(validation_data_path, val_list, val_label, selected_frames, X_audio_val,
                         transform=transform)

# Both loaders are built from the same `params` dict
train_loader = data.DataLoader(dataset=train_set, **params)
valid_loader = data.DataLoader(dataset=valid_set, **params)

In [None]:
# Test split, loaded with the `params2` settings
test_set = Dataset_CRNN(
    test_data_path, test_list, test_label, selected_frames, X_audio_test,
    transform=transform,
)
test_loader = data.DataLoader(dataset=test_set, **params2)

In [None]:
# Video branch: per-frame CNN encoder followed by an RNN decoder over the frame sequence
encoder_kwargs = dict(fc_hidden1=CNN_fc_hidden1, fc_hidden2=CNN_fc_hidden2,
                      drop_p=dropout_p, CNN_embed_dim=CNN_embed_dim)
decoder_kwargs = dict(CNN_embed_dim=CNN_embed_dim, h_RNN_layers=RNN_hidden_layers,
                      h_RNN=RNN_hidden_nodes, h_FC_dim=RNN_FC_dim,
                      drop_p=dropout_p, num_classes=k)

cnn_encoder = ResCNNEncoder(**encoder_kwargs).to(device)
rnn_decoder = DecoderRNN(**decoder_kwargs).to(device)

In [None]:
# Fusion network applied to the concatenated video and audio outputs
final_net = Final_Net(drop_p=dropout_p)
final_net = final_net.to(device)

In [None]:
# Wrap the video branch for multi-GPU execution when more than one GPU is visible.
n_gpus = torch.cuda.device_count()
if n_gpus > 1:
    print("Using", n_gpus, "GPUs!")
    # The isinstance guards keep the cell idempotent: re-running must not double-wrap.
    if not isinstance(cnn_encoder, nn.DataParallel):
        cnn_encoder = nn.DataParallel(cnn_encoder)
    if not isinstance(rnn_decoder, nn.DataParallel):
        rnn_decoder = nn.DataParallel(rnn_decoder)
elif n_gpus == 1:
    print("Using", n_gpus, "GPU!")

# DataParallel exposes the wrapped network as `.module`
encoder_core = cnn_encoder.module if isinstance(cnn_encoder, nn.DataParallel) else cnn_encoder

# Parameters handed to the optimizer: the encoder's fc1/bn1/fc2/bn2/fc3 layers,
# the whole decoder, the audio model and the fusion network. The same list is
# built for every device configuration, so the audio and fusion networks are
# also trained on multi-GPU machines and `crnn_params` exists on CPU-only ones.
crnn_params = list(encoder_core.fc1.parameters()) + list(encoder_core.bn1.parameters()) + \
              list(encoder_core.fc2.parameters()) + list(encoder_core.bn2.parameters()) + \
              list(encoder_core.fc3.parameters()) + list(rnn_decoder.parameters()) + \
              list(audio_model.parameters()) + list(final_net.parameters())

In [None]:
# Adam over the collected parameters
# NOTE(review): `l_decay` from the config cell is not passed here — confirm
optimizer = torch.optim.Adam(params=crnn_params, lr=learning_rate)

In [None]:
# Per-epoch history containers (four independent lists)
epoch_train_losses, epoch_train_scores = [], []
epoch_test_losses, epoch_test_scores = [], []

In [None]:
# The loop variable must stay named `epoch`: validation() is called without an
# epoch argument and picks the value up from the notebook namespace.
for epoch in range(epochs):
    fused_model = [cnn_encoder, rnn_decoder, audio_model, final_net]

    # Train for one epoch, then evaluate and checkpoint. There is no
    # `if __name__ == '__main__'` guard: when it was false the results below
    # were never assigned and the appends failed with a NameError.
    train_losses, train_scores = train(log_interval, fused_model, device, train_loader, optimizer, epoch)
    # NOTE(review): the per-epoch evaluation uses test_loader; valid_loader is
    # created but not used in the visible notebook — confirm this is intended.
    epoch_test_loss = validation(fused_model, device, optimizer, test_loader)

    # Record this epoch's results
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)
    epoch_test_losses.append(epoch_test_loss)

    # Persist the histories after every epoch so an interrupted run keeps them
    A = np.array(epoch_train_losses)
    C = np.array(epoch_test_losses)
    np.save('./CRNN_epoch_training_lossesMAE.npy', A)
    np.save('./CRNN_epoch_test_lossMAE.npy', C)

In [None]:
# Rebuild the test loader for inference. The networks are trained on
# Dataset_CRNN samples, so the test set must use the same dataset class
# (the argument list is identical to the one used for the other splits).
test_set = Dataset_CRNN(test_data_path, test_list, test_label, selected_frames, X_audio_test, transform=transform)
test_loader = data.DataLoader(test_set, **params2)

In [None]:
# Restore every sub-network from the checkpoints written after epoch 20.
# map_location lets checkpoints saved on a GPU load on a CPU-only machine.
eval_epoch = 20
for net, stem in ((cnn_encoder, 'cnn_encoder'), (rnn_decoder, 'rnn_decoder'),
                  (audio_model, 'audio_model'), (final_net, 'final_net')):
    ckpt_path = os.path.join(save_model_path, '{}_epoch{}.pth'.format(stem, eval_epoch))
    net.load_state_dict(torch.load(ckpt_path, map_location=device))

In [None]:
# Per-batch predictions of the fused model on the test loader
fused_model = [cnn_encoder, rnn_decoder, audio_model, final_net]
all_y_pred = CRNN_final_prediction(fused_model, device, test_loader)

In [None]:
# Stack the per-batch outputs into one (n_samples, 6) float64 array.
# Concatenating once avoids a device-to-host copy per row, takes the sample
# count from the data instead of hard-coding it, and no longer overwrites the
# configuration constant `k` with a row counter.
stacked_pred = torch.cat(all_y_pred, dim=0).cpu().numpy()
preds = np.zeros((stacked_pred.shape[0], 6))
preds[:] = stacked_pred

In [None]:
# Inspect the assembled prediction array
print(preds)

In [None]:
# Inspect the test labels loaded from the pickle file
print(test_label)

# Eval Metrics IMAGE

In [None]:
# Per-target score: 1 - (sum of absolute errors / 2000), for each of the 6 targets
acc = np.zeros(6)
for ind in range(6):
    diff = abs(preds[:, ind] - test_label[:, ind])
    acc[ind] = 1 - np.sum(diff) / 2000

print(acc)
print(np.mean(acc))

In [None]:
# Error of the model on target column `ind`, relative to always predicting
# the mean label of that column
ind = 0

target_col = test_label[:, ind]
diff = abs(preds[:, ind] - target_col)
avg = np.mean(target_col)
print(avg)
avg_diff = abs(target_col - avg)

# NOTE(review): the ratio of the two sums is already scale-free; the extra
# division by 2000 pushes the result very close to 1 — confirm it is intended.
acc = 1 - (np.sum(diff) / np.sum(avg_diff)) / 2000
print(acc)

# INDEPENDENT AUDIO MODEL TESTS

In [None]:
# Audio feature tables, one CSV per split
training_df, validation_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/audio_data/pickle_files/training_df_all.csv",
        "../data/audio_data/pickle_files/validation_df_all.csv",
        "../data/audio_data/pickle_files/test_df_all.csv",
    )
)
# Directory for checkpoints of the stand-alone audio model
save_audio_model_path = "./audio_ckpt/"

In [None]:
# Video ids in the row order of the audio test table
audio_lb = test_df.loc[:, 'video_id'].values

In [None]:
# Check that the audio table and the image id list are in the same order;
# prints 'False' once for every mismatching position among the first 2000 rows.
for row_idx in range(2000):
    ids_match = audio_lb[row_idx] == test_list[row_idx]
    if not ids_match:
        print('False')

In [None]:
# Target columns; everything except these and the id column is a feature
label_columns = ['interview_score', 'extraversion', 'agreeableness', 'conscientiousness', 'neuroticism', 'openness']
non_feature_columns = label_columns + ['video_id']

X_train = training_df.drop(columns=non_feature_columns)
X_val = validation_df.drop(columns=non_feature_columns)
X_test = test_df.drop(columns=non_feature_columns)

Y_train = training_df[label_columns]
Y_val = validation_df[label_columns]
Y_test = test_df[label_columns]

In [None]:
# Replace each DataFrame with its underlying NumPy array
X_train, X_val, X_test = (frame.values for frame in (X_train, X_val, X_test))
Y_train, Y_val, Y_test = (frame.values for frame in (Y_train, Y_val, Y_test))

In [None]:
# Stack train / validation / test features into one array. The hard-coded
# shape doubles as a sanity check: a split with unexpected dimensions fails here.
tot_data = np.zeros((10000, 42))

tot_data[0:6000] = X_train
tot_data[6000:8000] = X_val
tot_data[8000:10000] = X_test

# Min-max scale the first six feature columns to [0, 1].
# NOTE(review): only 6 of the 42 columns are scaled, and the min / max are
# computed over all three splits (including test) — confirm both are intended.
for i in range(6):
    max_val = np.max(tot_data[:, i])
    min_val = np.min(tot_data[:, i])
    tot_data[:, i] -= min_val
    value_range = max_val - min_val
    # A constant column has zero range; dividing would fill it with NaN,
    # so leave it at zero instead.
    if value_range != 0:
        tot_data[:, i] /= value_range

In [None]:
# Cut the scaled array back into the three splits (same row ranges as when stacking)
X_train, X_val, X_test = tot_data[:6000], tot_data[6000:8000], tot_data[8000:10000]

In [None]:
# float32 tensors of the audio features and of the targets, created on `device`
X_audio_train = torch.tensor(X_train, dtype=torch.float32, device=device)
X_audio_val = torch.tensor(X_val, dtype=torch.float32, device=device)
X_audio_test = torch.tensor(X_test, dtype=torch.float32, device=device)

Y_audio_train = torch.tensor(Y_train, dtype=torch.float32, device=device)
Y_audio_val = torch.tensor(Y_val, dtype=torch.float32, device=device)
Y_audio_test = torch.tensor(Y_test, dtype=torch.float32, device=device)

In [None]:
# Confirm where the training tensor lives. `X_train` is still a NumPy array
# at this point; the tensor is `X_audio_train`.
X_audio_train.device

In [None]:
# Fresh audio model with its default constructor arguments, for the stand-alone test
audio_model = Audio_Model()
audio_model = audio_model.to(device)

In [None]:
# Parameters of the audio model's fc1 and fc2 layers, to be optimised below
audio_params = [*audio_model.fc1.parameters(), *audio_model.fc2.parameters()]

In [None]:
# Training length and optimizer for the stand-alone audio model
audio_epochs = 100
optimizer_audio = torch.optim.Adam(params=audio_params, lr=1e-3)

In [None]:
# Full-batch training of the stand-alone audio model with an RMSE objective.
# The model is fed the tensors (X_audio_*, Y_audio_*); X_train / Y_train and
# X_val / Y_val are NumPy arrays and cannot be passed to the network or loss.
criterion = nn.MSELoss()
os.makedirs(save_audio_model_path, exist_ok=True)   # torch.save does not create directories

for epoch in range(audio_epochs):
    audio_model.train()
    optimizer_audio.zero_grad()
    train_ops = audio_model(X_audio_train)
    loss = torch.sqrt(criterion(train_ops, Y_audio_train))
    loss.backward()
    optimizer_audio.step()

    print('Train Epoch {}\tLoss: {:.6f}' .format(epoch+1, loss.item()))

    # Validation pass: no gradients are needed, so skip building the graph
    audio_model.eval()
    with torch.no_grad():
        val_ops = audio_model(X_audio_val)
        val_loss = torch.sqrt(criterion(val_ops, Y_audio_val))

    print('Val loss {:.6f}' .format(val_loss.item()))

    # Checkpoint the audio model after every epoch
    torch.save(audio_model.state_dict(), os.path.join(save_audio_model_path, 'audio_epoch{}.pth'.format(epoch + 1)))
    print("Epoch {} model saved!".format(epoch + 1))

In [None]:
# Restore the audio model saved after epoch 100.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
audio_model.load_state_dict(
    torch.load(os.path.join(save_audio_model_path, 'audio_epoch100.pth'), map_location=device))

In [None]:
# Inference on the test split. The model needs the tensor X_audio_test
# (X_test is a NumPy array); no_grad avoids building an autograd graph.
audio_model.eval()
with torch.no_grad():
    test_ops = audio_model(X_audio_test)