In [0]:
import os
import numpy as np
import torch 
print(torch.__version__)
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision
from torch.autograd import Variable
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
import pickle
from pathlib import Path
from ipywidgets import FloatProgress

In [0]:
# Expose only physical GPU 1 to this process. The variable has to be set before
# CUDA is first initialised (i.e. before anything is moved to a GPU).
# The former bare `torch.device(1)` expression was removed: it only constructed
# a device object and threw it away, so it never selected a GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [0]:
import pandas as pd

In [0]:
%matplotlib inline

# IMAGE FUNCTIONS

In [0]:
import os
import numpy as np
from PIL import Image
from torch.utils import data
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm
from pathlib import Path
import random

In [0]:
class Dataset_CRNN(data.Dataset):
    """Dataset that serves one sample as a stack of per-frame tensors.

    Every entry of ``folders`` names a directory (joined onto ``data_path``)
    that contains ``frame1.jpg`` ... ``frame20.jpg``.

    Args:
        data_path: root that is joined in front of every folder name.
        folders: sequence of per-sample frame directories.
        labels: indexable collection; ``labels[i]`` is the target vector of
            sample ``i``.
        frames: frame indices requested by the caller. Stored on the instance
            but NOT consulted when reading -- ``NUM_FRAMES`` frames are always
            loaded (NOTE(review): confirm whether ``frames`` should drive the
            selection instead).
        transform: optional callable applied to each PIL image. It must return
            a tensor, otherwise the frames cannot be stacked.
    """

    # Number of frames read per sample: frame1.jpg .. frame<NUM_FRAMES>.jpg.
    NUM_FRAMES = 20

    def __init__(self, data_path, folders, labels, frames, transform=None):
        """Store the sample index and the per-frame transform."""
        self.data_path = data_path
        self.labels = labels
        self.folders = folders
        self.transform = transform
        self.frames = frames
        self.factor = 5  # not read anywhere in this class

    def __len__(self):
        """Return the number of samples (one per folder)."""
        return len(self.folders)

    def read_images(self, path, selected_folder, use_transform):
        """Load the frames of one sample and stack them along a new dim 0.

        Args:
            path: root directory.
            selected_folder: sample directory below ``path``.
            use_transform: callable applied to every frame, or ``None``.

        Returns:
            Tensor whose first dimension has ``NUM_FRAMES`` entries.
        """
        frame_dir = Path(path, selected_folder)
        loaded = []
        for number in range(1, self.NUM_FRAMES + 1):
            # The context manager closes the file handle as soon as the frame
            # has been transformed instead of leaving it to garbage collection.
            with Image.open(frame_dir / 'frame{}.jpg'.format(number)) as image:
                frame = use_transform(image) if use_transform is not None else image
            loaded.append(frame)

        return torch.stack(loaded, dim=0)

    def __getitem__(self, index):
        """Return ``(frames, target)`` for the sample at ``index``."""
        folder = self.folders[index]

        X = self.read_images(self.data_path, folder, self.transform)
        y = torch.FloatTensor(self.labels[index])

        return X, y


In [0]:
# 2D CNN encoder using ResNet-50 pretrained
class ResCNNEncoder(nn.Module):
    """Per-frame encoder: pretrained ResNet-50 trunk plus a three-layer FC head.

    The trunk is ``torchvision``'s ResNet-50 with its final ``fc`` layer
    removed. The head maps the flattened trunk output to ``CNN_embed_dim``
    features through two hidden layers with batch norm, ReLU and dropout.

    Args:
        fc_hidden1: width of the first hidden FC layer.
        fc_hidden2: width of the second hidden FC layer.
        drop_p: dropout probability applied before the last FC layer.
        CNN_embed_dim: size of the embedding produced for each frame.
    """

    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        super().__init__()

        self.fc_hidden1 = fc_hidden1
        self.fc_hidden2 = fc_hidden2
        self.drop_p = drop_p

        backbone = models.resnet50(pretrained=True)
        # Keep every child module except the last one (the classification layer).
        trunk_layers = list(backbone.children())[:-1]
        self.resnet = nn.Sequential(*trunk_layers)

        self.fc1 = nn.Linear(backbone.fc.in_features, fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)

    def forward(self, x_3d):
        """Embed every time step of ``x_3d`` (indexed as ``x_3d[:, t, :, :, :]``).

        Returns:
            Tensor of shape (batch, time_step, CNN_embed_dim).
        """
        per_step = []
        n_steps = x_3d.size(1)
        for step in range(n_steps):
            # The trunk runs without gradient tracking, so no gradient reaches
            # its weights from here.
            with torch.no_grad():
                feats = self.resnet(x_3d[:, step, :, :, :])
                feats = feats.view(feats.size(0), -1)   # flatten conv output

            hidden = F.relu(self.bn1(self.fc1(feats)))
            hidden = F.relu(self.bn2(self.fc2(hidden)))
            hidden = F.dropout(hidden, p=self.drop_p, training=self.training)
            per_step.append(self.fc3(hidden))

        # Stacked as (time, batch, dim); swap to (batch, time, dim).
        stacked = torch.stack(per_step, dim=0)
        return stacked.transpose(0, 1)


In [0]:
class DecoderRNN(nn.Module):
    """LSTM over a sequence of frame embeddings followed by a two-layer head.

    Args:
        CNN_embed_dim: size of each input embedding (LSTM input size).
        h_RNN_layers: number of stacked LSTM layers.
        h_RNN: LSTM hidden size.
        h_FC_dim: width of the hidden FC layer.
        drop_p: dropout probability applied after the hidden FC layer.
        num_classes: number of output values per sample.
    """

    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=50):
        super().__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers
        self.h_RNN = h_RNN
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        # batch_first=True: tensors are laid out as (batch, time_step, feature).
        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,
            num_layers=h_RNN_layers,
            batch_first=True,
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x_RNN):
        """Map ``x_RNN`` (batch, time_step, input_size) to (batch, num_classes).

        The outputs pass through a sigmoid, so every value lies in [0, 1].
        """
        self.LSTM.flatten_parameters()
        # Passing None starts the LSTM from a zero hidden/cell state.
        seq_out, _final_state = self.LSTM(x_RNN, None)

        # Only the output at the last time step feeds the head.
        last_step = seq_out[:, -1, :]
        hidden = F.relu(self.fc1(last_step))
        hidden = F.dropout(hidden, p=self.drop_p, training=self.training)
        return self.sigmoid(self.fc2(hidden))

# IMAGE MODEL

In [0]:
# ---- data / checkpoint locations ----
# Each *_data_path is joined in front of the per-sample folder taken from the
# label CSVs; with an empty string the CSV paths are used exactly as written.
training_data_path = ""       # e.g. "../data/image_data/training_data/"
validation_data_path = ""
test_data_path = ""
save_model_path = "./ResNetCRNN_ckpt/"

# ---- CNN encoder ----
CNN_fc_hidden1 = 1024
CNN_fc_hidden2 = 768
CNN_embed_dim = 512   # size of the per-frame embedding produced by the 2D CNN
res_size = 224        # frames are resized to res_size x res_size
dropout_p = 0.25      # dropout probability

# ---- RNN decoder ----
RNN_hidden_layers = 3
RNN_hidden_nodes = 512
RNN_FC_dim = 256

# ---- training ----
k = 6                 # number of target values predicted per sample
epochs = 50           # training epochs
batch_size = 16 * 8
learning_rate = 1e-5
l_decay = 5e-4        # NOTE(review): not passed to the optimizer in the visible cells
log_interval = 10     # print training info every `log_interval` batches

# ---- frame selection ----
begin_frame = 1
end_frame = 20
skip_frame = 1

In [0]:
def train(log_interval, model, device, train_loader, optimizer, epoch):
    """Run one optimisation pass over ``train_loader``.

    Args:
        log_interval: print a progress line every ``log_interval`` batches.
        model: pair ``(cnn_encoder, rnn_decoder)``; both are put in train mode.
        device: device the batches are moved to.
        train_loader: iterable of ``(X, y)`` batches; ``len()`` and
            ``.dataset`` are used for the progress line.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index (printed as ``epoch + 1``).

    Returns:
        float: L1 loss summed over every element of every batch. It is a
        total, not an average -- ``validation`` in this file, by contrast,
        divides its total by the number of samples.
    """
    cnn_encoder, rnn_decoder = model
    cnn_encoder.train()
    rnn_decoder.train()

    # Built once: the criterion does not change between batches.
    criterion = nn.L1Loss(reduction='sum')

    train_loss = 0
    N_count = 0   # samples seen so far in this epoch
    for batch_idx, (X, y) in enumerate(train_loader):
        X, y = X.to(device), y.to(device)
        N_count += X.size(0)

        optimizer.zero_grad()
        output = rnn_decoder(cnn_encoder(X))   # (batch, number of outputs)

        loss = criterion(output, y)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch + 1, N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item()))

    return train_loss

In [0]:
def validation(model, device, optimizer, test_loader):
    """Evaluate the encoder/decoder pair on ``test_loader``.

    Args:
        model: pair ``(cnn_encoder, rnn_decoder)``; both are put in eval mode.
        device: device the batches are moved to.
        optimizer: unused; kept so existing callers keep working.
        test_loader: iterable of ``(X, y)`` batches exposing ``.dataset``.

    Returns:
        tuple: ``(average_loss, scores)`` where ``average_loss`` is the summed
        L1 loss divided by the number of samples in ``test_loader.dataset`` and
        ``scores`` is an array holding ``1 - mean_absolute_error`` for each
        target column.
    """
    cnn_encoder, rnn_decoder = model
    cnn_encoder.eval()
    rnn_decoder.eval()

    # Built once: the criterion does not change between batches.
    criterion = nn.L1Loss(reduction='sum')

    test_loss = 0
    all_y = []
    all_y_pred = []
    with torch.no_grad():
        for (X, y) in test_loader:
            X, y = X.to(device), y.to(device)

            output = rnn_decoder(cnn_encoder(X))   # (batch, number of outputs)
            test_loss += criterion(output, y).item()

            # Collect one row per sample on the host.
            all_y.extend(y.cpu().detach().numpy())
            all_y_pred.extend(output.cpu().detach().numpy())

    test_loss /= len(test_loader.dataset)

    print('\nValid set ({:d} samples): Average loss: {:.8f}, \n'.format(len(all_y), test_loss))

    all_y = np.asarray(all_y)
    all_y_pred = np.asarray(all_y_pred)

    # One score per target column: 1 - MAE.
    score = [1 - mean_absolute_error(all_y[:, col], all_y_pred[:, col])
             for col in range(all_y.shape[1])]

    print('Validation scores' + str(score))

    return test_loss, np.asarray(score)


In [0]:
def CRNN_final_prediction(model, device, loader):
    """Run the encoder/decoder pair over ``loader`` and collect raw outputs.

    Args:
        model: pair ``(cnn_encoder, rnn_decoder)``; both are put in eval mode.
        device: device the inputs are moved to.
        loader: iterable of ``(X, y)`` batches; the targets are ignored.

    Returns:
        list: one output tensor per batch, in loader order, left on ``device``.
    """
    encoder, decoder = model
    encoder.eval()
    decoder.eval()

    batch_outputs = []
    with torch.no_grad():
        for inputs, _targets in tqdm(loader):
            inputs = inputs.to(device)
            # (batch, number of outputs)
            batch_outputs.append(decoder(encoder(inputs)))

    return batch_outputs

In [0]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(device)

## Image data preparation

In [0]:
# DataLoader keyword arguments. `params` shuffles (training); `params2` keeps the
# dataset order (evaluation / prediction).
# Previously both dicts collapsed to {} without CUDA, which silently fell back
# to the DataLoader defaults (batch_size=1, shuffle=False). Only pinned host
# memory is CUDA-specific, so that is the only setting tied to `use_cuda`.
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 0, 'pin_memory': use_cuda}
params2 = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 0, 'pin_memory': use_cuda}

In [0]:
# Per-frame preprocessing: resize to res_size x res_size, convert to a tensor and
# normalise each channel with the mean/std below.
transform = transforms.Compose([transforms.Resize([res_size, res_size]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

# np.arange excludes its stop value, so `end_frame + 1` is needed for the list to
# contain begin_frame..end_frame inclusive (1..20), matching the frame1..frame20
# files that Dataset_CRNN.read_images loads.
selected_frames = np.arange(begin_frame, end_frame + 1, skip_frame).tolist()

In [0]:
# Target columns read from every label CSV.
LABEL_COLUMNS = ['o', 'c', 'e', 'a', 'n', 'i']


def load_path_labels(csv_path, label_columns=LABEL_COLUMNS):
    """Read one split's CSV of sample paths and target values.

    Args:
        csv_path: CSV file with a ``path`` column plus the ``label_columns``.
        label_columns: names of the target columns to extract.

    Returns:
        tuple: ``(dataframe, paths, labels)`` -- the full DataFrame, the
        ``path`` column as a list, and the target columns as a 2-D array.
    """
    frame = pd.read_csv(csv_path)
    return frame, frame['path'].values.tolist(), frame[label_columns].values


train_label_df, train_list, train_label = load_path_labels('./training_path_labels.csv')
valid_label_df, valid_list, valid_label = load_path_labels('./validation_path_labels.csv')
test_label_df, test_list, test_label = load_path_labels('./test_path_labels.csv')

# Begin training

In [0]:
# Wrap the training and validation splits in datasets that load each sample's frames.
train_set = Dataset_CRNN(training_data_path, train_list, train_label, selected_frames, transform=transform)
valid_set = Dataset_CRNN(validation_data_path, valid_list, valid_label, selected_frames, transform=transform)

train_loader = data.DataLoader(train_set, **params)
# Validation uses the non-shuffling settings (`params2`): evaluation does not
# need a random order, and a fixed order keeps its results reproducible.
valid_loader = data.DataLoader(valid_set, **params2)

In [0]:
# Test split, iterated in dataset order (params2 does not shuffle).
test_set = Dataset_CRNN(
    data_path=test_data_path,
    folders=test_list,
    labels=test_label,
    frames=selected_frames,
    transform=transform,
)
test_loader = data.DataLoader(dataset=test_set, **params2)

In [0]:
# Build both halves of the model from the configuration constants and move them
# to the selected device.
cnn_encoder = ResCNNEncoder(
    fc_hidden1=CNN_fc_hidden1,
    fc_hidden2=CNN_fc_hidden2,
    drop_p=dropout_p,
    CNN_embed_dim=CNN_embed_dim,
).to(device)

rnn_decoder = DecoderRNN(
    CNN_embed_dim=CNN_embed_dim,
    h_RNN_layers=RNN_hidden_layers,
    h_RNN=RNN_hidden_nodes,
    h_FC_dim=RNN_FC_dim,
    drop_p=dropout_p,
    num_classes=k,
).to(device)

In [0]:
# Warm-start from the previously saved best weights (both files must already exist).
# map_location lets a checkpoint written on one device be restored on whatever
# `device` is in use now, including CPU-only machines.
cnn_encoder.load_state_dict(
    torch.load(os.path.join(save_model_path, 'cnn_encoder_best.pth'), map_location=device))
rnn_decoder.load_state_dict(
    torch.load(os.path.join(save_model_path, 'rnn_decoder_best.pth'), map_location=device))

In [0]:
print("Using", torch.cuda.device_count(), "GPU!")

# Parameters of the encoder's FC head followed by every decoder parameter,
# gathered in the same order as the modules are listed here.
head_modules = [
    cnn_encoder.fc1, cnn_encoder.bn1,
    cnn_encoder.fc2, cnn_encoder.bn2,
    cnn_encoder.fc3, rnn_decoder,
]
crnn_params = [param for module in head_modules for param in module.parameters()]

In [0]:
# Two parameter groups: the ResNet trunk with its own, smaller learning rate and
# the FC head + decoder at the default `learning_rate`.
# NOTE(review): ResCNNEncoder.forward runs the trunk under torch.no_grad(), so
# the first group receives no gradients from the training loop as written.
param_groups = [
    {"params": cnn_encoder.resnet.parameters(), "lr": 1e-6},
    {"params": crnn_params},
]
optimizer = torch.optim.Adam(param_groups, lr=learning_rate)

In [0]:
# Per-epoch history filled in by the training loop.
epoch_train_losses, epoch_valid_losses, epoch_valid_scores = [], [], []

In [0]:
# The guard now wraps the whole loop. Before, only the train/validation calls
# were guarded, so with a false guard the bookkeeping below them hit a NameError
# on `train_loss`.
if __name__ == '__main__':
    # torch.save does not create missing directories.
    os.makedirs(save_model_path, exist_ok=True)

    for epoch in range(epochs):
        # One pass over the training data, then evaluate on the validation split.
        train_loss = train(log_interval, [cnn_encoder, rnn_decoder], device, train_loader, optimizer, epoch)
        valid_loss, valid_scores = validation([cnn_encoder, rnn_decoder], device, optimizer, valid_loader)

        # Record the history of this run.
        epoch_train_losses.append(train_loss)
        epoch_valid_losses.append(valid_loss)
        epoch_valid_scores.append(valid_scores)

        # Checkpoint whenever this epoch has the lowest validation loss of the
        # current run (the comparison only covers epochs recorded in
        # `epoch_valid_losses`, not checkpoints written by earlier runs).
        if valid_loss == min(epoch_valid_losses):
            torch.save(cnn_encoder.state_dict(), os.path.join(save_model_path, 'cnn_encoder_best.pth'))  # CNN encoder weights
            torch.save(rnn_decoder.state_dict(), os.path.join(save_model_path, 'rnn_decoder_best.pth'))  # RNN decoder weights
            torch.save(optimizer.state_dict(), os.path.join(save_model_path, 'optimizer_epoch.pth'))      # optimizer state
            print("Epoch {} model saved!".format(epoch + 1))

In [0]:
# Loaders used for prediction; both iterate in dataset order (params2 does not shuffle).
test_set = Dataset_CRNN(
    data_path=test_data_path,
    folders=test_list,
    labels=test_label,
    frames=selected_frames,
    transform=transform,
)
test_loader = data.DataLoader(dataset=test_set, **params2)

# The training split again, this time without shuffling.
train_pred_loader = data.DataLoader(dataset=train_set, **params2)

In [0]:
# Restore the best checkpoint before predicting.
# map_location lets a checkpoint written on one device be restored on whatever
# `device` is in use now, including CPU-only machines.
cnn_encoder.load_state_dict(
    torch.load(os.path.join(save_model_path, 'cnn_encoder_best.pth'), map_location=device))
rnn_decoder.load_state_dict(
    torch.load(os.path.join(save_model_path, 'rnn_decoder_best.pth'), map_location=device))

In [0]:
# Predict on the test loader; the result is a list holding one output tensor per batch.
all_y_pred = CRNN_final_prediction([cnn_encoder, rnn_decoder], device, test_loader)

In [0]:
# Join the per-batch prediction tensors into one (n_samples, n_outputs) array.
# This replaces a manual copy into a fixed (2000, 6) buffer, which
#   * broke (or left all-zero rows) whenever the test set was not exactly 2000 samples,
#   * reused `k` as a row counter, overwriting the `k = 6` configuration value,
#   * called .cpu() on the whole batch once per row.
# float64 matches the dtype of the buffer that was used before.
test_preds = torch.cat(all_y_pred, dim=0).detach().cpu().numpy().astype(np.float64)

In [0]:
# Show the predictions collected for the test set.
print(test_preds)

In [0]:
# Show the test labels loaded from the CSV, for comparison with the predictions.
print(test_label)

# Eval Metrics IMAGE

In [0]:
# Per-target score on the test set: 1 - mean absolute error, one line per column.
for i in range(6):
    column_mae = mean_absolute_error(test_label[:, i], test_preds[:, i])
    print(1 - column_mae)