In [16]:
import os
import random
import functools

import datasets as DSS

import pandas as pd
import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from torchvision.transforms.functional import InterpolationMode
from torchinfo import summary

import transformers

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

In [None]:
seed = 2021

torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True

In [58]:
df = pd.read_csv('data_multimodal.csv')

df = df.loc[(df['Text'].notnull())].reset_index(drop=True).copy()

df['Label'] = df['Sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

df_text = df.loc[
    (df['Type'] == 'test'), 
    ['Filename', 'Type', 'Text', 'Sentiment', 'Label']
].drop_duplicates().reset_index(drop=True).copy()


path_spectograms = os.path.join(os.getcwd(), 'spectograms')

df_spect = df.loc[:, ['Sentiment', 'Score', 'Type', 'Spectogram']].drop_duplicates().reset_index(drop=True).copy()
df_spect['PathSpectogram'] = df_spect['Spectogram'].apply(lambda x: os.path.join(path_spectograms, x))


path_faces = os.path.join(os.getcwd(), 'extracted_faces')

df_faces = df.loc[
    (df['Face'].notnull()), 
    ['Filename', 'Face', 'Sentiment', 'Type', 'Score']
].reset_index(drop=True).copy()

df_faces['Face'] = df_faces['Face'].apply(lambda x: os.path.join(path_faces, x))

In [59]:
df.head()

Unnamed: 0,Filename,FrameName,Fps,Count,Size,Face,Spectogram,Start,End,Score,Sentiment,Type,Text,Nwords,Label
0,03bSnISJMiM_1,03bSnISJMiM_1_frame_1.jpg,30.0,1,224x224,03bSnISJMiM_1_face_1.jpg,03bSnISJMiM_1_spect.jpg,51.904533,55.945351,2.4,positive,train,anyhow it was really good,5,1
1,03bSnISJMiM_1,03bSnISJMiM_1_frame_6.jpg,30.0,6,224x224,03bSnISJMiM_1_face_6.jpg,03bSnISJMiM_1_spect.jpg,51.904533,55.945351,2.4,positive,train,anyhow it was really good,5,1
2,03bSnISJMiM_1,03bSnISJMiM_1_frame_11.jpg,30.0,11,224x224,03bSnISJMiM_1_face_11.jpg,03bSnISJMiM_1_spect.jpg,51.904533,55.945351,2.4,positive,train,anyhow it was really good,5,1
3,03bSnISJMiM_1,03bSnISJMiM_1_frame_16.jpg,30.0,16,224x224,03bSnISJMiM_1_face_16.jpg,03bSnISJMiM_1_spect.jpg,51.904533,55.945351,2.4,positive,train,anyhow it was really good,5,1
4,03bSnISJMiM_1,03bSnISJMiM_1_frame_21.jpg,30.0,21,224x224,03bSnISJMiM_1_face_21.jpg,03bSnISJMiM_1_spect.jpg,51.904533,55.945351,2.4,positive,train,anyhow it was really good,5,1


In [60]:
def collate_fn(batch, pad_index):
    
    batch_ids = [i['ids'] for i in batch]
    batch_ids = nn.utils.rnn.pad_sequence(batch_ids, padding_value=pad_index, batch_first=True)
    batch_label = [i['label'] for i in batch]
    batch_label = torch.stack(batch_label)
    batch = {
        'ids': batch_ids,
        'label': batch_label
    }
    return batch


def evaluate(dataloader, model, criterion, device):
    
    model.eval()
    epoch_losses = []
    epoch_accs = []
    
    with torch.no_grad():
        
        for batch in tqdm(dataloader):
            
            ids = batch['ids'].to(device)
            label = batch['label'].to(device)
            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction ,label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
        
    return np.mean(epoch_losses), np.mean(epoch_accs)


def get_accuracy(prediction, label):
    
    batch_size, _ = prediction.shape
    predicted_classes = torch.sigmoid(prediction).argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    
    return accuracy

'''
Transformer
'''
class TextDatasetTransformer(torch.utils.data.Dataset):
    
    def __init__(self, df, tokenizer):
        
        self.ids = list(map(
            lambda x: tokenizer(x, truncation=True)['input_ids'], 
            df['Text'].tolist()
        ))
        self.labels = df['Label'].tolist()
        
    def __len__(self):
        
        return len(self.ids)
    
    def __getitem__(self, idx):
        
        ids = torch.tensor(self.ids[idx])
        
        label = torch.tensor(self.labels[idx])
        
        sample = {'ids': ids, 'label': label}
        
        return sample
    

class BERTLSTM(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        
        super().__init__()
        
        self.bert = bert
        embedding_dim = bert.config.hidden_size
        self.rnn = nn.LSTM(
            embedding_dim, 
            hidden_dim, 
            num_layers=n_layers,
            bidirectional=bidirectional, 
            batch_first=True, 
            dropout = 0 if n_layers < 2 else dropout
        )
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
        for param in self.bert.parameters():
            param.requires_grad = False
        
    def forward(self, text):
        
        with torch.no_grad():
            embedded = self.bert(text)[0]
    
        _, (hidden, cell) = self.rnn(embedded)
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        
        output = self.out(hidden)
        
        return output
    

def get_transformer_model(path_saved_models, batch_size, df_text):
    
    model_name = 'BERTLSTM.pt'
    path_transformer = os.path.join(path_saved_models, model_name)

    transformer_name = 'bert-base-uncased'

    transformer_tokenizer = transformers.AutoTokenizer.from_pretrained(transformer_name)

    pad_index = transformer_tokenizer.pad_token_id

    transformer_dataset = TextDatasetTransformer(df_text, transformer_tokenizer)

    collate = functools.partial(collate_fn, pad_index=pad_index)

    transformer_dataloader = DataLoader(
        transformer_dataset,
        batch_size=batch_size, 
        collate_fn=collate, 
        shuffle=False
    )
    transformer = transformers.AutoModel.from_pretrained(transformer_name)

    hidden_dim = 256
    n_layers = 2
    bidirectional = True
    dropout = 0.5

    bert_lstm = BERTLSTM(transformer, hidden_dim, output_dim, n_layers, bidirectional, dropout)
    bert_lstm.load_state_dict(torch.load(path_transformer))
    
    return bert_lstm, transformer_dataloader

'''
FastText
'''
def generate_bigrams(x):
    
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    
    return x


def tokenize_data(example, tokenizer, generate_ngrams):
    
    tokens = generate_ngrams(tokenizer(example['text']))
    length = len(tokens)
    
    return {'tokens': tokens, 'length': length}


# Bag of Tricks: https://arxiv.org/pdf/1607.01759.pdf
    
class FastText(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, output_dimm, pad_index):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, text, *args):
        
        embedded = self.embedding(text)
        pooled = torch.mean(embedded, axis=1).squeeze(1)
        
        return self.fc(pooled)
    
    
class FastTextDataset(torch.utils.data.Dataset):
    
    def __init__(self, df, tokenizer, max_length, vocab, preprocess=None):
        # preprocess is a function that can be called after tokenization! 
         # (mainly used to generate bi-grams)
        
        self.texts = list(map(
            lambda x: preprocess(tokenizer(x)) if preprocess else tokenizer(x), 
            df['Text'].tolist()
        ))
        self.numericalized = [list(map(lambda x: vocab[x], i)) for i in self.texts]
        self.labels = df['Label'].tolist()
        self.max_length = max_length
        
    def __len__(self):
        
        return len(self.texts)
    
    def __getitem__(self, idx):
        
        n_ids = len(self.numericalized[idx])
        
        if n_ids > self.max_length:
            start_index = random.choice(range(n_ids - self.max_length))
            ids = torch.tensor(self.numericalized[idx][start_index: start_index + self.max_length])
        else:
            ids = torch.tensor(self.numericalized[idx])
        
        label = torch.tensor(self.labels[idx])
        
        sample = {'ids': ids, 'label': label}
        
        return sample

    
def get_fast_text_model(path_saved_models, batch_size, df_text):
    
    model_name = 'FastText_IMDB.pt'
    vocab_name = 'FastTextVocab.pth'

    path_fast_text = os.path.join(path_saved_models, model_name)
    path_vocab = os.path.join(path_saved_models, vocab_name)

    vocab = torch.load(path_vocab)

    unk_index = vocab['<unk>']
    pad_index = vocab['<pad>']

    vocab.set_default_index(unk_index)
    
    collate = functools.partial(collate_fn, pad_index=pad_index)

    max_length = 30
    tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

    fast_text_dataset = FastTextDataset(
        df_text, 
        tokenizer,
        max_length,
        vocab,
        generate_bigrams
    )
    fast_text_dataloader = DataLoader(
        fast_text_dataset,
        batch_size=batch_size, 
        collate_fn=collate, 
        shuffle=False
    )
    pad_index = vocab['<pad>']
    vocab_size = len(vocab)
    embedding_dim = 300

    fast_text_model = FastText(vocab_size, embedding_dim, output_dim, pad_index)
    fast_text_model.load_state_dict(torch.load(path_fast_text))

    return fast_text_model, fast_text_dataloader


'''
Spectogram CNN
'''
class SpectogramDataset(Dataset):
    
    def __init__(self, df, transforms):
        
        self.path_spectograms = df['PathSpectogram'].tolist()
        self.labels = list(map(lambda x: 1 if x == 'positive' else 0, df['Sentiment'].tolist()))
        self.transforms = transforms
        
    def __len__(self):
        return len(self.path_spectograms)
    
    def __getitem__(self, idx):
        
        image = Image.open(self.path_spectograms[idx])
        label = self.labels[idx]
        image = self.transforms(image)
            
        return image, label
    

def get_cnn_spectogram(path_saved_models, batch_size, df_spect):
    
    model_name = 'CNNSpectogram.pt'
    path_cnn_spect = os.path.join(path_saved_models, model_name)
    
    num_classes = df_spect['Sentiment'].nunique()
    
    tensor_transforms = transforms.Compose([
    transforms.ToTensor()
    ])

    channel_means = []
    channel_stds = []

    for pf in tqdm(df_spect.loc[(df_spect['Type'] == 'train'), 'PathSpectogram'].tolist()):

        img = tensor_transforms(Image.open(pf))

        channel_means.append(img.mean(axis=(1, 2)).numpy())
        channel_stds.append(img.std(axis=(1, 2)).numpy())

    channel_means = np.stack(channel_means).mean(axis=0).tolist()
    channel_stds = np.stack(channel_stds).mean(axis=0).tolist()
    
    size = (224, 224)
    
    test_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_means, std=channel_stds),
        transforms.Resize(size, interpolation=InterpolationMode.NEAREST)
    ])
    
    spect_dataset = SpectogramDataset(
        df_spect.loc[(df_spect['Type'] == 'test')].reset_index(drop=True).copy(),
        test_transforms
    )
    spect_dataloader = DataLoader(
        spect_dataset,
        batch_size=batch_size,
        shuffle=False
    )
    cnn_spect = models.resnet18(pretrained=False)

    num_ftrs = cnn_spect.fc.in_features
    cnn_spect.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(num_ftrs, num_classes),
    )
    cnn_spect.load_state_dict(torch.load(path_cnn_spect))
    
    return cnn_spect, spect_dataloader
    

'''
CNN Faces - extracted faces
'''
class FaceDataset(Dataset):
    
    def __init__(self, df, transforms):
        
        self.path_faces = df['Face'].tolist()
        self.labels = list(map(lambda x: 1 if x == 'positive' else 0, df['Sentiment'].tolist()))
        self.transforms = transforms
        
    def __len__(self):
        return len(self.path_faces)
    
    def __getitem__(self, idx):
        
        image = Image.open(self.path_faces[idx])
        label = self.labels[idx]
        image = self.transforms(image)
            
        return image, label
    

def get_cnn_faces(path_saved_models, batch_size, df_faces):
    
    model_name = 'CNNFaces.pt'
    path_cnn_faces = os.path.join(path_saved_models, model_name)
    
    num_classes = df_faces['Sentiment'].nunique()
    
    tensor_transforms = transforms.Compose([
    transforms.ToTensor()
    ])
    channel_means = []
    channel_stds = []

    for pf in tqdm(df_faces.loc[(df_faces['Type'] == 'train'), 'Face'].tolist()):

        img = tensor_transforms(Image.open(pf))

        channel_means.append(img.mean(axis=(1, 2)).numpy())
        channel_stds.append(img.std(axis=(1, 2)).numpy())

    channel_means = np.stack(channel_means).mean(axis=0).tolist()
    channel_stds = np.stack(channel_stds).mean(axis=0).tolist()
    
    size = (200, 70)
    
    test_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_means, std=channel_stds),
        transforms.Resize(size, interpolation=InterpolationMode.NEAREST)
    ])
    faces_dataset = FaceDataset(
        df_faces.loc[(df_faces['Type'] == 'test')].reset_index(drop=True).copy(),
        test_transforms
    )
    faces_dataloader = DataLoader(
        faces_dataset,
        batch_size=batch_size,
        shuffle=False
    )
    
    cnn_faces = models.resnet18(pretrained=True)

    num_ftrs = cnn_faces.fc.in_features
    cnn_faces.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(num_ftrs, num_classes)
    )
    cnn_faces.load_state_dict(torch.load(path_cnn_faces))

    return cnn_faces, faces_dataloader

In [4]:
path_saved_models = os.path.join(os.getcwd(), 'saved_models')

output_dim = df['Label'].nunique()

batch_size = 32

criterion = nn.CrossEntropyLoss()

device = torch.device('cuda')

In [5]:
bert_lstm, transformer_dataloader = get_transformer_model(path_saved_models, batch_size, df_text)

fast_text_model, fast_text_dataloader = get_fast_text_model(path_saved_models, batch_size, df_text)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
bert_lstm.to(device)
test_loss, test_acc = evaluate(transformer_dataloader, bert_lstm, criterion, device)
print(f'Transformer test_loss: {test_loss:.3f}, test_acc: {test_acc:.3f}')

fast_text_model.to(device)
test_loss, test_acc = evaluate(fast_text_dataloader, fast_text_model, criterion, device)
print(f'FastText test_loss: {test_loss:.3f}, test_acc: {test_acc:.3f}')

  0%|          | 0/22 [00:00<?, ?it/s]

Transformer test_loss: 0.620, test_acc: 0.681


  0%|          | 0/22 [00:00<?, ?it/s]

FastText test_loss: 3.544, test_acc: 0.690


In [24]:
def get_predictions_text(dataloader, model, device):
    
    predictions = []
    model.to(device)
    
    with torch.no_grad():
        
        for batch in tqdm(dataloader):
            
            ids = batch['ids'].to(device)
            preds = torch.sigmoid(model(ids)).cpu()
            predictions.append(preds)
            
    return torch.cat(predictions)

In [8]:
trf_preds = get_predictions_text(transformer_dataloader, bert_lstm, device)
ft_preds = get_predictions_text(fast_text_dataloader, fast_text_model, device)

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

In [10]:
# Average predictions
labels = torch.tensor(transformer_dataloader.dataset.labels)

avg_preds = torch.stack([trf_preds, ft_preds]).mean(dim=0).argmax(dim=1)

print(f'Averaged predictions accuracy: {avg_preds.eq(labels).sum() / len(labels):.4f}')

Averaged predictions accuracy: 0.7124


## The averaged predictions yields 1.6% more accuracy

In [22]:
cnn_spect, spect_dataloader = get_cnn_spectogram(path_saved_models, batch_size, df_spect)

  0%|          | 0/1284 [00:00<?, ?it/s]

In [28]:
def get_predictions_image(dataloader, model, device):
    
    predictions = []
    model.to(device)
    
    with torch.no_grad():
        
        for images, labels in tqdm(dataloader):
            
            images = images.to(device)
            preds = torch.sigmoid(model(images)).cpu()
            predictions.append(preds)
            
    return torch.cat(predictions)

In [41]:
assert labels.eq(torch.tensor(spect_dataloader.dataset.labels)).sum().item() == len(labels)

cnn_spect_preds = get_predictions_image(spect_dataloader, cnn_spect, device)

cnn_spect_accuracy = cnn_spect_preds.argmax(dim=1).eq(labels).sum() / len(labels)

print(f'CNN Spectogram test_acc: {cnn_spect_accuracy:.3f}')

  0%|          | 0/22 [00:00<?, ?it/s]

CNN Spectogram test_acc: 0.493


In [42]:
avg_preds = torch.stack([trf_preds, ft_preds, cnn_spect_preds]).mean(dim=0).argmax(dim=1)

print(f'Averaged predictions accuracy: {avg_preds.eq(labels).sum() / len(labels):.4f}')

Averaged predictions accuracy: 0.7139


## Although CNN Spectogram performance is worse, by averaging its predictions with
## the text models the overall accuracy slightly increases. Lets try majority voting.

In [43]:
torch.stack([trf_preds, ft_preds, cnn_spect_preds]).shape

torch.Size([3, 685, 2])

In [52]:
majority_preds, _ = torch.stack([trf_preds, ft_preds, cnn_spect_preds]).argmax(dim=2).mode(dim=0)

print(f'Majority voting accuracy: {majority_preds.eq(labels).sum() / len(labels):.4f}')

Majority voting accuracy: 0.6803


## In this case the accuracy decreases

In [61]:
cnn_faces, faces_dataloader = get_cnn_faces(path_saved_models, batch_size, df_faces)

  0%|          | 0/31146 [00:00<?, ?it/s]

In [62]:
cnn_faces_preds = get_predictions_image(faces_dataloader, cnn_faces, device)

  0%|          | 0/643 [00:00<?, ?it/s]

In [111]:
df_res = df_faces.loc[(df_faces['Type'] == 'test'), ['Filename', 'Face']].reset_index(drop=True).copy()
df_res['preds'] = cnn_faces_preds.argmax(dim=1).numpy().tolist()

cnn_faces_preds_mean = torch.round(torch.tensor(df_res.groupby('Filename')['preds'].mean().tolist()))

print(f'CNN Faces test_acc: {cnn_faces_preds_mean.int().eq(labels).sum() / len(labels):.3f}')

CNN Faces test_acc: 0.505


In [112]:
torch.stack([
    trf_preds.argmax(dim=1).unsqueeze(0), 
    ft_preds.argmax(dim=1).unsqueeze(0), 
    cnn_spect_preds.argmax(dim=1).unsqueeze(0)
])

tensor([[[1, 0, 1,  ..., 1, 0, 0]],

        [[0, 0, 0,  ..., 0, 1, 0]],

        [[1, 1, 1,  ..., 1, 1, 1]]])

In [117]:
all_averaged = torch.stack([
    trf_preds.argmax(dim=1), 
    ft_preds.argmax(dim=1), 
    cnn_spect_preds.argmax(dim=1),
    cnn_faces_preds_mean
]).mean(axis=0)

print(f'Averaged predictions accuracy: {all_averaged.int().eq(labels).sum() / len(labels):.4f}')

Averaged predictions accuracy: 0.6467


In [121]:
all_mode, _= torch.stack([
    trf_preds.argmax(dim=1), 
    ft_preds.argmax(dim=1), 
    cnn_spect_preds.argmax(dim=1),
    cnn_faces_preds_mean
]).mode(axis=0)

print(f'Majority voting accuracy: {all_mode.int().eq(labels).sum() / len(labels):.4f}')

Majority voting accuracy: 0.6788


## The vision based methods could not increase the overall performance. NLP based models dominated the performance.