# Multi Modal Model Training - Independent Labels

In [None]:
import pandas as pd
import numpy as np
import json
import os
import imageio
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.preprocessing import MultiLabelBinarizer
import time
import datetime
import re
import emoji

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
import transformers
from transformers import BertTokenizer, BertModel, BertConfig, AdamW
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
from transformers import pipeline

In [None]:
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torchvision import models
from torch.utils.data import DataLoader, SubsetRandomSampler, RandomSampler, SequentialSampler, ConcatDataset
from sklearn.model_selection import KFold
import skimage

## Setup

In [None]:
classes = ['Smears', 'Loaded Language', 'Name calling/Labeling', 'Glittering generalities (Virtue)',
               'Appeal to (Strong) Emotions', 'Appeal to fear/prejudice', 'Transfer', 'Doubt',
               'Exaggeration/Minimisation', 'Whataboutism', 'Slogans', 'Flag-waving',
               "Misrepresentation of Someone's Position (Straw Man)", 'Causal Oversimplification',
               'Thought-terminating cliché', 'Black-and-white Fallacy/Dictatorship', 'Appeal to authority',
               'Reductio ad hitlerum', 'Repetition', 'Obfuscation, Intentional vagueness, Confusion',
               'Presenting Irrelevant Data (Red Herring)', 'Bandwagon']

### Create Class Binarizer
one_hot = MultiLabelBinarizer()
one_hot.fit([classes])

In [None]:
one_hot.classes_

In [None]:
from CustomLoader import MultiModalLoader

In [None]:
training_data = MultiModalLoader(json_file = 'MultiModal_training_data.json', root_dir = 'Images',
                           transform = transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Resize(size = (224,224)),
                               transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5)) ### Pixel range [-1,1]
                           ]))

testing_data = MultiModalLoader(json_file = 'MultiModal_testing_data.json', root_dir = 'Images',
                           transform = transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Resize(size = (224,224)),
                               transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5)) ### Pixel range [-1,1]
                           ]))

In [None]:
train_loader = DataLoader(dataset = training_data, batch_size = 25, shuffle = True)
test_loader = DataLoader(dataset = testing_data, batch_size = 25, shuffle = True)

# 
# 
# 

## Initialize Tokenizer

In [None]:
do_lower_case = True
model_type = 'distilbert'
model_version = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

# 
# 

## Define DistilBert Class

In [None]:
class DistilBertClass(nn.Module): ### INCLUDE TOKENIZER IN CLASS
    def __init__(self):
        super(DistilBertClass, self).__init__()
        
        ### Import DistilBert Model
        distilbert = transformers.DistilBertModel.from_pretrained("distilbert-base-uncased")
        
        ### Freeze parameters
        for param in distilbert.parameters():
            param.requires_grad = False
            
        ### DistilBert encoder
        self.distil = distilbert
                        
            
        self.pre_classifier = nn.Linear(770, 3072) #768
        
        self.fc2 = nn.Linear(3072, 1024)    
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 128)
        
        self.classifier = nn.Linear(128, 22)
        
        self.dropout = nn.Dropout(0.01) #.3
        
        
    def forward(self, input_ids, attention_mask, hate, sent):

        ### Run DistilBert
        distil_output = self.distil(input_ids = input_ids, attention_mask = attention_mask)

        ### Grab hidden state
        hidden_state = distil_output[0]
        x1 = hidden_state[:, 0]
        

        # prepare hidden state to append additional features (hate and sentiment; hotencoded)
        x1 = x1.view(x1.size(0), -1)
        
        ### Bring in hate and sentiment
        x2 = hate
        x3 = sent

        ### Concatenate hidden state, hate, and sentiment
        x = torch.cat((x1, x2, x3), dim=1)

        ### Run through linear layers
        fc_output = x
        fc_output = self.pre_classifier(fc_output)

        fc_output = nn.GELU()(fc_output)
        fc_output = self.dropout(fc_output)

        fc_output = self.fc2(fc_output)
        fc_output = nn.GELU()(fc_output)
        fc_output = self.dropout(fc_output)

        #GELU

        fc_output = self.fc3(fc_output)
        fc_output = nn.GELU()(fc_output)
        fc_output = self.dropout(fc_output)


        fc_output = self.fc4(fc_output)
        fc_output = nn.Tanh()(fc_output)  # using gelu except for the last one to allow for classification
        fc_output = self.dropout(fc_output)

        output = self.classifier(fc_output)


        return output

# 
# 
# 

## Define CNN_Distilbert Architectures

### ResNet18

In [None]:
class ResNet18_DB(nn.Module):
    def __init__(self):
        super(ResNet18_DB, self).__init__()
        
        ### Import model
        resnet18 = models.resnet18(pretrained = True)
        
        ### Freeze parameters
        for param in resnet18.parameters():
            param.requires_grad = False
        
        ### Change last layer to match output
        resnet18.fc = nn.Sequential(
            nn.Linear(512, 22),
            nn.Sigmoid()
        )
        
        ### CNN Encoder
        self.cnn = resnet18
        
        ### Classifier
        self.classifier = nn.Linear(22,22)
        
        
        
    def forward(self, images, distil_output):
        
        ### Pass images through cnn
        cnn_output = self.cnn(images)
        
        ### Compute average between two outputs
        x = cnn_output + distil_output / 2
        
        ### Pass through final linear layer and sigmoid
        
        x = F.sigmoid(self.classifier(x))
    
        return x

### ResNet50

In [None]:
class ResNet50_DB(nn.Module):
    def __init__(self):
        super(ResNet50_DB, self).__init__()
        
        ### Import model
        resnet50 = models.resnet50(pretrained = True)
        
        ### Freeze parameters
        for param in resnet50.parameters():
            param.requires_grad = False
        
        ### Change last layer to match output
        resnet50.fc = nn.Sequential(
            nn.Linear(2048, 22),
            nn.Sigmoid()
        )
        
        ### CNN Encoder
        self.cnn = resnet50
        
        ### Classifier
        self.classifier = nn.Linear(22,22)
        
        
        
    def forward(self, images, distil_output):
        
        ### Pass images through cnn
        cnn_output = self.cnn(images)
        
        ### Compute average between two outputs
        x = cnn_output + distil_output / 2
        
        ### Pass through final linear layer and sigmoid
        
        x = F.sigmoid(self.classifier(x))
    
        return x

### ResNet101

In [None]:
class ResNet101_DB(nn.Module):
    def __init__(self):
        super(ResNet101_DB, self).__init__()
        
        ### Import model
        resnet101 = models.resnet101(pretrained = True)
        
        ### Freeze parameters
        for param in resnet101.parameters():
            param.requires_grad = False
        
        ### Change last layer to match output
        resnet101.fc = nn.Sequential(
            nn.Linear(2048, 22),
            nn.Sigmoid()
        )
        
        ### CNN Encoder
        self.cnn = resnet101
        
        ### Classifier
        self.classifier = nn.Linear(22,22)
        
        
        
    def forward(self, images, distil_output):
        
        ### Pass images through cnn
        cnn_output = self.cnn(images)
        
        ### Compute average between two outputs
        x = cnn_output + distil_output / 2
        
        ### Pass through final linear layer and sigmoid
        
        x = F.sigmoid(self.classifier(x))
    
        return x

### DenseNet121

In [None]:
class DenseNet121_DB(nn.Module):
    def __init__(self):
        super(DenseNet121_DB, self).__init__()
        
        ### Import model
        densenet121 = models.densenet121(pretrained = True)
        
        ### Freeze parameters
        for param in densenet121.parameters():
            param.requires_grad = False
        
        ### Change last layer to match output
        densenet121.classifier = nn.Sequential(
            nn.Linear(1024, 22),
            nn.Sigmoid()
        )
        
        ### CNN Encoder
        self.cnn = densenet121
        
        ### Classifier
        self.classifier = nn.Linear(22,22)
        
        
        
    def forward(self, images, distil_output):
        
        ### Pass images through cnn
        cnn_output = self.cnn(images)
        
        ### Compute average between two outputs
        x = cnn_output + distil_output / 2
        
        ### Pass through final linear layer and sigmoid
        
        x = F.sigmoid(self.classifier(x))
    
        return x

### DenseNet169

In [None]:
class DenseNet169_DB(nn.Module):
    def __init__(self):
        super(DenseNet169_DB, self).__init__()
        
        ### Import model
        densenet169 = models.densenet169(pretrained = True)
        
        ### Freeze parameters
        for param in densenet169.parameters():
            param.requires_grad = False
        
        ### Change last layer to match output
        densenet169.classifier = nn.Sequential(
            nn.Linear(1664, 22),
            nn.Sigmoid()
        )
        
        ### CNN Encoder
        self.cnn = densenet169
        
        ### Classifier
        self.classifier = nn.Linear(22,22)
        
        
        
    def forward(self, images, distil_output):
        
        ### Pass images through cnn
        cnn_output = self.cnn(images)
        
        ### Compute average between two outputs
        x = cnn_output + distil_output / 2
        
        ### Pass through final linear layer and sigmoid
        
        x = F.sigmoid(self.classifier(x))
    
        return x

### DenseNet201

In [None]:
class DenseNet201_DB(nn.Module):
    def __init__(self):
        super(DenseNet201_DB, self).__init__()
        
        ### Import model
        densenet201 = models.densenet201(pretrained = True)
        
        ### Freeze parameters
        for param in densenet201.parameters():
            param.requires_grad = False
        
        ### Change last layer to match output
        densenet201.classifier = nn.Sequential(
            nn.Linear(1920, 22),
            nn.Sigmoid()
        )
        
        ### CNN Encoder
        self.cnn = densenet201
        
        ### Classifier
        self.classifier = nn.Linear(22,22)
        
        
        
    def forward(self, images, distil_output):
        
        ### Pass images through cnn
        cnn_output = self.cnn(images)
        
        ### Compute average between two outputs
        x = cnn_output + distil_output / 2
        
        ### Pass through final linear layer and sigmoid
        
        x = F.sigmoid(self.classifier(x))
    
        return x

### VGG11_BN

In [None]:
class VGG11_BN_DB(nn.Module):
    def __init__(self):
        super(VGG11_BN_DB, self).__init__()
        
        ### Import model
        vgg11bn = models.vgg11_bn(pretrained = True)
        
        ### Freeze parameters
        for param in vgg11bn.parameters():
            param.requires_grad = False
        
        ### Change last layer to match output
        vgg11bn.classifier[6] = nn.Sequential(
            nn.Linear(4096, 22),
            nn.Sigmoid()
        )
        
        ### CNN Encoder
        self.cnn = vgg11bn
        
        ### Classifier
        self.classifier = nn.Linear(22,22)
        
        
        
    def forward(self, images, distil_output):
        
        ### Pass images through cnn
        cnn_output = self.cnn(images)
        
        ### Compute average between two outputs
        x = cnn_output + distil_output / 2
        
        ### Pass through final linear layer and sigmoid
        
        x = F.sigmoid(self.classifier(x))
    
        return x

### VGG16_BN

In [None]:
class VGG16_BN_DB(nn.Module):
    def __init__(self):
        super(VGG16_BN_DB, self).__init__()
        
        ### Import model
        vgg16bn = models.vgg16_bn(pretrained = True)
        
        ### Freeze parameters
        for param in vgg16bn.parameters():
            param.requires_grad = False
        
        ### Change last layer to match output
        vgg16bn.classifier[6] = nn.Sequential(
            nn.Linear(4096, 22),
            nn.Sigmoid()
        )
        
        ### CNN Encoder
        self.cnn = vgg16bn
        
        ### Classifier
        self.classifier = nn.Linear(22,22)
        
        
        
    def forward(self, images, distil_output):
        
        ### Pass images through cnn
        cnn_output = self.cnn(images)
        
        ### Compute average between two outputs
        x = cnn_output + distil_output / 2
        
        ### Pass through final linear layer and sigmoid
        
        x = F.sigmoid(self.classifier(x))
    
        return x

### VGG19_BN

In [None]:
class VGG19_BN_DB(nn.Module):
    def __init__(self):
        super(VGG19_BN_DB, self).__init__()
        
        ### Import model
        vgg19bn = models.vgg19_bn(pretrained = True)
        
        ### Freeze parameters
        for param in vgg19bn.parameters():
            param.requires_grad = False
        
        ### Change last layer to match output
        vgg19bn.classifier[6] = nn.Sequential(
            nn.Linear(4096, 22),
            nn.Sigmoid()
        )
        
        ### CNN Encoder
        self.cnn = vgg19bn
        
        ### Classifier
        self.classifier = nn.Linear(22,22)
        
        
        
    def forward(self, images, distil_output):
        
        ### Pass images through cnn
        cnn_output = self.cnn(images)
        
        ### Compute average between two outputs
        x = cnn_output + distil_output / 2
        
        ### Pass through final linear layer and sigmoid
        
        x = F.sigmoid(self.classifier(x))
    
        return x

# 
# 
# 

## Define Training Methodology

In [None]:
def get_lr(optim):
    for param_group in optim.param_groups:
        return param_group['lr']

In [None]:
### This function is used to train a cnn model
def Kfold_train_CNN_DB(CNN = None, DistilBert = None, training_data = None, learning_rate = None, k_folds = None, n_epochs = None, model_name = None):
    
    ### Check that all entries are valid
    if ((CNN == None) or (training_data == None) or (model_name == None) or 
        (learning_rate == None) or (k_folds == None) or (n_epochs == None) or (DistilBert == None)):
        print ('Enter all info.')
        
        
        
    ### Run K-Fold CV
    else:
        
        device = 'cpu'

        ### Set Loss Function and Optimizer
        criterion = nn.BCELoss()
        
        
        #### Define the K-fold Cross Validator
        kfold = KFold(n_splits=k_folds, shuffle=True)
        
        
        
        ### Create values to hold the best model metrics across folds
        val_f1_mic_max = 0 ### This determines best model
        
        best_train_loss = 0
        best_train_acc = 0
        best_train_f1_mic = 0
        best_train_f1_mac = 0
        best_train_prec_mic = 0
        best_train_prec_mac = 0
        best_train_rec_mic = 0
        best_train_rec_mac = 0

        best_val_acc = 0
        best_val_loss = 0
#         best_val_f1_mic = 0
        best_val_f1_mac = 0
        best_val_prec_mic = 0
        best_val_prec_mac = 0
        best_val_rec_mic = 0
        best_val_rec_mac = 0
        
        best_fold = 0
        best_epoch = 0
        

        
        ### Start print

        start = time.time()
        
        ### K-fold Cross Validation model evaluation
        for fold, (train_ids, val_ids) in enumerate(kfold.split(training_data)):
            
            print('-------------------------------------------')
            print('FOLD {}'.format(fold + 1))
            print('-------------------------------------------')
            
            ### Sample elements randomly from a given list of ids, no replacement
            train_subsampler = SubsetRandomSampler(train_ids)
            val_subsampler = SubsetRandomSampler(val_ids)
            
            ### Define data loaders for training and validation in current fold
            train_loader = DataLoader(dataset = training_data, batch_size = 25, sampler = train_subsampler)
            val_loader = DataLoader(dataset = training_data, batch_size = 25, sampler = val_subsampler)
            
            ### Initialize network
            network1 = DistilBert
            network2 = CNN
            if torch.cuda.is_available():
                network1.cuda()
                network1 = nn.DataParallel(network1, list(range(2)))
                
                network2.cuda()
                network2 = nn.DataParallel(network2, list(range(2)))
                
                device = 'cuda'
            
            ### Initialize optimizer
            optimizer1 = transformers.Adafactor(
                network1.parameters(),
                lr=None,  # when using warm up and relative step, LR is auto determined
                eps=(1e-30, 1e-3),
                clip_threshold=1.0,
                decay_rate=-0.8, #0.8
                beta1=None, # <- used for L1 regularization
                weight_decay=0.000002, # L2 regularization, to prevent overfitting  (beta2)
                relative_step=True,
                scale_parameter=True, # https://github.com/pytorch/pytorch/issues/25081 this setting keeps the gradients from reaching 0 (using the clip threshold) (if this is enabled, must modify in training)
                warmup_init=True
            )
                        
            optimizer2 = optim.Adam(network2.parameters(), lr=learning_rate)
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer2, mode = 'min', factor = 0.1, patience = 5)
            
            
            ### Create lists of values at the end of each epoch for each fold
            train_loss = []
            train_acc = []
            train_f1_mic = []
            train_f1_mac = []
            train_prec_mic = []
            train_prec_mac = []
            train_rec_mic = []
            train_rec_mac = []
            
            val_loss = []
            val_acc = []
            val_f1_mic = []
            val_f1_mac = []
            val_prec_mic = []
            val_prec_mac = []
            val_rec_mic = []
            val_rec_mac = []
            
            
            ### Train network
            for epoch in range(n_epochs):
                
                ### Hold training predictions and targets
                train_output = np.empty((0,22), int)
                train_all_targets = np.empty((0,22), int)
                
                val_output = np.empty((0,22), int)
                val_all_targets = np.empty((0,22), int)
                
                
                ### Train ###
                network1.train()
                network2.train()
                
                train_running_loss = 0.0
                
                batch_number = 0
                for i, data in enumerate(train_loader):
                    
                    images, text, hate, sent, targets = data[0].to(device), data[1], data[2].to(device), data[3].to(device), data[4].float().to(device)
                    
                    optimizer1.zero_grad()
                    optimizer2.zero_grad()
                    
                    
                    ids = []
                    mask = []

                    for j in text:   
                        inputs = tokenizer.encode_plus(
                                j, 
                                None,
                                add_special_tokens = True,    
                                max_length= 512,
                                padding = "max_length",
                                pad_to_max_length = True,
                                return_token_type_ids= False)

                        ids.append(inputs['input_ids'])
                        mask.append(inputs['attention_mask'])

                    ids = torch.from_numpy(np.array(ids)).to(device)
                    mask = torch.from_numpy(np.array(mask)).to(device)

                    distil_output = network1(ids, mask, hate, sent)

                    output = network2(images, distil_output)

                    
                    loss = criterion(output, targets) 
                    train_running_loss += loss.item()
                                        
                    ### Append output
                    train_output = np.vstack((train_output, ((output > 0.5).cpu().numpy().astype('int'))))
                    train_all_targets = np.vstack((train_all_targets, targets.cpu().numpy().astype('int')))

                    loss.backward()
                    optimizer1.step()
                    optimizer2.step()
                    
                    
                ### Calculate metrics and append
                train_loss.append(train_running_loss/len(train_loader.dataset))
                train_acc.append(accuracy_score(train_all_targets, train_output))
                train_f1_mic.append(f1_score(train_all_targets, train_output, average = 'micro'))
                train_f1_mac.append(f1_score(train_all_targets, train_output, average = 'macro'))
                train_prec_mic.append(precision_score(train_all_targets, train_output, average = 'micro'))
                train_prec_mac.append(precision_score(train_all_targets, train_output, average = 'macro'))
                train_rec_mic.append(recall_score(train_all_targets, train_output, average = 'micro'))
                train_rec_mac.append(recall_score(train_all_targets, train_output, average = 'macro'))
                
                
                ### Validate###
                network1.eval()
                network2.eval()
                
                val_running_loss = 0.0

                                
                for i, data in enumerate(val_loader):
                    images, text, hate, sent, targets = data[0].to(device), data[1], data[2].to(device), data[3].to(device), data[4].float().to(device)
                    
                    ids = []
                    mask = []
                    
                    for j in text:   
                        inputs = tokenizer.encode_plus(
                                j, 
                                None,
                                add_special_tokens = True,    
                                max_length= 512,
                                padding = "max_length",
                                pad_to_max_length = True,
                                return_token_type_ids= False)

                        ids.append(inputs['input_ids'])
                        mask.append(inputs['attention_mask'])

                    ids = torch.from_numpy(np.array(ids)).to(device)
                    mask = torch.from_numpy(np.array(mask)).to(device)

                    distil_output = network1(ids, mask, hate, sent)

                    output = network2(images, distil_output)
                    
                    loss = criterion(output, targets)
                    val_running_loss += loss.item()
                    
                    ### Append output
                    val_output = np.vstack((val_output, ((output > 0.5).cpu().numpy().astype('int'))))
                    val_all_targets = np.vstack((val_all_targets, targets.cpu().numpy().astype('int')))

                
                ### Calculate metrics and append
                val_loss.append(val_running_loss/len(val_loader.dataset))
                val_acc.append(accuracy_score(val_all_targets, val_output))
                val_f1_mic.append(f1_score(val_all_targets, val_output, average = 'micro'))
                val_f1_mac.append(f1_score(val_all_targets, val_output, average = 'macro'))
                val_prec_mic.append(precision_score(val_all_targets, val_output, average = 'micro'))
                val_prec_mac.append(precision_score(val_all_targets, val_output, average = 'macro'))
                val_rec_mic.append(recall_score(val_all_targets, val_output, average = 'micro'))
                val_rec_mac.append(recall_score(val_all_targets, val_output, average = 'macro'))
                
                
                ### Save model with the lowest validation loss
                if val_f1_mic[epoch] > val_f1_mic_max:
                    print('Validation F1 Micro Score Increased ({:.6f} --> {:.6f}).  Saving model ...\n'.format(
                    val_f1_mic_max,
                    val_f1_mic[epoch]))
                    torch.save(network1.state_dict(), 'BestCnnDBModels/DB_' + model_name[:-3] + '.pt')
                    torch.save(network2.state_dict(), 'BestCnnDBModels/' + model_name +'.pt')
                    val_f1_mic_max = val_f1_mic[epoch]
                    
                    ### Set current best metrics
                    best_train_loss = train_loss[epoch]
                    best_train_acc = train_acc[epoch]
                    best_train_f1_mic = train_f1_mic[epoch]
                    best_train_f1_mac = train_f1_mac[epoch]
                    best_train_prec_mic = train_prec_mic[epoch]
                    best_train_prec_mac = train_prec_mac[epoch]
                    best_train_rec_mic = train_rec_mic[epoch]
                    best_train_rec_mac = train_rec_mac[epoch]

                    best_val_acc = val_acc[epoch]
                    best_val_loss = val_loss[epoch]
#                     best_val_f1_mic = val_f1_mic[epoch]
                    best_val_f1_mac = val_f1_mac[epoch]
                    best_val_prec_mic = val_prec_mic[epoch]
                    best_val_prec_mac = val_prec_mac[epoch]
                    best_val_rec_mic = val_rec_mic[epoch]
                    best_val_rec_mac = val_rec_mac[epoch]

                    best_fold = fold + 1
                    best_epoch = epoch + 1
                    
                    
                    
                ### Display summary for epoch
                print('Epoch {} \tLearning Rate: {} \tTime (min): {}'.format(epoch+1, get_lr(optimizer2), round((time.time()-start)/60, 2)))
                print('Train Loss: {} \tValidation Loss: {}'. format(round(train_loss[epoch], 4),
                                                                     round(val_loss[epoch], 4)))
                print('Train Accuracy: {} \tValidation Accuracy: {}'.format(round(train_acc[epoch], 4),
                                                                            round(val_acc[epoch], 4)))
                print('Train F1 Mirco: {} \tValidation F1 Micro: {}'.format(round(train_f1_mic[epoch], 4),
                                                                            round(val_f1_mic[epoch], 4)))
                print('Train F1 Marco: {} \tValidation F1 Macro: {}'.format(round(train_f1_mac[epoch], 4),
                                                                            round(val_f1_mac[epoch], 4)))
                print('Train Precision Mirco: {} \tValidation Precision Micro: {}'.format(round(train_prec_mic[epoch], 4),
                                                                                          round(val_prec_mic[epoch], 4)))
                print('Train Precision Marco: {} \tValidation Precision Macro: {}'.format(round(train_prec_mac[epoch], 4),
                                                                                          round(val_prec_mac[epoch], 4)))
                print('Train Recall Mirco: {} \tValidation Recall Micro: {}'.format(round(train_rec_mic[epoch], 4),
                                                                                    round(val_rec_mic[epoch], 4)))
                print('Train Recall Marco: {} \tValidation Recall Macro: {}\n'.format(round(train_rec_mac[epoch], 4),
                                                                                    round(val_rec_mac[epoch], 4)))
                
                
                ### Update learning rate if needed
                scheduler.step(val_loss[epoch])
                
                
                
            ### Display summary graph of fold
            fig, (ax1, ax3) = plt.subplots(1,2, figsize = (20,6))
            ax1.set_xlabel('Epoch')
            ax1.set_ylabel('Loss')
            ln1 = ax1.plot(np.arange(start = 1, stop = n_epochs + 1), train_loss, label = 'Train Loss')
            ln2 = ax1.plot(np.arange(start = 1, stop = n_epochs + 1), val_loss, label = 'Val Loss')
            
            ax2 = ax1.twinx()
            ax2.set_ylabel('Accuracy')
            ln3 = ax2.plot(np.arange(start = 1, stop = n_epochs + 1), train_acc, marker = 'o', label = 'Train Acc')
            ln4 = ax2.plot(np.arange(start = 1, stop = n_epochs + 1), val_acc, marker = 'o', label = 'Val Acc')
            
            lns1 = ln1 + ln2 + ln3 + ln4
            labs1 = [l.get_label() for l in lns1]
            
            ax3.set_xlabel('Epoch')
            ax3.set_ylabel('Score')
            ln5 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), train_f1_mic, marker = 'v', label = 'Train F1 Micro')
            ln6 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), val_f1_mic, marker = 'v', label = 'Val F1 Micro')
            ln7 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), train_f1_mac, marker = '^', label = 'Train F1 Macro')
            ln8 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), val_f1_mac, marker = '^', label = 'Val F1 Micro')
            ln9 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), train_prec_mic, marker = 'd', label = 'Train Prec. Micro')
            ln10 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), val_prec_mic, marker = 'd', label = 'Val Prec. Micro')
            ln11 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), train_prec_mac, marker = 'X', label = 'Train Prec. Macro')
            ln12 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), val_prec_mac, marker = 'X', label = 'Val Prec. Macro')
            ln13 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), train_rec_mic, marker = 'P', label = 'Train Rec. Micro')
            ln14 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), val_rec_mic, marker = 'P', label = 'Val Rec. Micro')
            ln15 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), train_rec_mac, marker = 's', label = 'Train Rec. Macro')
            ln16 = ax3.plot(np.arange(start = 1, stop = n_epochs + 1), val_rec_mac, marker = 's', label = 'Val Rec. Macro')
            
            lns2 = ln5 + ln6 + ln7 + ln8 +  ln9 + ln10 + ln11 + ln12 + ln13 + ln14 + ln15 + ln16
            labs2 = [l.get_label() for l in lns2]
            
            
            ax1.legend(lns1, labs1, loc = 'upper left', bbox_to_anchor = (1.1,1))
            ax3.legend(lns2, labs2, loc = 'upper left', bbox_to_anchor = (1.05,1))
            fig.tight_layout()
            
            plt.show()
            
            fig.savefig('MultiModal_IndLabels_TrainingSummary/' + model_name + '_FOLD' + str(fold+1) + '.png')
        
        
        
        ### Display metrics of the best model
                
        print('------------------------------------------------------------')
        print('------------------------------------------------------------')
        
        print('\nMetrics of Best Model:')
        print('Fold: {} \tEpoch: {}'.format(best_fold, best_epoch))
        print('Train Loss: {} \tValidation Loss: {}'. format(round(best_train_loss, 4),
                                                             round(best_val_loss, 4)))
        print('Train Accuracy: {} \tValidation Accuracy: {}'.format(round(best_train_acc, 4),
                                                                    round(best_val_acc, 4)))
        print('Train F1 Mirco: {} \tValidation F1 Micro: {}'.format(round(best_train_f1_mic, 4),
                                                                    round(val_f1_mic_max, 4)))
        print('Train F1 Marco: {} \tValidation F1 Macro: {}'.format(round(best_train_f1_mac, 4),
                                                                    round(best_val_f1_mac, 4)))
        print('Train Precision Mirco: {} \tValidation Precision Micro: {}'.format(round(best_train_prec_mic, 4),
                                                                                  round(best_val_prec_mic, 4)))
        print('Train Precision Marco: {} \tValidation Precision Macro: {}'.format(round(best_train_prec_mac, 4),
                                                                                  round(best_val_prec_mac, 4)))
        print('Train Recall Mirco: {} \tValidation Recall Micro: {}'.format(round(best_train_rec_mic, 4),
                                                                            round(best_val_rec_mic, 4)))
        print('Train Recall Marco: {} \tValidation Recall Macro: {}'.format(round(best_train_rec_mac, 4),
                                                                            round(best_val_rec_mac, 4)))
        
        
        ### Return best metrics
        return [best_fold, best_epoch, best_train_loss, best_train_acc, best_train_f1_mic, best_train_f1_mac, best_train_prec_mic, best_train_prec_mac, best_train_rec_mic, best_train_rec_mac,
                best_val_loss, best_val_acc, val_f1_mic_max, best_val_f1_mac, best_val_prec_mic, best_val_prec_mac, best_val_rec_mic, best_val_rec_mac]
  

# 
# 
# 

## Initialize Models

In [None]:
DB_resnet18 = DistilBertClass()
DB_resnet50 = DistilBertClass()
DB_resnet101 = DistilBertClass()

DB_densenet121 = DistilBertClass()
DB_densenet169 = DistilBertClass()
DB_densenet201 = DistilBertClass()

DB_vgg11bn = DistilBertClass()
DB_vgg16bn = DistilBertClass()
DB_vgg19bn = DistilBertClass()

resnet18 = ResNet18_DB()
resnet50 = ResNet50_DB()
resnet101 = ResNet101_DB()

densenet121 = DenseNet121_DB()
densenet169 = DenseNet169_DB()
densenet201 = DenseNet201_DB()

vgg11bn = VGG11_BN_DB()
vgg16bn = VGG16_BN_DB()
vgg19bn = VGG19_BN_DB()

# 
# 
# 

## Train Models

In [None]:
START = time.time()

In [None]:
resnet18_DB_best = Kfold_train_CNN_DB(CNN = resnet18, DistilBert = DB_resnet18,
                                       training_data = training_data, learning_rate = 0.5, 
                                       k_folds = 10, n_epochs = 30, model_name = 'resnet18_DB')

In [None]:
resnet50_DB_best = Kfold_train_CNN_DB(CNN = resnet50, DistilBert = DB_resnet50, 
                                      training_data = training_data, learning_rate = 0.01,
                                      k_folds = 10, n_epochs = 30, model_name = 'resnet50_DB')

In [None]:
resnet101_DB_best = Kfold_train_CNN_DB(CNN = resnet101, DistilBert = DB_resnet101, 
                                       training_data = training_data, learning_rate = 0.01,
                                       k_folds = 10, n_epochs = 30, model_name = 'resnet101_DB')

In [None]:
densenet121_DB_best = Kfold_train_CNN_DB(CNN = densenet121, DistilBert = DB_densenet121, 
                                         training_data = training_data, learning_rate = 0.01,
                                         k_folds = 10, n_epochs = 30, model_name = 'densenet121_DB')

In [None]:
densenet169_DB_best = Kfold_train_CNN_DB(CNN = densenet169, DistilBert = DB_densenet169, 
                                         training_data = training_data, learning_rate = 0.01, 
                                         k_folds = 10, n_epochs = 30, model_name = 'densenet169_DB')

In [None]:
densenet201_DB_best = Kfold_train_CNN_DB(CNN = densenet201, DistilBert = DB_densenet201, 
                                         training_data = training_data, learning_rate = 0.01, 
                                         k_folds = 10, n_epochs = 30, model_name = 'densenet201_DB')

In [None]:
vgg11bn_DB_best = Kfold_train_CNN_DB(CNN = vgg11bn, DistilBert = DB_vgg11bn, 
                                     training_data = training_data, learning_rate = 0.01, 
                                     k_folds = 10, n_epochs = 30, model_name = 'vgg11bn_DB')

In [None]:
vgg16bn_DB_best = Kfold_train_CNN_DB(CNN = vgg16bn, DistilBert = DB_vgg16bn, 
                                     training_data = training_data, learning_rate = 0.01, 
                                     k_folds = 10, n_epochs = 30, model_name = 'vgg16bn_DB')

In [None]:
vgg19bn_DB_best = Kfold_train_CNN_DB(CNN = vgg19bn, DistilBert = DB_vgg19bn, 
                                     training_data = training_data, learning_rate = 0.01, 
                                     k_folds = 10, n_epochs = 30, model_name = 'vgg19bn_DB')

In [None]:
print('Total Training Time (HR:M:S): ' + str(datetime.timedelta(hours = ((time.time() - START) / 60 / 60))).rsplit('.', 1)[0])

# 
# 
# 

## Training Summary

### Summary Table

In [None]:
summaryDF = pd.DataFrame({
    'ResNet18' : resnet18_DB_best,
    'ResNet50' : resnet50_DB_best,
    'ResNet101' : resnet101_DB_best,
    'DenseNet121' : densenet121_DB_best,
    'DenseNet169' : densenet169_DB_best,
    'DenseNet201' : densenet201_DB_best,
    'VGG11_BN' : vgg11bn_DB_best,
    'VGG16_BN' : vgg16bn_DB_best,
    'VGG19_BN' : vgg19bn_DB_best
})
summaryDF.index = ['Fold', 'Epoch', 'Train Loss', 'Train Accuracy', 'Train F1 Micro', 'Train F1 Macro', 'Train Precision Micro', 'Train Precision Macro', 'Train Recall Micro', 'Train Recall Macro',
                   'Val Loss', 'Val Accuracy', 'Val F1 Micro', 'Val F1 Macro', 'Val Precision Micro', 'Val Precision Macro', 'Val Recall Micro', 'Val Recall Macro']

summaryDF

### Training Summary Graph

In [None]:
import plotly
import plotly.graph_objects as go

In [None]:
fig = go.Figure(data = [
    go.Bar(name = 'ResNet18', x = summaryDF.index[3:10], y = summaryDF['ResNet18'][3:10]),
    go.Bar(name = 'ResNet50', x = summaryDF.index[3:10], y = summaryDF['ResNet50'][3:10]),
    go.Bar(name = 'ResNet101', x = summaryDF.index[3:10], y = summaryDF['ResNet101'][3:10]),
    go.Bar(name = 'DenseNet121', x = summaryDF.index[3:10], y = summaryDF['DenseNet121'][3:10]),
    go.Bar(name = 'DenseNet169', x = summaryDF.index[3:10], y = summaryDF['DenseNet169'][3:10]),
    go.Bar(name = 'DenseNet201', x = summaryDF.index[3:10], y = summaryDF['DenseNet201'][3:10]),
    go.Bar(name = 'VGG11_BN', x = summaryDF.index[3:10], y = summaryDF['VGG11_BN'][3:10]),
    go.Bar(name = 'VGG16_BN', x = summaryDF.index[3:10], y = summaryDF['VGG16_BN'][3:10]),
    go.Bar(name = 'VGG19_BN', x = summaryDF.index[3:10], y = summaryDF['VGG19_BN'][3:10]),
])
fig.update_layout(barmode = 'group',
                  title = 'Best Model Metrics Across Folds (Training) - With DistilBert',
                  xaxis_title = 'Metrics',
                  yaxis_title = 'Score',
                  legend_title = 'Models')
fig.show()

In [None]:
fig = go.Figure(data = [
    go.Bar(name = 'ResNet18', x = summaryDF.index[11:], y = summaryDF['ResNet18'][11:]),
    go.Bar(name = 'ResNet50', x = summaryDF.index[11:], y = summaryDF['ResNet50'][11:]),
    go.Bar(name = 'ResNet101', x = summaryDF.index[11:], y = summaryDF['ResNet101'][11:]),
    go.Bar(name = 'DenseNet121', x = summaryDF.index[11:], y = summaryDF['DenseNet121'][11:]),
    go.Bar(name = 'DenseNet169', x = summaryDF.index[11:], y = summaryDF['DenseNet169'][11:]),
    go.Bar(name = 'DenseNet201', x = summaryDF.index[11:], y = summaryDF['DenseNet201'][11:]),
    go.Bar(name = 'VGG11_BN', x = summaryDF.index[11:], y = summaryDF['VGG11_BN'][11:]),
    go.Bar(name = 'VGG16_BN', x = summaryDF.index[11:], y = summaryDF['VGG16_BN'][11:]),
    go.Bar(name = 'VGG19_BN', x = summaryDF.index[11:], y = summaryDF['VGG19_BN'][11:]),
])
fig.update_layout(barmode = 'group',
                  title = 'Best Model Metrics Across Folds (Validation) - With DistilBert',
                  xaxis_title = 'Metrics',
                  yaxis_title = 'Score',
                  legend_title = 'Models')
fig.show()