# Multimodal system for sentiment analysis based on texts and images (MVSA-SINGLE DATASET)

## Mount Drive, Libraries

Mount Google Drive for loading and storing files

In [None]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


Install/Import libraries

In [None]:
!pip install transformers
!pip install comet_ml
!pip install contractions
!pip install sentencepiece

import sys
sys.path.append('/content/drive/My Drive/sentiment-analysis/functions')
import sentiment_analysis_functions, text_functions, image_functions, multimodal_functions
import random
import torch
import numpy as np
import matplotlib.pyplot as plt
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model
from sklearn.metrics import f1_score, accuracy_score
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import (CenterCrop,
                                    Compose,
                                    Normalize,
                                    RandomHorizontalFlip,
                                    RandomResizedCrop,
                                    Resize,
                                    ToTensor,
                                    ToPILImage,
                                    Lambda, RandomRotation,ColorJitter,RandomAffine)
from PIL import Image
import torch

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
Col

Apply random seed and use CUDA

In [None]:
# One seed shared by every random number generator the notebook relies on.
seed_val = 20

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed Python, NumPy and PyTorch (CPU and every visible GPU) with the same value.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Dedicated generator handed to the DataLoaders further down the notebook.
generator = torch.Generator().manual_seed(seed_val)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Load dataset and/or features, select models and hyperparameters, connect with Comet ML Framework, map labels

Load dataset from Google Drive (Necessary)

In [None]:
DATA_PATH = './drive/My Drive/sentiment-analysis/datasets/mvsa-single-4511_multimodal.hdf5'
texts,images,labels,text_labels,image_labels = sentiment_analysis_functions.load_mvsa_data(DATA_PATH, 1)

Load stored features for text and image from Google Drive (Optional)

In [None]:
text_logits = np.load('/content/drive/MyDrive/sentiment-analysis/new logits/text_logits.npy', allow_pickle = True)
image_logits = np.load('/content/drive/MyDrive/sentiment-analysis/new logits/image_logits.npy',allow_pickle = True)

Load translated greek texts instead of english texts (Optional)

In [None]:
texts = np.load(f'/content/drive/MyDrive/sentiment-analysis/translations/greek/translated.npy', allow_pickle=True)

Select models, hyperparameters.  
Connect with Comet ML.
Log the hyperparameters to Comet ML.

In [None]:
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model

# Replace the following values with your own Comet ML credentials.
experiment = Experiment(
  api_key = "XXXXXXXXXXXXXXXXXXXXX",
  project_name = "PROJECT NAME",
  workspace="USERNAME",
  log_code = True,
  auto_param_logging = True,
  auto_metric_logging = True,
  auto_output_logging = True,
)


### TEXT GLOBAL SETTINGS ###

samples = 'multimodal only'
# TEXT_MODEL = 'bert-base-cased'
# TEXT_MODEL = 'bert-base-uncased'
# TEXT_MODEL = 'nlpaueb/bert-base-greek-uncased-v1'
TEXT_MODEL = 'roberta-base'
# TEXT_MODEL='xlm-roberta-base'
# TEXT_MODEL = 'roberta-large'
# TEXT_MODEL = 'albert-base-v2'
# TEXT_MODEL = 'microsoft/deberta-base'
text_batch_size = 16
text_epochs = 4
text_lr = 2e-5
text_dropout = 0.5
text_scheduler_name = 'warmup'

text_hyperparameters = {
    'samples':samples,
    'Text batch size': text_batch_size,
    'Text epochs': text_epochs,
    'Text learning rate': text_lr,
    'Text Model name': TEXT_MODEL,
    'Text dropout': text_dropout,
    'Text scheduler' : text_scheduler_name
}
experiment.log_parameters(text_hyperparameters)

# IMAGE_FREEZE: Freeze the weights of the selected image model during training
# IMAGE_ROTATIONS: Apply image augmentations during image preprocessing
# image_HIDDEN_LAYER: Use hidden layer on the classifier of the image model

### IMAGE GLOBAL SETTINGS ###
IMAGE_MODEL = 'google/vit-base-patch16-224-in21k'
# IMAGE_MODEL = 'facebook/dino-vitb8'
# IMAGE_MODEL = 'facebook/dino-vitb16'
# IMAGE_MODEL = 'microsoft/beit-base-patch16-224-pt22k-ft22k'
# IMAGE_MODEL = 'EFFICIENTNET-B1'
image_batch_size = 16
image_epochs = 4
image_lr = 2e-5
image_scheduler_name = 'warmup'
IMAGE_FREEZE = False
IMAGE_ROTATIONS = False
image_HIDDEN_LAYER = True
image_dropout = 0.2

image_hyperparameters = {
    'image batch size': image_batch_size,
    'image epochs': image_epochs,
    'image learning rate': image_lr,
    'image scheduler': image_scheduler_name,
    'image Model name': IMAGE_MODEL,
    'image freeze': IMAGE_FREEZE,
    'image rotations': IMAGE_ROTATIONS,
    'image dropout': image_dropout
}

experiment.log_parameters(image_hyperparameters)

# MULTIMODAL_GRID_SEARCH: Use grid search on the specified parameters for multimodal model
# MULTIMODAL_VADER: Import features calculated from Vader (generally disabled)
# MULTIMODAL_MODEL: Choose between 5 layers, 4 layers, 3 layers and attention

### MULTIMODAL GLOBAL SETTINGS ###
MULTIMODAL_GRID_SEARCH = False
MULTIMODAL_VADER = False
MULTIMODAL_MODEL = '3 layers'
PATH = './drive/My Drive/sentiment-analysis/'

# Both configurations train for the same number of epochs.
multimodal_epochs = 25

if not MULTIMODAL_GRID_SEARCH:
  # Single run: every list holds exactly one candidate value.
  multimodal_scheduler_names = ['exponential']
  multimodal_batch_sizes = [16]
  multimodal_hidden_sizes = [800]
  multimodal_last_hidden_sizes = [300]
  multimodal_learning_rates = [3e-5]
else:
  # Grid search: every list enumerates the candidate values to try.
  # NOTE(review): this branch defines multimodal_num_heads_list but not
  # multimodal_last_hidden_sizes, while the single-run branch does the opposite;
  # confirm the code that consumes these lists copes with the missing name.
  multimodal_scheduler_names = ['warmup', 'exponential']
  multimodal_batch_sizes = [4]
  multimodal_hidden_sizes = [400, 600, 800]
  # multimodal_last_hidden_sizes = [100,200,300]
  multimodal_num_heads_list = [4, 8]
  multimodal_learning_rates = [5e-6, 1e-5, 3e-5]

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/kostasgrg/dump/cc1bf4a7e202490db97a8af4fdbcd701



Map labels from sentiment to integer values:  
Negative --> 0  
Neutral --> 1  
Positive --> 2

In [None]:
# Encode the text-only, image-only and multimodal label arrays in turn.
# Each call rebinds labels_dict, so only the mapping returned by the last call is kept.
# NOTE(review): the cell rebinds its own inputs; re-running it feeds already-mapped
# labels back into map_labels — confirm that helper tolerates this.
text_labels,labels_dict = sentiment_analysis_functions.map_labels(text_labels)
image_labels,labels_dict = sentiment_analysis_functions.map_labels(image_labels)
labels,labels_dict = sentiment_analysis_functions.map_labels(labels)

## Data splitting methods

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split

# (Optional) Create the splits of the 5-fold cross validation.
# The train/test indices of every fold are saved to .npy files that can be
# loaded later and passed to the split_data_for_cv function.
def create_folds(texts,labels):
  """Save the train/test indices of a stratified 5-fold split to disk.

  For every fold k, starting at 0, the indices yielded by StratifiedKFold are
  written to the current working directory as ``train_index_k.npy`` and
  ``test_index_k.npy``. Nothing is returned.
  """
  splitter = StratifiedKFold(n_splits=5)
  for fold_id, (fold_train, fold_test) in enumerate(splitter.split(texts, labels)):
    np.save(f'train_index_{fold_id}.npy',fold_train)
    np.save(f'test_index_{fold_id}.npy',fold_test)

# Use this function when you select Cross Validation for the experiment.
def split_data_for_cv(data,labels,train_index,test_index,seed_val):
  """Build the train/test/validation sets of one cross-validation fold.

  Args:
    data: Indexable collection of samples (``data[i]`` must work).
    labels: Label array that supports integer-array indexing
      (``labels[train_index]``).
    train_index: Positions of the training samples of this fold.
    test_index: Positions of the held-out samples of this fold; they are split
      50/50 (stratified) into test and validation sets.
    seed_val: Random state passed to ``train_test_split``.

  Returns:
    ``(train_texts, test_texts, valid_texts, train_labels, test_labels,
    valid_labels)``; the three label entries are torch tensors.
  """
  # Pick samples by position, in the same order as the index arrays, so that
  # every sample stays aligned with labels[index]. The previous membership test
  # (`i in train_index`) scanned the whole index array for every sample and
  # returned samples in ascending order even when the indices were not sorted.
  train_texts = [data[i] for i in train_index]
  train_labels = labels[train_index]

  X_rem = [data[i] for i in test_index]
  y_rem = labels[test_index]

  # Halve the held-out fold into test and validation sets, keeping class balance.
  test_texts, valid_texts, test_labels, valid_labels = train_test_split(X_rem,
                                                  y_rem,
                                                  test_size=0.5,
                                                  random_state=seed_val,
                                                  stratify=y_rem)

  train_labels = torch.tensor(train_labels)
  test_labels = torch.tensor(test_labels)
  valid_labels = torch.tensor(valid_labels)

  print(np.shape(train_texts))

  return train_texts,test_texts,valid_texts,train_labels,test_labels,valid_labels


# Use this function when you select a single random seed value for the experiment.
def split_data(data,labels,text_labels,seed_val):
  """Split ``data`` into 80% train, 10% test and 10% validation sets.

  Both splits are stratified on ``labels``, but the labels that are returned
  are looked up in ``text_labels`` through the original sample positions, so
  the stratification target and the training target may differ.

  Returns:
    ``(train_texts, test_texts, valid_texts, train_labels, test_labels,
    valid_labels)``; the three label entries are torch tensors.
  """
  positions = np.arange(len(data))

  # First cut: 80% train, 20% held out. The label output of the split is only
  # needed for the held-out part, where it drives the second stratification.
  train_texts, held_out, _, held_out_strata, train_indices, held_out_indices = train_test_split(
      data,
      labels,
      positions,
      test_size=0.2,
      random_state=seed_val,
      stratify=labels)

  # Second cut: halve the held-out part into test and validation sets.
  test_texts, valid_texts, _, _, test_indices, valid_indices = train_test_split(
      held_out,
      held_out_strata,
      held_out_indices,
      test_size=0.5,
      random_state=seed_val,
      stratify=held_out_strata)

  # Fetch the labels that are actually returned from text_labels by position.
  train_labels, valid_labels, test_labels = (
      torch.tensor(text_labels[picked])
      for picked in (train_indices, valid_indices, test_indices)
  )

  return train_texts,test_texts,valid_texts,train_labels,test_labels,valid_labels

## Text

### Text preprocessing, creating text dataloaders, initializing the text model, training and evaluating it, extracting text features

In [None]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

# Text Preprocessing
# NOTE(review): `texts` is overwritten with its preprocessed version, so re-running
# this cell preprocesses the already-preprocessed texts again.
texts = [text_functions.text_preprocessing(text) for text in texts]
tokenizer = text_functions.choose_tokenizer(TEXT_MODEL)
# The split is stratified on the multimodal `labels`; the returned label tensors are taken from `text_labels`.
train_texts,test_texts,valid_texts,train_labels,test_labels,valid_labels = split_data(texts,labels,text_labels,seed_val)
# train_texts,test_texts,valid_texts,train_labels,test_labels,valid_labels = split_data_for_cv(texts,text_labels,train_index, test_index, seed_val) ### UNCOMMENT THIS LINE IF YOU CHOOSE CROSS VALIDATION ###

### UNCOMMENT THE FOLLOWING LINES IF YOU WANT TO COMBINE ENGLISH AND GREEK TEXTS FOR THE EXPERIMENT OF FOREIGN LANGUAGES ###
# greek_texts = [text_functions.text_preprocessing(text) for text in greek_texts]
# greek_train_texts,greek_test_texts,greek_valid_texts,greek_train_labels,greek_test_labels,greek_valid_labels = split_data(greek_texts,labels,text_labels,seed_val)
# train_texts = np.concatenate((train_texts,greek_train_texts),axis=0)
# test_texts = np.concatenate((test_texts,greek_test_texts),axis=0)
# valid_texts = np.concatenate((valid_texts,greek_valid_texts),axis=0)
# train_labels = torch.cat((train_labels,greek_train_labels),axis=0)
# test_labels = torch.cat((test_labels,greek_test_labels),axis=0)
# valid_labels = torch.cat((valid_labels,greek_valid_labels),axis=0)

# Create Dataloaders
# Only the training loader is shuffled; validation and test keep their order.
dataloader_train = sentiment_analysis_functions.create_dataloaders(train_texts,train_labels,text_batch_size,generator,SHUFFLE=True,type='text',tokenizer=tokenizer)
dataloader_test = sentiment_analysis_functions.create_dataloaders(test_texts,test_labels,text_batch_size,generator,SHUFFLE=False,type='text',tokenizer=tokenizer)
dataloader_valid = sentiment_analysis_functions.create_dataloaders(valid_texts,valid_labels,text_batch_size,generator,SHUFFLE=False,type='text',tokenizer=tokenizer)

# Initializing Model, Training the model and making predictions
textModel, loss_fn, optimizer, scheduler = sentiment_analysis_functions.initialize_model(device,text_lr,dataloader_train,text_scheduler_name, TEXT_MODEL, text_dropout, HIDDEN_LAYER=None,type='text', epochs=text_epochs)
# The two values returned by the library train function are not used afterwards.
temp1, temp2 = sentiment_analysis_functions.train(textModel, dataloader_train, optimizer, scheduler, loss_fn, experiment, device, 'text', dataloader_valid, dataloader_test, epochs=text_epochs, evaluation=True)
probs_test, test_dataloader_labels = sentiment_analysis_functions.predict(textModel, dataloader_test, device,'text')

# Printing the final predictions of the model on the test set
print('-----------Test set-----------')
# From here on probs_test holds predicted class indices rather than per-class scores.
probs_test = np.argmax(probs_test,1)  # Get the class with the highest probability
f1_test = f1_score(y_pred = probs_test, y_true = test_dataloader_labels, average = 'weighted')
acc_test = accuracy_score(y_pred=probs_test,y_true= test_dataloader_labels)
experiment.log_metric('Text f1 (test)',f1_test)
experiment.log_metric('Text accuracy (test)',acc_test)
print(f1_test)
print(acc_test)

# Extracting text features
# The features are computed for the whole dataset (train, validation and test together), in the original order.
# NOTE(review): text_labels is rebound to a tensor here; cells that run later receive the tensor, not the original array.
text_labels = torch.tensor(text_labels)
dataloader = sentiment_analysis_functions.create_dataloaders(texts,text_labels,text_batch_size,generator,SHUFFLE=False,type='text',tokenizer=tokenizer)
# Only the `bert` sub-module of the fine-tuned model is passed on, not its classifier.
text_logits = sentiment_analysis_functions.calculate_logits(textModel.bert, dataloader, device, 'text')
np.save('text_logits.npy',text_logits)

Start training...



  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 0:   0%|          | 0/451 [00:00<?, ?it/s]


Epoch 0
Training loss: 0.895261740406971


  0%|          | 0/57 [00:00<?, ?it/s]

Validation loss: 0.7633033745121538
Validation F1 Score (weighted): 0.6974787718012471
Validation Accuracy Score: 0.6924778761061947


  0%|          | 0/57 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.649426327211002
Test Accuracy Score: 0.6441241685144125




Epoch 1:   0%|          | 0/451 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.6885992657318877


  0%|          | 0/57 [00:00<?, ?it/s]

Validation loss: 0.721421786567621
Validation F1 Score (weighted): 0.7123517861996392
Validation Accuracy Score: 0.7046460176991151


  0%|          | 0/57 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7002160017993685
Test Accuracy Score: 0.697339246119734




Epoch 2:   0%|          | 0/451 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.5617136371439683


  0%|          | 0/57 [00:00<?, ?it/s]

Validation loss: 0.7469325102212137
Validation F1 Score (weighted): 0.7284656384526885
Validation Accuracy Score: 0.7278761061946902


  0%|          | 0/57 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7185237515445304
Test Accuracy Score: 0.7184035476718403




Epoch 3:   0%|          | 0/451 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.44061752327794246


  0%|          | 0/57 [00:00<?, ?it/s]

Validation loss: 0.8899183533432191
Validation F1 Score (weighted): 0.7421590783368357
Validation Accuracy Score: 0.7411504424778761


  0%|          | 0/57 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7094924979581213
Test Accuracy Score: 0.7084257206208425


Training complete!
-----------Test set-----------
0.7094924979581213
0.7084257206208425


  0%|          | 0/282 [00:00<?, ?it/s]

Save fine-tuned text model (Optional)

In [None]:
textModel.bert.save_pretrained('./textModel')
torch.save(textModel.classifier.state_dict(), "textClassifier.pt")
!zip -r ./textModel.zip ./textModel

  adding: textModel/ (stored 0%)
  adding: textModel/model.safetensors (deflated 13%)
  adding: textModel/config.json (deflated 50%)


Upload fine-tuned text model to Hugging Face (Optional)

In [None]:
from huggingface_hub import notebook_login
notebook_login()
textModel.bert.push_to_hub('kostasGRG/roberta-model-mvsa-multiple')

### Applying another language

In [None]:
# Choose the language: greek, german, spanish, russian, hindi
# The following code will load the non-english texts,
# use the fine-tuned model and calculate the features on the selected language.

LANGUAGE = 'hindi'
# Alternative source: reload the original texts from the dataset file.
# foreign_texts,_,_,_,_ = sentiment_analysis_functions.load_mvsa_data(DATA_PATH, 1)
# Load the translated texts inside this cell, so it no longer depends on a
# `foreign_texts` variable left over from an earlier kernel session.
# NOTE(review): allow_pickle=True can execute code stored in the file; only load files you created yourself.
foreign_texts = np.load(f'/content/drive/MyDrive/sentiment-analysis/translations/{LANGUAGE}/translated.npy', allow_pickle=True)

foreign_texts = [text_functions.text_preprocessing(text) for text in foreign_texts]
# text_labels is already a tensor when the text-feature cell ran before this one;
# wrapping a tensor in torch.tensor() again only copies it and emits a UserWarning.
if not torch.is_tensor(text_labels):
  text_labels = torch.tensor(text_labels)
dataloader = sentiment_analysis_functions.create_dataloaders(foreign_texts,text_labels,text_batch_size,generator,SHUFFLE=False,type='text',tokenizer=tokenizer)
# NOTE(review): this overwrites the text_logits variable, so the fusion step will
# use the features of the selected language — confirm this is intended.
text_logits = sentiment_analysis_functions.calculate_logits(textModel.bert, dataloader, device, 'text')
np.save(f'{LANGUAGE}_text_logits.npy',text_logits)

  text_labels = torch.tensor(text_labels)


  0%|          | 0/282 [00:00<?, ?it/s]

## Image

### Functions

In [None]:
# Train function, similar to the train function that we have defined on the sentiment_analysis_functions.py library
def train(model, train_dataloader, optimizer, scheduler, loss_fn, experiment=None, device='cpu', dtype='text', val_dataloader=None, test_dataloader=None, epochs=10, evaluation=False, FREEZE=False,CONFIDENT_ACC=False,PRINT=True):
    """Train ``model`` and optionally evaluate it after every epoch.

    Args:
        model: Called as ``model(input_ids, attention_mask)`` when ``dtype`` is
            ``'text'`` and as ``model(images)`` when it is ``'image'``.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches
            for text and ``(images, labels)`` batches for images.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per epoch. A
            ``ReduceLROnPlateau`` scheduler receives the validation loss, or the
            training loss when ``evaluation`` is disabled.
        loss_fn: Called as ``loss_fn(logits, labels)``.
        experiment: Optional Comet experiment used to log metrics.
        device: Device the batches are moved to.
        dtype: ``'text'`` or ``'image'``.
        val_dataloader: Validation data; required when ``evaluation`` is true.
        test_dataloader: Test data; required when ``evaluation`` is true.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: Evaluate on the validation and test sets after every epoch.
        FREEZE: For image models, disable gradients of every parameter whose
            name contains neither ``"classifier"`` nor ``"pooler"``.
        CONFIDENT_ACC: Additionally collect, per epoch, the validation accuracy
            at confidence thresholds 0.6, 0.7, 0.8 and 0.9.
        PRINT: Print per-epoch progress.

    Returns:
        ``(predictions, test_labels, accuracy_scores, f1_scores)``, or
        ``(predictions, test_labels, conf_60, conf_70, conf_80, conf_90)`` when
        ``CONFIDENT_ACC`` is true. ``predictions`` and ``test_labels`` come from
        the test evaluation of the last epoch and are ``None`` when
        ``evaluation`` is disabled. ``accuracy_scores`` and ``f1_scores`` hold,
        for every epoch, the validation score followed by the test score.

    Raises:
        ValueError: If ``dtype`` is unknown, or ``evaluation`` is requested
            without both evaluation dataloaders.
    """
    # Fail before any training happens instead of deep inside the first epoch.
    if dtype not in ('text', 'image'):
        raise ValueError(f"dtype must be 'text' or 'image', got {dtype!r}")
    if evaluation and (val_dataloader is None or test_dataloader is None):
        raise ValueError("evaluation=True requires both val_dataloader and test_dataloader")

    model.train()

    # One list of per-epoch accuracies for every confidence threshold.
    confidence_thresholds = (0.6, 0.7, 0.8, 0.9)
    confident_accuracies = {threshold: [] for threshold in confidence_thresholds}

    accuracy_scores = []
    f1_scores = []
    # Stay None when evaluation is disabled, so the return statement below never
    # refers to a name that was not assigned.
    predictions = None
    test_labels = None

    if PRINT:
        print("Start training...\n")
    for epoch_i in tqdm(range(epochs)):
        if dtype == 'image' and FREEZE and (epoch_i == 0):
            # Freeze everything except the classifier and pooler parameters.
            for name, param in model.named_parameters():
                if ("classifier" not in name) and ("pooler" not in name):
                    param.requires_grad = False

        total_loss = 0
        # The evaluation helpers may leave the model in eval mode; switch back.
        model.train()

        for batch in tqdm(train_dataloader):
            if dtype == 'text':
                b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
            else:
                images, b_labels = batch
                images = images.to(device)
                b_labels = b_labels.to(device)

            model.zero_grad()
            optimizer.zero_grad()

            if dtype == 'text':
                logits = model(b_input_ids, b_attn_mask)
            else:
                logits = model(images)

            loss = loss_fn(logits, b_labels)
            total_loss += loss.item()
            loss.backward()
            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        avg_train_loss = total_loss / len(train_dataloader)

        if PRINT:
            tqdm.write(f'\nEpoch {epoch_i}')
            tqdm.write(f'Training loss: {avg_train_loss}')

        if evaluation:
            val_loss, predictions, valid_labels = sentiment_analysis_functions.evaluate(model, loss_fn, device, val_dataloader, dtype)
            val_f1 = f1_score(valid_labels, predictions, average = 'weighted')
            val_acc = accuracy_score(valid_labels, predictions)
            accuracy_scores.append(val_acc)
            f1_scores.append(val_f1)

            if PRINT:
                tqdm.write(f'Validation loss: {val_loss}')
                tqdm.write(f'F1 Score (weighted): {val_f1}')
                tqdm.write(f'Accuracy Score: {val_acc}')
            if experiment is not None:
                metrics = {
                    'train loss': avg_train_loss,
                    'validation loss': val_loss,
                    'f1 score': val_f1,
                    'accuracy': val_acc
                }
                experiment.log_metrics(metrics, epoch=epoch_i)

            # From here on `predictions` refers to the test set, which is what is returned.
            test_loss, predictions, test_labels = sentiment_analysis_functions.evaluate(model, loss_fn, device, test_dataloader, dtype)
            test_f1 = f1_score(test_labels, predictions, average = 'weighted')
            test_acc = accuracy_score(test_labels, predictions)
            accuracy_scores.append(test_acc)
            f1_scores.append(test_f1)

            if PRINT:
                tqdm.write(f'Test F1 Score (weighted): {test_f1}')
                tqdm.write(f'Test Accuracy Score: {test_acc}')
            if experiment is not None:
                metrics = {
                    'Test f1 score': test_f1,
                    'Test accuracy': test_acc
                }
                experiment.log_metrics(metrics, epoch=epoch_i)
            if CONFIDENT_ACC:
                probs, true_values = sentiment_analysis_functions.predict(model,val_dataloader,device,dtype)
                for threshold, scores in confident_accuracies.items():
                    scores.append(text_functions.calculate_confident_accuracy(predictions=probs,true_values=true_values,threshold=threshold)[0])

        else:
            if experiment is not None:
                experiment.log_metric('train loss',avg_train_loss, epoch=epoch_i)

        # NOTE(review): the scheduler advances once per epoch. A schedule sized as
        # len(dataloader) * epochs steps expects one step per batch — confirm
        # which granularity is intended.
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            # ReduceLROnPlateau needs the monitored metric; step() without it fails.
            scheduler.step(val_loss if evaluation else avg_train_loss)
        else:
            scheduler.step()
        if PRINT:
            print("\n")

    print("Training complete!")

    if CONFIDENT_ACC:
        return (predictions, test_labels, *(confident_accuracies[threshold] for threshold in confidence_thresholds))
    return predictions, test_labels, accuracy_scores, f1_scores



Use this function instead of `sentiment_analysis_functions.initialize_model` only for CNN image models.

In [None]:
from torchvision.models.inception import Inception_V3_Weights
from torchvision.models.efficientnet import EfficientNet_B1_Weights
from transformers import get_linear_schedule_with_warmup
from torchvision import models
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

def initialize_model(device, lr, dataloader_train, scheduler_name, MODEL, dropout, HIDDEN_LAYER, type='text', epochs=10):
    """Create a pretrained EfficientNet-B1 with its loss, optimizer and scheduler.

    Args:
        device: Device the model is moved to.
        lr: Learning rate of the AdamW optimizer.
        dataloader_train: Training dataloader; its length sizes the ``'warmup'``
            schedule.
        scheduler_name: ``'warmup'``, ``'reduce'`` or ``'exponential'``.
        MODEL, dropout, HIDDEN_LAYER, type: Accepted so the signature matches
            ``sentiment_analysis_functions.initialize_model``; not used here.
        epochs: Number of training epochs; sizes the ``'warmup'`` schedule.

    Returns:
        ``(model, criterion, optimizer, scheduler)``.

    Raises:
        ValueError: If ``scheduler_name`` is not one of the supported names.
    """
    # Reject unknown scheduler names before the pretrained weights are loaded;
    # previously this surfaced only at the return statement as an unbound name.
    supported_schedulers = ('warmup', 'reduce', 'exponential')
    if scheduler_name not in supported_schedulers:
        raise ValueError(f"scheduler_name must be one of {supported_schedulers}, got {scheduler_name!r}")

    # Alternative backbones; swap the active line to try one of them.
    # model = torch.hub.load('pytorch/vision:v0.10.0', 'resnext50_32x4d', pretrained=True)
    # model = models.resnet50(weights='DEFAULT')
    # model = models.resnet34(weights='DEFAULT')
    # model = models.resnext50_32x4d(weights='DEFAULT')
    # model = models.densenet161(weights='DEFAULT')
    # model = models.efficientnet_b7(weights=EfficientNet_B7_Weights.IMAGENET1K_V1)
    # NOTE(review): the classifier head of the pretrained network is not replaced,
    # so the output size stays that of the pretrained weights rather than the
    # number of sentiment classes — confirm this is intended.
    model = models.efficientnet_b1(weights=models.EfficientNet_B1_Weights.IMAGENET1K_V1)
    # model = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.IMAGENET1K_V1)
    # model = models.inception_v3(weights=Inception_V3_Weights.IMAGENET1K_V1)
    model.to(device)

    # Create the optimizer
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    if scheduler_name == 'warmup':
        # Total number of training steps: one per batch, over all epochs.
        total_steps = len(dataloader_train) * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=0,
                                                    num_training_steps=total_steps)
    elif scheduler_name == 'reduce':
        # NOTE(review): `verbose` is reported as deprecated by recent PyTorch releases — confirm against the installed version.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor = 0.1, patience=1, threshold=1e-4,verbose=True)
    else:  # 'exponential'
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9, last_epoch=-1, verbose=True)
    return model, criterion, optimizer, scheduler

* Create datasets
* Select image augmentations
* Create dataloaders
* Initialize image model
* Train/predict image model
* Extract image features

In [None]:
from sklearn.model_selection import train_test_split

### Suitable dataset for images, containing labels and transformations ###
class CustomDataset(Dataset):
    """Map-style dataset that pairs every image with its label.

    Args:
        images: Indexable collection of images.
        labels: Indexable collection of labels; its length is the dataset size.
        transform: Optional callable applied to an image when it is fetched.
        imageProcessor: Optional processor called on the transformed image; the
            first entry of its ``'pixel_values'`` output replaces the image. It
            is only used when ``transform`` is set as well.
    """

    def __init__(self, images, labels, transform=None, imageProcessor=None):
        self.images, self.labels = images, labels
        self.transform, self.imageProcessor = transform, imageProcessor

    def __len__(self):
        # The dataset size is defined by the labels, not by the images.
        return len(self.labels)

    def __getitem__(self, idx):
        sample, target = self.images[idx], self.labels[idx]

        if not self.transform:
            # Without a transform the raw image is returned and the processor is skipped too.
            return sample, target

        sample = self.transform(sample)
        if self.imageProcessor is not None:
            # NOTE(review): if the transform already scales pixels to [0, 1],
            # do_rescale=True may scale them a second time — confirm.
            processed = self.imageProcessor(sample, return_tensors='pt', do_rescale=True)
            sample = processed['pixel_values'][0]
        return sample, target

### Select the appropriate image processor ###
imageProcessor, image_mean, image_std, size = image_functions.choose_processor(IMAGE_MODEL)


### Uncomment the following lines to select image augmentation techniques ###
train_transform = Compose(
  [
    # ToPILImage(),
    # Resize((size, size)),
    # RandomHorizontalFlip(),
    # RandomRotation(10),
    # ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    # RandomAffine(degrees=0, translate=(0.1, 0.1)),
    ToTensor(),
    # Normalize(mean=image_mean, std=image_std),
    # ToPILImage()
  ]
)
test_transform = Compose(
  [
    ToTensor(),
    # Normalize(mean=image_mean,std=image_std),
    # ToPILImage()
  ]
)



### Select method for splitting data ###
train_images,test_images,valid_images,train_labels,test_labels,valid_labels = split_data(images,labels,image_labels,seed_val)
# train_images,test_images,valid_images,train_labels,test_labels,valid_labels = split_data_for_cv(images,image_labels,train_index,test_index,seed_val)




### If you use CNN models, you should use the commented lines, as imageProcessor is not needed then ###
train_dataset = CustomDataset(train_images,train_labels,train_transform, imageProcessor)
test_dataset = CustomDataset(test_images,test_labels,test_transform, imageProcessor)
valid_dataset = CustomDataset(valid_images,valid_labels,test_transform, imageProcessor)

# train_dataset = CustomDataset(train_images,train_labels,train_transform, None)
# test_dataset = CustomDataset(test_images,test_labels,test_transform, None)
# valid_dataset = CustomDataset(valid_images,valid_labels,test_transform, None)


### Create dataloaders; only the training set is shuffled, the test and validation sets keep their order ###
dataloader_train = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=image_batch_size,
    generator=generator
)

dataloader_test = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=image_batch_size,
    generator=generator
)

dataloader_valid = DataLoader(
    valid_dataset,
    shuffle=False,
    batch_size=image_batch_size,
    generator=generator
)

### Initialize model, train the model, evaluate on validation set, predict on test set ###
imageModel, loss_fn, optimizer, scheduler = sentiment_analysis_functions.initialize_model(device,image_lr,dataloader_train,image_scheduler_name,IMAGE_MODEL,image_dropout,image_HIDDEN_LAYER,'image',image_epochs)
# Uses the notebook-local train() defined above; the trailing positional arguments are epochs=image_epochs, evaluation=True, FREEZE=IMAGE_FREEZE.
predictions, test_labels, acc_scores, f1_scores = train(imageModel,dataloader_train,optimizer,scheduler,loss_fn,experiment,device,'image',dataloader_valid,dataloader_test,image_epochs,True,IMAGE_FREEZE)
# test_labels is rebound here to the labels returned by predict() for the test dataloader.
probs_test, test_labels = sentiment_analysis_functions.predict(imageModel, dataloader_test,device,'image')
probs_test = np.argmax(probs_test,1)  # Get the class with the highest probability
f1_test = f1_score(y_pred = probs_test, y_true = test_labels, average = 'weighted')
acc_test = accuracy_score(y_pred=probs_test,y_true= test_labels)
experiment.log_metric('Image f1 (test)',f1_test)
experiment.log_metric('Image accuracy (test)',acc_test)
print('f1 score (test): ',f1_test)
print('acc score (test): ',acc_test)


### Extract image features on the fine-tuned image model and store them on a file named image_logits.npy ###
# The features cover the whole dataset in its original order, using the augmentation-free test transform.
full_dataset = CustomDataset(images,image_labels, test_transform, imageProcessor)
dataloader = DataLoader(
    full_dataset,
    shuffle=False,
    batch_size=image_batch_size,
    generator=generator
)
# Only the `vit` sub-module is passed on, not the classifier.
# NOTE(review): a model without a `.vit` attribute (e.g. a CNN built by the local initialize_model) would fail here — confirm.
image_logits = sentiment_analysis_functions.calculate_logits(imageModel.vit, dataloader, device, 'image')
np.save('image_logits.npy',image_logits)

Save image model (Optional)

In [None]:
imageModel.vit.save_pretrained('./imageModel')
torch.save(imageModel.classifier.state_dict(), "imageClassifier.pt")
!zip -r ./imageModel.zip ./imageModel

  adding: imageModel/ (stored 0%)
  adding: imageModel/pytorch_model.bin (deflated 7%)
  adding: imageModel/config.json (deflated 45%)


Connect to Hugging Face hub (Optional)

In [None]:
# Interactive authentication against the Hugging Face Hub: notebook_login() renders a
# widget in the cell output, so no access token is hard-coded in the notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Upload model to Hugging Face hub (Optional)

In [None]:
# Upload the `vit` sub-module of the image model to the Hugging Face Hub
# (the separately saved classifier weights are not part of this upload).
# NOTE(review): the repo id ends in "mvsa-multiple" while this notebook's title says
# MVSA-Single — confirm this is the intended target repository before running.
imageModel.vit.push_to_hub('kostasGRG/vit-model-mvsa-multiple')

pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kostasGRG/vit-model-mvsa-multiple/commit/0960ccbf6f35bdcee111138e336130a0a8650ac7', commit_message='Upload model', commit_description='', oid='0960ccbf6f35bdcee111138e336130a0a8650ac7', pr_url=None, pr_revision=None, pr_num=None)

## Fusion

Concatenate the text and image features that we have already calculated or loaded from file.  
Split the data again into train/validation/test sets.

In [None]:
# Fuse the two modalities: each sample's text and image feature vectors are placed side by
# side (column-wise), so row i of text_logits and image_logits must describe the same sample.
multimodal = np.concatenate((text_logits, image_logits), axis=1)
# train_multimodal,test_multimodal,valid_multimodal,train_labels,test_labels,valid_labels = split_data_for_cv(multimodal,labels,train_index,test_index,seed_val)
# NOTE(review): `labels` is passed twice to split_data — presumably once as the targets and
# once as a second (e.g. stratification) argument; verify against split_data's signature.
train_multimodal, test_multimodal, valid_multimodal, train_labels, test_labels, valid_labels = split_data(
    multimodal, labels, labels, seed_val
)

# Convert every split to a single float32 tensor in one vectorised call instead of building
# one tensor per row in a Python loop and stacking them. torch.tensor() always copies, so the
# resulting tensors do not share memory with the NumPy data.
train_multimodal, test_multimodal, valid_multimodal = (
    torch.tensor(np.asarray(split), dtype=torch.float32)
    for split in (train_multimodal, test_multimodal, valid_multimodal)
)


* Create Dataloaders  
* Initialize multimodal model  
* Train multimodal model  
* Make the final predictions

In [None]:
# Width of one fused feature vector (number of columns of `multimodal`).
input_size = multimodal.shape[1]
# Number of outputs of the multimodal model — presumably the three sentiment classes.
output_size = 3

# Exhaustive sweep over every combination of the configured hyperparameter lists.
for multimodal_batch_size in multimodal_batch_sizes:
    # The loaders depend only on the batch size, so they are built once per batch size.
    # Only the training loader is created with the shuffle flag set to True.
    train_dataloader = sentiment_analysis_functions.create_dataloaders(
        train_multimodal, train_labels, multimodal_batch_size, generator, True, 'image'
    )
    test_dataloader = sentiment_analysis_functions.create_dataloaders(
        test_multimodal, test_labels, multimodal_batch_size, generator, False, 'image'
    )
    valid_dataloader = sentiment_analysis_functions.create_dataloaders(
        valid_multimodal, valid_labels, multimodal_batch_size, generator, False, 'image'
    )

    for multimodal_scheduler_name in multimodal_scheduler_names:
        for multimodal_hidden_size in multimodal_hidden_sizes:
            for multimodal_last_hidden_size in multimodal_last_hidden_sizes:
                for multimodal_lr in multimodal_learning_rates:
                    # Record the current combination on the Comet experiment.
                    multimodal_hyperparameters = {
                        'multimodal batch size': multimodal_batch_size,
                        'multimodal epochs': multimodal_epochs,
                        'multimodal learning rate': multimodal_lr,
                        'multimodal scheduler': multimodal_scheduler_name,
                        'multimodal hidden size': multimodal_hidden_size,
                        'multimodal last hidden size': multimodal_last_hidden_size,
                        'multimodal Model': MULTIMODAL_MODEL,
                    }
                    experiment.log_parameters(multimodal_hyperparameters)

                    # A fresh model/optimizer/scheduler is created for every combination.
                    # The literal 4 is presumably the number of attention heads (a sweep over
                    # `num_heads` used to exist here and is disabled) — verify against
                    # multimodal_functions.initialize_model.
                    model, loss_fn, optimizer, scheduler = multimodal_functions.initialize_model(
                        train_dataloader,
                        multimodal_lr,
                        multimodal_epochs,
                        multimodal_hidden_size,
                        multimodal_last_hidden_size,
                        4,
                        input_size,
                        output_size,
                        MULTIMODAL_MODEL,
                        device,
                        multimodal_scheduler_name,
                    )
                    predictions, last_valid_labels, accuracy_scores, f1_scores = train(
                        model,
                        train_dataloader,
                        optimizer,
                        scheduler,
                        loss_fn,
                        experiment,
                        device,
                        'image',
                        valid_dataloader,
                        test_dataloader,
                        multimodal_epochs,
                        True,
                    )

                    # Score the trained model on the test loader.
                    probs_test, test_dataloader_labels = sentiment_analysis_functions.predict(
                        model, test_dataloader, device, 'image'
                    )
                    # Index of the highest-scoring class for every sample.
                    preds = np.argmax(probs_test, axis=1)
                    acc = accuracy_score(y_true=test_dataloader_labels, y_pred=preds)
                    f1 = f1_score(y_true=test_dataloader_labels, y_pred=preds, average='weighted')

                    experiment.log_metric('multimodal acc (test)', acc)
                    experiment.log_metric('multimodal f1 (test)', f1)
                    print('acc = ', acc)
                    print('f1 = ', f1)

# log your Pytorch model
# log_model(experiment, textModel, model_name=TEXT_MODEL)
# log_model(experiment, imageModel, model_name=IMAGE_MODEL)
# log_model(experiment, model, model_name=MULTIMODAL_MODEL)

# Close the Comet experiment once the whole sweep has finished.
experiment.end()

Adjusting learning rate of group 0 to 3.0000e-05.
Start training...



  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 0
Training loss: 0.4503623191267252


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.7670604190949736
F1 Score (weighted): 0.7153196259569561
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7041106926018434
Test Accuracy Score: 0.7117516629711752
Adjusting learning rate of group 0 to 2.7000e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.2579926996846246


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.8133404843252281
F1 Score (weighted): 0.7211280974216381
Accuracy Score: 0.7300884955752213


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.715592111349426
Test Accuracy Score: 0.720620842572062
Adjusting learning rate of group 0 to 2.4300e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.24265992190622915


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.8751780575719373
F1 Score (weighted): 0.7136347982344914
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7110058718327654
Test Accuracy Score: 0.7161862527716186
Adjusting learning rate of group 0 to 2.1870e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.23687058915507742


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.8889877945184708
F1 Score (weighted): 0.7106046837059494
Accuracy Score: 0.7212389380530974


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7180475491207416
Test Accuracy Score: 0.7250554323725056
Adjusting learning rate of group 0 to 1.9683e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.22880109936097817


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9054405129161375
F1 Score (weighted): 0.7135524097855257
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.705382301204106
Test Accuracy Score: 0.7117516629711752
Adjusting learning rate of group 0 to 1.7715e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.22155772772642363


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9046973776200722
F1 Score (weighted): 0.703985205557862
Accuracy Score: 0.7146017699115044


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7063835972613
Test Accuracy Score: 0.7117516629711752
Adjusting learning rate of group 0 to 1.5943e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.2165094407242352


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9344069901211508
F1 Score (weighted): 0.7082722421884904
Accuracy Score: 0.7190265486725663


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7035505184456287
Test Accuracy Score: 0.7095343680709535
Adjusting learning rate of group 0 to 1.4349e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.21311534134943072


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9385344794598119
F1 Score (weighted): 0.7113373380634069
Accuracy Score: 0.7256637168141593


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7079372974506768
Test Accuracy Score: 0.7139689578713969
Adjusting learning rate of group 0 to 1.2914e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.2094239890328155


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9339777493271334
F1 Score (weighted): 0.7100219687754187
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7108528779151572
Test Accuracy Score: 0.7161862527716186
Adjusting learning rate of group 0 to 1.1623e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.20419063585999925


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9444780026016564
F1 Score (weighted): 0.7105288947582482
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7086680829160739
Test Accuracy Score: 0.7139689578713969
Adjusting learning rate of group 0 to 1.0460e-05.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 10
Training loss: 0.2007413291479502


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9442073384235645
F1 Score (weighted): 0.7066130501418314
Accuracy Score: 0.7168141592920354


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7140534866859937
Test Accuracy Score: 0.7184035476718403
Adjusting learning rate of group 0 to 9.4143e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 11
Training loss: 0.19800941518351303


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9623021519389646
F1 Score (weighted): 0.7110055782142963
Accuracy Score: 0.7212389380530974


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7092838250877533
Test Accuracy Score: 0.7139689578713969
Adjusting learning rate of group 0 to 8.4729e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 12
Training loss: 0.19606236589091738


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.965612783000387
F1 Score (weighted): 0.7095951329497177
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7105655476387668
Test Accuracy Score: 0.7161862527716186
Adjusting learning rate of group 0 to 7.6256e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 13
Training loss: 0.19193139825164612


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9668629220847426
F1 Score (weighted): 0.7098972871562531
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7142565417899871
Test Accuracy Score: 0.720620842572062
Adjusting learning rate of group 0 to 6.8630e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 14
Training loss: 0.18959944414395213


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9924032056126101
F1 Score (weighted): 0.7119449004217091
Accuracy Score: 0.7256637168141593


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7098223782313
Test Accuracy Score: 0.7184035476718403
Adjusting learning rate of group 0 to 6.1767e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 15
Training loss: 0.18779921985078868


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9876370203906092
F1 Score (weighted): 0.7078536246660504
Accuracy Score: 0.7212389380530974


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7064415394124339
Test Accuracy Score: 0.7139689578713969
Adjusting learning rate of group 0 to 5.5591e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 16
Training loss: 0.18655855137755914


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9812512490256079
F1 Score (weighted): 0.712981340084504
Accuracy Score: 0.7256637168141593


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7104115844741664
Test Accuracy Score: 0.7161862527716186
Adjusting learning rate of group 0 to 5.0032e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 17
Training loss: 0.1833961455169983


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9768951247478354
F1 Score (weighted): 0.7114948756237844
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7135449138281852
Test Accuracy Score: 0.7184035476718403
Adjusting learning rate of group 0 to 4.5028e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 18
Training loss: 0.18189107380659048


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9915969808553827
F1 Score (weighted): 0.7127964483459165
Accuracy Score: 0.7256637168141593


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7090762771313043
Test Accuracy Score: 0.7139689578713969
Adjusting learning rate of group 0 to 4.0526e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 19
Training loss: 0.18092705265297432


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.991295565305085
F1 Score (weighted): 0.7113204832968845
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7115469689592128
Test Accuracy Score: 0.7161862527716186
Adjusting learning rate of group 0 to 3.6473e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 20
Training loss: 0.17964988473217638


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9930377350798969
F1 Score (weighted): 0.7114948756237844
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.714991820759306
Test Accuracy Score: 0.720620842572062
Adjusting learning rate of group 0 to 3.2826e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 21
Training loss: 0.17810082215964135


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9947114895130026
F1 Score (weighted): 0.7134591646629892
Accuracy Score: 0.7256637168141593


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7172726505027044
Test Accuracy Score: 0.7228381374722838
Adjusting learning rate of group 0 to 2.9543e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 22
Training loss: 0.17738697078672394


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 1.0012897694933003
F1 Score (weighted): 0.7143288345911192
Accuracy Score: 0.7278761061946902


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7197010497990135
Test Accuracy Score: 0.7250554323725056
Adjusting learning rate of group 0 to 2.6589e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 23
Training loss: 0.17698679969283398


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 0.9940895741355831
F1 Score (weighted): 0.7114948756237844
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7135449138281852
Test Accuracy Score: 0.7184035476718403
Adjusting learning rate of group 0 to 2.3930e-06.




  0%|          | 0/226 [00:00<?, ?it/s]


Epoch 24
Training loss: 0.1752930036509014


  0%|          | 0/29 [00:00<?, ?it/s]

Validation loss: 1.0002573379154862
F1 Score (weighted): 0.7114948756237844
Accuracy Score: 0.7234513274336283


  0%|          | 0/29 [00:00<?, ?it/s]

Test F1 Score (weighted): 0.7183256669818731
Test Accuracy Score: 0.7250554323725056
Adjusting learning rate of group 0 to 2.1537e-06.


Training complete!
acc =  0.7250554323725056
f1 =  0.7183256669818731


In [None]:
# Store only the weights (state_dict) of `model`. After the sweep cell, `model` is whatever
# was created in its last iteration, so with several hyperparameter combinations this is the
# last one tried, not necessarily the best one.
torch.save(obj=model.state_dict(), f="multimodalModel.pt")