# Multimodal sentiment analysis (text/image) in MVSA dataset

## Reading data from the original MVSA-Single / MVSA-Multiple datasets

### Import libraries, get access to the texts, images and labels of the dataset


Import necessary libraries that will be used in this notebook

In [None]:
from zipfile import ZipFile
import cv2
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import h5py

# MVSA_SINGLE: boolean switch read by every cell below.
# If True, the MVSA-Single dataset is processed; otherwise MVSA-Multiple is processed.
MVSA_SINGLE = True

Unzip the archive of the selected dataset into `./data`

In [None]:
# Pick the archive that matches the selected dataset variant.
if MVSA_SINGLE:
  zip_path = '/content/drive/MyDrive/sentiment-analysis/notebooks/MVSA-Single.zip'
else:
  zip_path = '/content/drive/MyDrive/sentiment-analysis/notebooks/MVSA-Multiple.zip'

# The context manager closes the archive even when extraction fails part-way.
with ZipFile(zip_path, 'r') as zf:
  zf.extractall('./data')

Get data and label paths

In [None]:
# Locations of the extracted sample folder and of the label file.
# Only the variables of the selected dataset variant are defined.
if MVSA_SINGLE:
  mvsa_single_data_path = './data/MVSA_Single/data'
  mvsa_single_label_path = './data/MVSA_Single/labelResultAll.txt'
else:
  mvsa_multiple_data_path = './data/MVSA/data'
  mvsa_multiple_label_path = './data/MVSA/labelResultAll.txt'

# IMAGE_SIZE is the target size handed to cv2.resize; together with NUM_CHANNELS it
# also fixes the shape of the blank placeholder used for unreadable images.
IMAGE_SIZE = (224, 224)
NUM_CHANNELS = 3

### Create functions

In [None]:
def read_text_file(path):
    """Read a single text file and return its whole content as one string.

    path: Path of the text file; it is decoded as latin-1.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(path, 'r', encoding='latin-1') as text_file:
        return text_file.read()

def read_image_file(path):
    """Read a single image file and resize it to IMAGE_SIZE.

    path: Path of the image file; its filename stem must be the integer sample ID.

    Returns a tuple (image, invalid_ID):
      - image: the resized image with its channel axis reversed, or an all-zero
        uint8 placeholder when the image could not be loaded or resized.
      - invalid_ID: -1 on success, otherwise the sample ID parsed from the filename.
    """
    try:
        # Reverse the last axis of what cv2.imread returns (channel-order flip).
        image = cv2.imread(path)[:, :, ::-1]
        image = cv2.resize(image, IMAGE_SIZE, interpolation=cv2.INTER_AREA)
        invalid_ID = -1
    except Exception:
        # Catch ordinary errors only, so KeyboardInterrupt / SystemExit still
        # stop a long-running read loop. The placeholder is uint8 because the
        # collected images are later stored as uint8.
        image = np.zeros((IMAGE_SIZE[0], IMAGE_SIZE[1], NUM_CHANNELS), dtype='uint8')
        invalid_ID = int(os.path.split(path)[1].split('.')[0])
    return image, invalid_ID

def read_labels_file(path):
    """Read the label file into a pandas DataFrame.

    path: Path of the label file. Fields may be separated by any run of
          whitespace or by a comma.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return pd.read_csv(path, sep=r"\s+|,", engine="python")


def get_data_paths(path, extension):
    ''' List the files of a folder that end with the given extension,
    ordered by the integer ID at the start of their filename.
    path: Folder path
    extension: File extension to keep
    Returns the full paths (folder joined with filename) in ascending ID order.
    '''
    def numeric_id(filename):
        # The part before the first dot is the sample ID.
        return int(filename.split('.')[0])

    matching = [name for name in os.listdir(path) if name.endswith(extension)]
    return [os.path.join(path, name) for name in sorted(matching, key=numeric_id)]

def get_image_with_id(path):
    """Return the image stored at `path` together with its sample ID.

    path: Path of the image file; its filename stem must be the integer sample ID.

    Returns a tuple (ID, image).
    """
    filename = os.path.split(path)[1]
    ID = int(filename.split('.')[0])
    # read_image_file returns (image, invalid_ID); keep only the image so the
    # result really is (ID, image) rather than (ID, (image, invalid_ID)).
    image, _ = read_image_file(path)
    return (ID, image)

def multimodal_label(text_label, image_label):
    """Combine the text label and the image label into one multimodal label.

    Rules, applied in order:
      1. Both labels are equal -> that label is returned.
      2. One label is 'positive' and the other 'negative' -> 'invalid'
         (the pair is contradictory and is dropped later).
      3. Otherwise -> the image label if the text label is 'neutral',
         else the text label (i.e. the non-neutral label wins).
    """
    if text_label == image_label:
        return text_label
    if (text_label, image_label) in (('positive', 'negative'), ('negative', 'positive')):
        return 'invalid'
    # Every remaining combination falls through to rule 3. The previous explicit
    # condition for this branch was always true, so the result is unchanged.
    return image_label if text_label == 'neutral' else text_label

def create_text_data(path):
    """Collect every text of the dataset folder, ordered by sample ID.

    path: Folder that holds the '.txt' files.
    Returns a list of strings with trailing newlines stripped.
    """
    text_paths = get_data_paths(path, '.txt')

    print('Reading text data')
    return [read_text_file(text_path).rstrip('\n') for text_path in tqdm(text_paths)]

def create_image_data(path):
    """Collect every image of the dataset folder, ordered by sample ID.

    path: Folder that holds the '.jpg' files.

    Returns a tuple (images, invalid_indices):
      - images: uint8 array built from all images; unreadable images are
        represented by the placeholder returned by read_image_file.
      - invalid_indices: positions (0-based, in the ID-sorted order) of the
        images that could not be read.
    """
    images = []
    invalid_indices = []
    image_paths = get_data_paths(path, '.jpg')

    print('Reading image data')
    for index, image_path in enumerate(tqdm(image_paths)):
        image, invalid_ID = read_image_file(image_path)
        images.append(image)

        if invalid_ID != -1:
            # Record the position, not the file ID: the callers pass this list to
            # remove_invalid, which filters by position, and file IDs do not
            # coincide with positions.
            invalid_indices.append(index)

    images = np.array(images, dtype='uint8')
    return images, invalid_indices

In [None]:
def merge_multi_label(dataframe):
    """Merge the three annotator label pairs of each row into a single pair.

    Each modality (text, image) gets the label chosen by at least two of the
    three annotators. If either modality has no such majority, both labels of
    the row become 'invalid'.

    dataframe: Frame whose first column is the sample ID and which has the
               columns 'text', 'image', 'text.1', 'image.1', 'text.2', 'image.2'.
    Returns a new DataFrame with the columns 'ID', 'text' and 'image'.
    """
    def majority(votes):
        # Most frequent vote (first one on ties) and whether it occurs more than once.
        winner = max(votes, key=votes.count)
        return winner, votes.count(winner) > 1

    text_columns = [dataframe[name].tolist() for name in ('text', 'text.1', 'text.2')]
    image_columns = [dataframe[name].tolist() for name in ('image', 'image.1', 'image.2')]
    sample_ids = list(dataframe.iloc[:, 0])

    merged_rows = []
    for sample_id, text_votes, image_votes in zip(sample_ids, zip(*text_columns), zip(*image_columns)):
        text_winner, text_agreed = majority(text_votes)
        image_winner, image_agreed = majority(image_votes)

        if text_agreed and image_agreed:
            merged_rows.append((sample_id, text_winner, image_winner))
        else:
            merged_rows.append((sample_id, 'invalid', 'invalid'))

    return pd.DataFrame(merged_rows, columns=['ID', 'text', 'image'])

def create_multimodal_labels(path, multiple=False, mappings=False):
    """Build the multimodal label of every sample listed in the label file.

    path: Path of the label file.
    multiple: If True, the three annotator pairs are first merged with
              merge_multi_label (MVSA-Multiple layout).
    mappings: If True, return a dict that maps the value of the first column
              (the sample ID) to its multimodal label.
    Returns an object array of labels, or the dict described above.
    """
    dataframe = read_labels_file(path)

    if multiple == True:
        dataframe = merge_multi_label(dataframe)

    labels = [
        multimodal_label(text_label, image_label)
        for text_label, image_label in dataframe.loc[:, ['text', 'image']].values
    ]

    if mappings == True:
        return {dataframe.iloc[row, 0]: labels[row] for row in range(len(labels))}

    return np.array(labels, dtype='object')

def create_original_labels(path, multiple=False):
    """Return the per-modality labels stored in the label file.

    path: Path of the label file.
    multiple: If True, the three annotator pairs are first merged with
              merge_multi_label (MVSA-Multiple layout).
    Returns a tuple (text_labels, image_labels) of NumPy arrays.
    """
    labels_frame = read_labels_file(path)
    if multiple == True:
        labels_frame = merge_multi_label(labels_frame)
    return labels_frame['text'].to_numpy(), labels_frame['image'].to_numpy()

def remove_invalid(data, indices):
    """Return the items of `data` whose position is not listed in `indices`.

    data: Indexable sequence (list or array) of samples.
    indices: Iterable of 0-based positions to drop.
    Returns a new list; `data` itself is not modified.
    """
    # A set makes every membership test O(1); with the original list lookup the
    # filter was quadratic in the worst case.
    invalid_positions = set(indices)
    return [item for position, item in enumerate(data) if position not in invalid_positions]

### Cleaning dataset for multimodal analysis

In [None]:
if MVSA_SINGLE:
  # Get texts, images, labels and create multimodal labels
  mvsa_single_texts = create_text_data(mvsa_single_data_path)
  mvsa_single_images, mvsa_single_images_invalid_indices = create_image_data(mvsa_single_data_path)
  mvsa_single_multimodal_labels = create_multimodal_labels(mvsa_single_label_path)
  mvsa_single_text_labels, mvsa_single_image_labels = create_original_labels(mvsa_single_label_path)
  num_mvsa_single = len(mvsa_single_texts)

  # Exclude pairs with invalid indices, either because of a corrupted image or an 'invalid' multimodal label
  # NOTE(review): remove_invalid filters by position in the ID-sorted lists, so the
  # image indices must be positions as well - confirm against create_image_data.
  mvsa_single_multimodal_labels_invalid_indices = [i for i in range(num_mvsa_single) if mvsa_single_multimodal_labels[i] == 'invalid']
  print('Number of text-image pair in MVSA-Single:', num_mvsa_single)
  mvsa_single_invalid_indices = []
  mvsa_single_invalid_indices.extend(mvsa_single_images_invalid_indices) # corrupted images
  mvsa_single_invalid_indices.extend(mvsa_single_multimodal_labels_invalid_indices)
  mvsa_single_invalid_indices = list(set(mvsa_single_invalid_indices))
  print('Number of invalid data in MVSA-Single:', len(mvsa_single_invalid_indices))
  mvsa_single_texts_valid = remove_invalid(mvsa_single_texts, mvsa_single_invalid_indices)
  mvsa_single_images_valid = remove_invalid(mvsa_single_images, mvsa_single_invalid_indices)
  mvsa_single_multimodal_labels_valid = remove_invalid(mvsa_single_multimodal_labels, mvsa_single_invalid_indices)
  mvsa_single_text_labels_valid = remove_invalid(mvsa_single_text_labels, mvsa_single_invalid_indices)
  mvsa_single_image_labels_valid = remove_invalid(mvsa_single_image_labels, mvsa_single_invalid_indices)
  num_mvsa_single_valid = len(mvsa_single_texts_valid)
  print('Number of text-image pair in MVSA-Single after removing invalid data:', num_mvsa_single_valid)

  # save the cleaned dataset; the number of valid pairs is part of the filename
  mvsa_single_output_path = 'mvsa-single-{}.hdf5'.format(num_mvsa_single_valid)
  with h5py.File(mvsa_single_output_path, 'w') as f:
      f.create_dataset('texts', data = mvsa_single_texts_valid)
      f.create_dataset('images', data = mvsa_single_images_valid)
      f.create_dataset('multimodal-labels', data = mvsa_single_multimodal_labels_valid)
      f.create_dataset('text-labels', data = mvsa_single_text_labels_valid)
      f.create_dataset('image-labels', data = mvsa_single_image_labels_valid)

  # Download exactly the file written above instead of a hard-coded name in another
  # folder, which breaks as soon as the number of valid pairs changes.
  # google.colab only exists inside Colab, hence the local import.
  from google.colab import files
  files.download(mvsa_single_output_path)

else:
  # Get texts, images, labels and create multimodal labels
  mvsa_multiple_texts = create_text_data(mvsa_multiple_data_path)
  mvsa_multiple_images, mvsa_multiple_images_invalid_indices = create_image_data(mvsa_multiple_data_path)
  mvsa_multiple_multimodal_labels = create_multimodal_labels(mvsa_multiple_label_path, multiple=True)
  mvsa_multiple_text_labels, mvsa_multiple_image_labels = create_original_labels(mvsa_multiple_label_path, multiple=True)
  num_mvsa_multiple = len(mvsa_multiple_texts)

  # Exclude pairs with invalid indices, either because of a corrupted image or an 'invalid' multimodal label
  # NOTE(review): remove_invalid filters by position in the ID-sorted lists, so the
  # image indices must be positions as well - confirm against create_image_data.
  mvsa_multiple_multimodal_labels_invalid_indices = [i for i in range(num_mvsa_multiple) if mvsa_multiple_multimodal_labels[i] == 'invalid']
  print('Number of text-image pair in MVSA-Multiple:', num_mvsa_multiple)
  mvsa_multiple_invalid_indices = []
  mvsa_multiple_invalid_indices.extend(mvsa_multiple_images_invalid_indices)
  print('Number of invalid data in images: ',len(mvsa_multiple_invalid_indices))
  mvsa_multiple_invalid_indices.extend(mvsa_multiple_multimodal_labels_invalid_indices)
  mvsa_multiple_invalid_indices = list(set(mvsa_multiple_invalid_indices))
  print('Number of invalid data in MVSA-Multiple:', len(mvsa_multiple_invalid_indices))
  mvsa_multiple_texts_valid = remove_invalid(mvsa_multiple_texts, mvsa_multiple_invalid_indices)
  mvsa_multiple_images_valid = remove_invalid(mvsa_multiple_images, mvsa_multiple_invalid_indices)
  mvsa_multiple_multimodal_labels_valid = remove_invalid(mvsa_multiple_multimodal_labels, mvsa_multiple_invalid_indices)
  mvsa_multiple_text_labels_valid = remove_invalid(mvsa_multiple_text_labels, mvsa_multiple_invalid_indices)
  mvsa_multiple_image_labels_valid = remove_invalid(mvsa_multiple_image_labels, mvsa_multiple_invalid_indices)
  num_mvsa_multiple_valid = len(mvsa_multiple_texts_valid)
  print('Number of text-image pair in MVSA-Multiple after removing invalid data:', num_mvsa_multiple_valid)

  # save the cleaned dataset; the number of valid pairs is part of the filename
  mvsa_multiple_output_path = 'mvsa-multiple-{}.hdf5'.format(num_mvsa_multiple_valid)
  with h5py.File(mvsa_multiple_output_path, 'w') as f:
      f.create_dataset('texts', data = mvsa_multiple_texts_valid)
      f.create_dataset('images', data = mvsa_multiple_images_valid)
      f.create_dataset('multimodal-labels', data = mvsa_multiple_multimodal_labels_valid)
      f.create_dataset('text-labels', data = mvsa_multiple_text_labels_valid)
      f.create_dataset('image-labels', data = mvsa_multiple_image_labels_valid)

  # Download exactly the file written above instead of a hard-coded name.
  # google.colab only exists inside Colab, hence the local import.
  from google.colab import files
  files.download(mvsa_multiple_output_path)

Reading text data


100%|██████████| 4869/4869 [00:00<00:00, 37579.80it/s]


Reading image data


100%|██████████| 4869/4869 [00:59<00:00, 81.53it/s] 


### Cleaning dataset for text only analysis

In [None]:
if MVSA_SINGLE:
  # Texts of every sample plus the text label taken from the label file
  mvsa_single_texts = create_text_data(mvsa_single_data_path)
  num_mvsa_single = len(mvsa_single_texts)
  mvsa_single_text_labels = create_original_labels(mvsa_single_label_path)[0]

  # Write the text-only dataset; the number of texts is part of the filename
  with h5py.File(f'mvsa-single-{num_mvsa_single}.hdf5', 'w') as f:
    f.create_dataset('texts', data=mvsa_single_texts)
    f.create_dataset('text-labels', data=mvsa_single_text_labels)
else:
  # Texts of every sample plus the merged (majority-vote) text label
  mvsa_multiple_texts = create_text_data(mvsa_multiple_data_path)
  num_mvsa_multiple = len(mvsa_multiple_texts)
  mvsa_multiple_text_labels = create_original_labels(mvsa_multiple_label_path, multiple=True)[0]
  print(mvsa_multiple_text_labels)

  # Write the text-only dataset; the number of texts is part of the filename
  with h5py.File(f'mvsa-multiple-{num_mvsa_multiple}.hdf5', 'w') as f:
    f.create_dataset('texts', data=mvsa_multiple_texts)
    f.create_dataset('text-labels', data=mvsa_multiple_text_labels)

### Cleaning dataset for image only analysis

In [None]:
if MVSA_SINGLE:
  # Get images and image labels
  mvsa_single_images, invalid_indices = create_image_data(mvsa_single_data_path)
  temp,mvsa_single_image_labels = create_original_labels(mvsa_single_label_path)
  num_mvsa_single = len(mvsa_single_images)
  # NOTE(review): unreadable images are only counted here, not removed - confirm this is intended.
  print(len(invalid_indices))

  # Store the dataset consisting only of images.
  # The '_image' suffix (same convention as the MVSA-Multiple branch below) keeps this
  # file from overwriting the text-only file, which is named 'mvsa-single-{n}.hdf5'
  # with the same sample count.
  with h5py.File('mvsa-single-{}_image.hdf5'.format(num_mvsa_single), 'w') as f:
    f.create_dataset('images', data = mvsa_single_images)
    f.create_dataset('image-labels', data = mvsa_single_image_labels)
else:
  # Get images and image labels
  mvsa_multiple_images, invalid_indices = create_image_data(mvsa_multiple_data_path)
  temp,mvsa_multiple_image_labels = create_original_labels(mvsa_multiple_label_path,multiple=True)
  num_mvsa_multiple = len(mvsa_multiple_images)
  print(len(invalid_indices))

  # Remove corrupted images and samples whose merged image label is 'invalid'
  mvsa_multiple_image_labels_invalid_indices = [i for i in range(num_mvsa_multiple) if mvsa_multiple_image_labels[i] == 'invalid']
  print('Number of text-image pair in MVSA-Multiple:', num_mvsa_multiple)
  mvsa_multiple_invalid_indices = []
  mvsa_multiple_invalid_indices.extend(invalid_indices)
  mvsa_multiple_invalid_indices.extend(mvsa_multiple_image_labels_invalid_indices)
  # This count is taken before de-duplication, so an index reported by both sources is counted twice.
  print('Number of invalid data in images: ',len(mvsa_multiple_invalid_indices))
  mvsa_multiple_invalid_indices = list(set(mvsa_multiple_invalid_indices))
  mvsa_multiple_images_valid = remove_invalid(mvsa_multiple_images, mvsa_multiple_invalid_indices)
  mvsa_multiple_image_labels_valid = remove_invalid(mvsa_multiple_image_labels, mvsa_multiple_invalid_indices)
  num_mvsa_multiple_valid = len(mvsa_multiple_images_valid)
  print('Number of text-image pair in MVSA-Multiple after removing invalid data:', num_mvsa_multiple_valid)
  print(np.unique(mvsa_multiple_image_labels_valid))

  # Store the cleaned dataset consisting only of images
  with h5py.File('mvsa-multiple-{}_image.hdf5'.format(num_mvsa_multiple_valid), 'w') as f:
    f.create_dataset('images', data = mvsa_multiple_images_valid)
    f.create_dataset('image-labels', data = mvsa_multiple_image_labels_valid)