# Extract predictions from the VADER tool, a lexicon-based approach to sentiment analysis

## Import libraries

Connect to Google Drive to access the dataset files

In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so that the dataset
# files stored on Drive can be opened by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Install/Import necessary libraries

In [None]:
!pip install vaderSentiment
!pip install contractions
!pip install sentencepiece

import h5py
import numpy as np
import re #regular expressions
import contractions
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



## Helper functions

In [1]:
def read_hdf5(path):
    """
    Read every top-level entry of an HDF5 file into memory.

    @param    path: location of the HDF5 file to open (read-only).
    @return   list of (name, data) tuples, one per key of the file, where
              data is the dataset content loaded as a numpy array. Datasets
              with object dtype are decoded from UTF-8 bytes into strings.
    """
    loaded_data = []

    # The context manager guarantees the file handle is released, even when
    # reading one of the datasets raises. Slicing with [:] copies the content
    # into memory, so the returned arrays stay valid after the file is closed.
    with h5py.File(path, 'r') as read_file:
        for name in read_file.keys():
            dataset = read_file[name][:]
            # Object-dtype datasets hold byte strings here; turn them into str.
            if dataset.dtype == np.dtype('object'):
                dataset = np.array([x.decode('utf-8') for x in dataset])
            loaded_data.append((name, dataset))

    return loaded_data

def load_mvsa_data(path, mode):
    """
    Load the MVSA dataset that was stored after the cleaning step.

    @param    path: location of the HDF5 file, forwarded to read_hdf5.
    @param    mode (int): which part of the dataset to return:
              1 = text/image pairs, 2 = texts only, 3 = images only.
    @return   mode 1: (texts, images, labels) using 'multimodal-labels'
              mode 2: (texts, text_labels) using 'text-labels'
              mode 3: (images, image_labels) using 'image-labels'
    @raise    ValueError: if mode is not 1, 2 or 3.
    @raise    KeyError: if the file lacks a dataset required by the mode.
    """
    # Dataset names to return, in order, for each supported mode.
    datasets_by_mode = {
        1: ('texts', 'images', 'multimodal-labels'),
        2: ('texts', 'text-labels'),
        3: ('images', 'image-labels'),
    }

    # Fail loudly on an unsupported mode instead of silently returning None,
    # which would only surface later as an unpacking error in the caller.
    if mode not in datasets_by_mode:
        raise ValueError(
            "mode must be 1 (multimodal), 2 (text only) or 3 (image only), "
            "got {!r}".format(mode)
        )
    wanted = datasets_by_mode[mode]

    data = dict(read_hdf5(path))

    # Report exactly which datasets are absent rather than failing with an
    # unrelated unbound-variable error.
    missing = [name for name in wanted if name not in data]
    if missing:
        raise KeyError(
            "missing dataset(s) {} in '{}'".format(', '.join(missing), path)
        )

    return tuple(data[name] for name in wanted)

# Apply selected preprocessing steps on the texts of the dataset
def text_preprocessing(text):
    """
    Clean a single text before it is scored:
    - Remove retweet prefixes together with their mention (eg. 'RT @united: ')
    - Remove entity mentions (eg. '@united'), including one at the end of the text
    - Remove the '#' symbol, keeping the hashtag word itself
    - Correct errors (eg. '&amp;' to '&')
    - Collapse runs of whitespace and strip leading/trailing whitespace
    - Expand contractions (eg. "I'll" to "I will")
    @param    text (str): a string to be processed.
    @return   text (str): the processed string.
    """
    # A mention ends at the next whitespace character OR at the end of the
    # text; requiring whitespace would leave a trailing '@name' untouched.
    # The retweet form goes first so the 'RT ' marker is removed along with
    # its mention.
    text = re.sub(r'RT @\S*(?:\s|$)', '', text)
    text = re.sub(r'@\S*(?:\s|$)', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'&amp;', '&', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = contractions.fix(text)
    return text

# Calculate sentiment scores using the VADER tool for sentiment analysis
def sentiment_scores_from_vader(texts):
    """
    Score every text with VADER's SentimentIntensityAnalyzer.

    @param    texts: iterable of strings to be scored.
    @return   2-D numpy array with one row per text and four columns, in
              this order: the 'neg', 'neu', 'pos' and 'compound' entries of
              the dictionary returned by polarity_scores for that text.
    """
    score_keys = ('neg', 'neu', 'pos', 'compound')
    analyzer = SentimentIntensityAnalyzer()

    # One score dictionary per text, in input order.
    all_scores = [analyzer.polarity_scores(sentence) for sentence in texts]

    # Build one list per score type, then transpose so that each row
    # corresponds to a text. Going through columns keeps the (0, 4) shape
    # when no texts are given.
    columns = [[scores[key] for scores in all_scores] for key in score_keys]
    return np.array(columns).T

## Calculate and save the predictions of the VADER tool

In [None]:
# Location of the cleaned MVSA dataset on the mounted Google Drive.
# Alternative dataset (disabled); NOTE(review): its name suggests a text-only
# file, which would presumably need mode=2 below — confirm before switching.
# TEXT_DATA_PATH = './drive/My Drive/sentiment-analysis/mvsa-multiple-19600_text.hdf5'
TEXT_DATA_PATH = './drive/My Drive/sentiment-analysis/mvsa-single-4511_multimodal.hdf5'
# Mode 1 returns (texts, images, labels); only the texts are needed here.
texts,_,_= load_mvsa_data(TEXT_DATA_PATH, 1)

# Clean every text, then compute its VADER scores (one row per text).
texts = [text_preprocessing(text) for text in texts]
vaderValues = sentiment_scores_from_vader(texts)

# Store the score array in the current working directory for later use.
np.save("vader_values.npy",vaderValues)