In [None]:
# This code is to show experiments with Neural and Recursive Neural Networks

In [2]:
import pandas as pd
eb = pd.read_csv("emobank.csv", index_col=0 ,engine='python')
eb.reset_index(drop=True, inplace=True)
eb = eb.drop(labels='split', axis=1)
eb_list = eb['text'].values.tolist()

In [3]:
# Based on https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90
# Data cleaning
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
import re

tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'html.parser')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()
testing = eb_list
test_result = []
for t in testing:
    test_result.append(tweet_cleaner(t))



In [4]:
eb_list_V = test_result
eb_list_A = test_result
eb_list_D = test_result

In [5]:
# Feature extraction
# This function is taken from 
# https://developers.google.com/machine-learning/guides/text-classification/step-3
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

# Vectorization parameters
# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 500

def sequence_vectorize(train_texts, val_texts):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index, max_length

In [6]:
# Feature 
# Preprocessing function with Tf-idf
# Based on code from:
# https://developers.google.com/machine-learning/guides/text-classification/step-3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts. This way no leaking of information is done.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val, 

In [7]:
from sklearn.model_selection import train_test_split

y_V = eb['V']
y_A = eb['A']
y_D = eb['D']
X_Vtrain, X_Vtest, y_Vtrain, y_Vtest = train_test_split(eb_list_V, y_V, test_size=0.3)
X_Atrain, X_Atest, y_Atrain, y_Atest = train_test_split(eb_list_A, y_A, test_size=0.3)
X_Dtrain, X_Dtest, y_Dtrain, y_Dtest = train_test_split(eb_list_D, y_D, test_size=0.3)

In [8]:
# Preprocess the data with ngram
#X_Vtrain, X_Vtest = ngram_vectorize(X_Vtrain, y_Vtrain, X_Vtest)
#X_Atrain, X_Atest = ngram_vectorize(X_Atrain, y_Atrain, X_Atest)
#X_Dtrain, X_Dtest = ngram_vectorize(X_Dtrain, y_Dtrain, X_Dtest)

In [10]:
# Preprocess data with embeddings
X_Vtrain, X_Vtest, word_index_V, max_len_V = sequence_vectorize(X_Vtrain, X_Vtest)
X_Atrain, X_Atest, word_index_A, max_len_A = sequence_vectorize(X_Atrain, X_Atest)
X_Dtrain, X_Dtest, word_index_D, max_len_D = sequence_vectorize(X_Dtrain, X_Dtest)

In [11]:
# Split into dev set from training set so we get the same distribution
X_Vtest, X_Vdev, y_Vtest, y_Vdev = train_test_split(X_Vtest, y_Vtest, test_size=0.5)
X_Atest, X_Adev, y_Atest, y_Adev = train_test_split(X_Atest, y_Atest, test_size=0.5)
X_Dtest, X_Ddev, y_Dtest, y_Ddev = train_test_split(X_Dtest, y_Dtest, test_size=0.5)

In [12]:
# FOR VLAD - list to tensor
import tensorflow as tf
X_Vtrain_dataset = tf.data.Dataset.from_tensor_slices((X_Vtrain, y_Vtrain))
X_Vtest_dataset = tf.data.Dataset.from_tensor_slices((X_Vtest, y_Vtest))
X_Vdev_dataset = tf.data.Dataset.from_tensor_slices((X_Vdev, y_Vdev))

X_Atrain_dataset = tf.data.Dataset.from_tensor_slices((X_Atrain, y_Atrain))
X_Atest_dataset = tf.data.Dataset.from_tensor_slices((X_Atest, y_Atest))
X_Adev_dataset = tf.data.Dataset.from_tensor_slices((X_Adev, y_Adev))
X_Dtrain_dataset = tf.data.Dataset.from_tensor_slices((X_Dtrain, y_Dtrain))
X_Dtest_dataset = tf.data.Dataset.from_tensor_slices((X_Dtest, y_Dtest))
X_Ddev_dataset = tf.data.Dataset.from_tensor_slices((X_Ddev, y_Ddev))

In [24]:
X_Vtrain_dataset

<TensorSliceDataset shapes: ((120,), ()), types: (tf.int32, tf.float64)>

In [15]:
# FOR VLAD - tensor to batch
BUFFER_SIZE = 1000
train_batches_V = (
    X_Vtrain_dataset
    .shuffle(BUFFER_SIZE)
    .padded_batch(32, tf.compat.v1.data.get_output_shapes(X_Vtrain_dataset)))

test_batches_V = (
    X_Vtest_dataset
    .padded_batch(32, tf.compat.v1.data.get_output_shapes(X_Vtest_dataset)))
dev_batches_V = (
    X_Vdev_dataset
    .padded_batch(32, tf.compat.v1.data.get_output_shapes(X_Vdev_dataset)))

train_batches_A = (
    X_Atrain_dataset
    .shuffle(BUFFER_SIZE)
    .padded_batch(32, tf.compat.v1.data.get_output_shapes(X_Atrain_dataset)))

test_batches_A = (
    X_Atest_dataset
    .padded_batch(32, tf.compat.v1.data.get_output_shapes(X_Atest_dataset)))
dev_batches_A = (
    X_Adev_dataset
    .padded_batch(32, tf.compat.v1.data.get_output_shapes(X_Adev_dataset)))

train_batches_D = (
    X_Dtrain_dataset
    .shuffle(BUFFER_SIZE)
    .padded_batch(32, tf.compat.v1.data.get_output_shapes(X_Dtrain_dataset)))

test_batches_D = (
    X_Dtest_dataset
    .padded_batch(32, tf.compat.v1.data.get_output_shapes(X_Dtest_dataset)))
dev_batches_D = (
    X_Ddev_dataset
    .padded_batch(32, tf.compat.v1.data.get_output_shapes(X_Ddev_dataset)))

In [16]:
# Using linear activation function since this is regression
""" Build the model """
def create_model() :
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(word_index_V)+1, 16),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='linear')])  

  model.summary()

  model.compile(optimizer='sgd',
                loss=tf.keras.losses.MeanSquaredError())
                 #metrics=['accuracy']) accuracy doesnt work for regression)
  return model

model = create_model()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          222480    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 222,497
Trainable params: 222,497
Non-trainable params: 0
_________________________________________________________________


In [17]:
""" Train the model """
history = model.fit(train_batches_V,
                    epochs=10,
                    validation_data=dev_batches_V,
                    validation_steps=30)
print("Valence training set error")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Valence training set error


In [18]:
""" Train the model """
history = model.fit(train_batches_A,
                    epochs=10,
                    validation_data=dev_batches_A,
                    validation_steps=30)
print("Arousal training set error")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Arousal training set error


In [19]:
""" Train the model """
history = model.fit(train_batches_D,
                    epochs=10,
                    validation_data=dev_batches_D,
                    validation_steps=30)
print("Dominance training set error")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Dominance training set error


In [20]:
# Construct model
model_rnn = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(word_index_V)+1, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='linear')
])

In [21]:
# Compile model
model_rnn.compile(loss=tf.keras.losses.MeanSquaredError(),
                  optimizer='sgd')
                  #metrics=['accuracy'])


In [22]:
""" Train the model """
history = model_rnn.fit(train_batches_V,
                    epochs=10,
                    validation_data=dev_batches_V,
                    validation_steps=30)
print("Valence training set error")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Valence training set error
