In [185]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
import tensorflow
np.random.seed(0)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import glorot_uniform
np.random.seed(1)

- Load and pre-process data

In [186]:
# Load training data (use both "train" and "val" texts)
filepath_1 = 'train.txt'
filepath_2 = 'val.txt'
data_train = []
with open (filepath_1, 'r') as file:
    for sentence in file:
        data_train.append(sentence[:-1].split(';'))
    file.close

with open (filepath_2, 'r') as file:
    for sentence in file:
        data_train.append(sentence[:-1].split(';'))
    file.close

data_train = pd.DataFrame(data_train, columns = ['text', 'sentiment'])

# Load test data
filepath_3 = 'test.txt'
data_test = []
with open (filepath_3, 'r') as file:
    for sentence in file:
        data_test.append(sentence[:-1].split(';'))
    file.close
data_test = pd.DataFrame(data_test, columns = ['text', 'sentiment'])

# Create numpy arrays of strings
X_train, Y_train = data_train['text'], data_train['sentiment']
X_test, Y_test = data_test['text'], data_test['sentiment']
X_train = X_train.to_numpy()
Y_train = Y_train.to_numpy()
X_test = X_test.to_numpy()
Y_test = Y_test.to_numpy()

# Convert strings in Y_train and Y_test into integers
Y_train_table, Y_train_indexed = np.unique(Y_train, return_inverse = True)
Y_test_table, Y_test_indexed = np.unique(Y_test, return_inverse = True)

# Convert Y_train_indexed and Y_test_indexed as arrays of one-hot vectors
C = len(Y_train_table)
Y_train_oh = np.eye(C)[Y_train_indexed.reshape(-1)]
Y_test_oh = np.eye(C)[Y_test_indexed.reshape(-1)]

In [187]:
# Find length of sentence with the max. number of words:
maxLen = 0
for i in range(len(X_train)):
    length = len(X_train[i].split())
    if length > maxLen:
        maxLen = length
maxLen

66

In [192]:
# Load pre-trained word embeddings
# We're using the GloVe representations, where each vector has 50 features.

def read_glove_vecs(glove_file):
    """
    Create three dictionaries to map as follows:
    word - index
    index - word
    word - embedding vector

    returns: word_to_index, index_to_word, word_to_vec_map
    """
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        word_to_index = {}
        index_to_word = {}
        for w in sorted(words):
            word_to_index[w] = i
            index_to_word[i] = w
            i = i + 1
    return word_to_index, index_to_word, word_to_vec_map


In [193]:
weights_file = 'glove.6B.50d.txt'
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(weights_file)

- X_train: (m, 1), Y_train: (m, 1)
- Each row of X_train is a single string
- Want to convert each row of X_train into a list of integers (indices)
- Want to zero-pad at the end, so that each list has max_len elements

In [194]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to 'Embedding()'

    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """

    m = X.shape[0]

    # initialise
    X_indices = np.zeros((m, max_len))
    
    # loop over training examples
    for i in range(m):                               
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words = X[i].lower().split()
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words[:max_len]:
            # if w exists in the word_to_index dictionary
            if w in word_to_index.keys():
                # Set the (i,j)th entry of X_indices to the index of the correct word.
                X_indices[i, j] = word_to_index[w]
                # Increment j to j + 1
                j =  j+1
    return X_indices

In [195]:
X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
X1_indices = sentences_to_indices(X1, word_to_index, max_len=5)
X1_indices

array([[155345., 225122.,      0.,      0.,      0.],
       [220930., 286375.,  69714.,      0.,      0.],
       [151204., 192973., 302254., 151349., 394475.]])

In [196]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """

    vocab_size = len(word_to_index) + 1              # adding 1 to fit Keras embedding
    any_word = list(word_to_vec_map.keys())[0]
    emb_dim = word_to_vec_map[any_word].shape[0]    # define dimensionality of GloVe word vectors (= 50)

    # initialise embedding matrix
    emb_matrix = np.zeros((vocab_size, emb_dim))

    for word, idx in word_to_index.items():
        emb_matrix[idx,:] = word_to_vec_map[word]
    
    embedding_layer = Embedding(input_dim = vocab_size, output_dim = emb_dim, trainable = False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [197]:
def classify_emotion(input_shape, word_to_vec_map, word_to_index):
    """
    Model for classifying emotion of input sentence.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """

    # Define sentence_indices as the input
    # It should be of shape input_shape and dtype 'int32' (as it contains indices, which are integers).
    sentence_indices = Input(shape = input_shape, dtype = 'int32' )
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer
    # (See additional hints in the instructions).
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # The returned output should be a batch of sequences.
    X = LSTM(units = 128, return_sequences = True)(embeddings)    # return_sequences = True, to get all the hidden states of all time steps
    # Add dropout with a probability of 0.5
    X = Dropout(rate= 0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # The returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(units = 128)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(rate = 0.5)(X) 
    # Propagate X through a Dense layer with 5 units
    X = Dense(units = 6)(X)
    # Add a softmax activation
    X = Activation(activation = 'softmax')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs = sentence_indices, outputs = X )
    
    ### END CODE HERE ###
    
    return model
    

In [203]:
# Notice that input_shape is (10,)
# Earlier, tried with maxLen (= 66), but then the accuracy just didn't improve at all.
max_len = 15
model = classify_emotion((max_len,), word_to_vec_map, word_to_index)
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 15)]              0         
                                                                 
 embedding_6 (Embedding)     (None, 15, 50)            20000050  
                                                                 
 lstm_12 (LSTM)              (None, 15, 128)           91648     
                                                                 
 dropout_12 (Dropout)        (None, 15, 128)           0         
                                                                 
 lstm_13 (LSTM)              (None, 128)               131584    
                                                                 
 dropout_13 (Dropout)        (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 6)                 774 

In [204]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [205]:
# This is where the zero padding happens! 

# set max_len to 10
# When I tried running with max_len = maxLen = 66, the accuracy didn't go up enough
X_train_indices = sentences_to_indices(X_train, word_to_index, max_len = 15)        

In [206]:
model.fit(X_train_indices, Y_train_oh, epochs = 20, batch_size = 32, shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x17f712c10>

In [207]:
# test accuracy

X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = 15)
loss, acc = model.evaluate(X_test_indices, Y_test_oh)



- Good, accuracy is similar.

In [208]:
# Check some mislabelled examples:

C = 6

X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = 15)
pred = model.predict(X_test_indices)
for i in range(len(X_test)):
    #x = X_test_indices
    num = np.argmax(pred[i])
    if(num != Y_test_indexed[i]):
        print(f'Expected emotion: {Y_test[i]},  prediction:  {X_test[i]} --- {Y_test_table[num]}')

Expected emotion: joy,  prediction:  i explain why i clung to a relationship with a boy who was in many ways immature and uncommitted despite the excitement i should have been feeling for getting accepted into the masters program at the university of virginia --- sadness
Expected emotion: fear,  prediction:  i don t feel particularly agitated --- anger
Expected emotion: fear,  prediction:  i pay attention it deepens into a feeling of being invaded and helpless --- sadness
Expected emotion: joy,  prediction:  i feel like ive gotten to know many of you through comments and emails and for that im appreciative and glad you are a part of this little space --- sadness
Expected emotion: sadness,  prediction:  i stole a book from one of my all time favorite authors and now i feel like a rotten person --- joy
Expected emotion: anger,  prediction:  i highly recommend visiting on a wednesday if youre able because its less crowded so you get to ask the farmers more questions without feeling rude f

- In many cases, the incorect prediction stems from the sentence being long. (Since the model only looked at the first 15 words)

In [215]:
# Make model predict the emotion of new sentences
X_test = np.array(["I hate doing busy work", 
                "I'm excited about my new job", 
                "It's a beautiful day outside!", 
                "My wonderful partner is next to me",
                "The sun is shining so brightly"])
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = 15)
pred = model.predict(X_test_indices)
for i in range(len(X_test)):
    num = np.argmax(pred[i])
    print(f'{X_test[i]}: {Y_test_table[num]}' )

I hate doing busy work: anger
I'm excited about my new job: anger
It's a beautiful day outside!: sadness
My wonderful partner is next to me: sadness
The sun is shining so brightly: joy


- Ok, the performance with new words isn't that great.

In [None]:
def cosine_similarity(u,v):
    """
    compute cosine similarity betwen u & v

    inputs: u & v - 1D arrays
    output: scalar

    returns: cosine_similarity
    """

    if np.all(u == v):
        return 1
    
    dot = np.dot(u,v)
    norm_u = np.sqrt(np.dot(u,u))
    norm_v = np.sqrt(np.dot(v,v))
    if np.isclose(norm_u, norm_v, atol = 1e-32):
        return 0
    
    cosine_similarity = dot / (norm_u * norm_v)
    return cosine_similarity


In [None]:
word_1 = 'seer'
word_2 = 'prophet'
word_3 = 'oracle'
word_4 = 'idiot'
word_5 = 'president'


cosine_similarity(word_to_vec_map['seer'],word_to_vec_map['prophet'])
cosine_similarity(word_to_vec_map['seer'],word_to_vec_map['oracle'])
cosine_similarity(word_to_vec_map['prophet'],word_to_vec_map['oracle'])
cosine_similarity(word_to_vec_map['love'],word_to_vec_map['adore'])
cosine_similarity(word_to_vec_map['despise'],word_to_vec_map['hate'])
cosine_similarity(word_to_vec_map['enamoured'],word_to_vec_map['charmed'])
cosine_similarity(word_to_vec_map['red'],word_to_vec_map['vegetable'])
cosine_similarity(word_to_vec_map['red'],word_to_vec_map['blue'])
cosine_similarity(word_to_vec_map['orange'],word_to_vec_map['apple'])
cosine_similarity(word_to_vec_map['orange'],word_to_vec_map['lemon'])
cosine_similarity(word_to_vec_map['orange'],word_to_vec_map['glove'])
cosine_similarity(word_to_vec_map['fast'],word_to_vec_map['quick'])
cosine_similarity(word_to_vec_map['fast'],word_to_vec_map['toe'])
cosine_similarity(word_to_vec_map['fast'],word_to_vec_map['quiet'])

0.6102162204826179