# Emojify - sentiment analysis on text data
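Classifies short sentences into one of five emojis using a Keras LSTM model on top of pre-trained GloVe word vectors.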


In [2]:
import numpy as np
import emoji
import matplotlib.pyplot as plt

np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
import csv
import pandas as pd

In [4]:
### Reading csv files from local drive
def read_csv(filename = 'data/emojify_data.csv'):
    """
    Read a two-column CSV file of (phrase, integer label) rows.

    Arguments:
    filename -- path of the CSV file; column 0 holds the phrase, column 1 the label

    Returns:
    X -- numpy array of the phrases (strings), one entry per non-empty row
    Y -- numpy array of the labels converted to int, aligned with X

    Raises:
    IndexError -- if a non-empty row has fewer than two columns
    ValueError -- if a label cannot be converted to int
    """
    # Named `labels` rather than `emoji` so the imported `emoji` module is not shadowed.
    phrases = []
    labels = []

    # newline='' lets the csv module do its own newline handling (needed for quoted
    # fields that contain line breaks), as recommended by the csv documentation.
    with open(filename, newline='') as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of failing on row[0]
                continue
            phrases.append(row[0])
            labels.append(row[1])

    X = np.asarray(phrases)
    Y = np.asarray(labels, dtype=int)

    return X, Y

In [5]:
### Read test and train data
# read_csv returns (phrases, integer labels): X_* are arrays of sentences, Y_* the matching class labels.
X_train, Y_train = read_csv('data/train_emoji.csv')
# NOTE(review): the file name 'tesss.csv' looks unusual - confirm it is the intended test set.
X_test, Y_test = read_csv('data/tesss.csv')

In [6]:
# Largest number of words in any training sentence; this fixes the model's input length.
# Measured in words per sentence: the sentence with the most characters (what
# max(X_train, key=len) selects) is not necessarily the one with the most words, and a
# too-small maxLen makes sentences_to_indices write past the end of a row.
maxLen = max(len(sentence.split()) for sentence in X_train)

In [7]:
### Load the GloVe word vectors, then build the word <-> index lookup tables
word_to_vec_map = {}
with open('data/glove.6B.50d.txt', 'r', encoding="utf-8") as glove_file:
    for raw_line in glove_file:
        # Each line is: <word> <coefficient> <coefficient> ...
        fields = raw_line.strip().split()
        token = fields[0]
        word_to_vec_map[token] = np.array(fields[1:], dtype=np.float64)

# Vocabulary = every word that has a vector
words = set(word_to_vec_map)

# Indices are assigned in sorted word order and start at 1, so 0 is never a word index
words_to_index = {word: idx for idx, word in enumerate(sorted(words), start=1)}
index_to_words = {idx: word for word, idx in words_to_index.items()}

## Emojifier-V2: Using LSTMs in Keras: 



In [8]:
### Prepare function

def sentences_to_indices(X, words_to_index, max_len):
    """
    Convert an array of sentences into a zero-padded matrix of word indices.

    Each sentence is lower-cased and split on whitespace; word j of sentence i is
    looked up in words_to_index and stored at X_indices[i, j].

    Arguments:
    X -- numpy array of sentences (strings), of shape (m,)
    words_to_index -- dictionary mapping a word (looked up in lower case) to its index
    max_len -- number of index slots per sentence, i.e. the number of columns of the result

    Returns:
    X_indices -- numpy array of shape (m, max_len). Sentences shorter than max_len are
                 padded with 0 on the right; sentences longer than max_len are truncated
                 to their first max_len words; words missing from words_to_index are
                 encoded as 0 (the padding value) instead of raising KeyError.
    """
    m = X.shape[0]                                   # number of sentences

    # Zero-initialised so that unused trailing slots act as padding
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        # Keep at most max_len words so the row can never overflow (previously an IndexError)
        sentence_words = X[i].lower().split()[:max_len]

        for j, w in enumerate(sentence_words):
            # Unknown words fall back to 0, the same value used for padding
            X_indices[i, j] = words_to_index.get(w, 0)

    return X_indices

In [9]:
### Build the Keras model: frozen GloVe embedding -> LSTM -> Dropout -> LSTM -> Dropout -> Dense -> softmax
# Define sentence_indices as the input of the graph: one row of maxLen word indices per sentence
input_shape=(maxLen,)
sentence_indices = Input(input_shape, dtype='int32')

## Create the embedding layer pretrained with GloVe vectors
## Prepare the embedding matrix that is loaded into the embedding layer as its weights
vocab_len = len(words_to_index) + 1                  # +1 because word indices start at 1, so row 0 exists but is never assigned a word
emb_dim = word_to_vec_map["cucumber"].shape[0]      # dimensionality of the GloVe word vectors, read off one arbitrary word
# Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, emb_dim)
emb_matrix = np.zeros((vocab_len, emb_dim))
# Set row "index" of the embedding matrix to the word vector of the word with that index (row 0 stays all zeros)
for word, index in words_to_index.items():
    emb_matrix[index, :] = word_to_vec_map[word]

### Build the Keras embedding layer
# Define the embedding layer with matching input/output sizes; trainable=False keeps the GloVe vectors fixed during training
embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
# Build the embedding layer; this is required before its weights can be set
embedding_layer.build((None,))
# Set the weights of the embedding layer to the embedding matrix, so it starts out pre-trained
embedding_layer.set_weights([emb_matrix])

### Build the LSTM stack
# Propagate sentence_indices through the embedding layer to get the word embeddings
embeddings = embedding_layer(sentence_indices)

# First LSTM layer with a 128-dimensional hidden state; return_sequences=True passes the full sequence on to the next LSTM
X = LSTM(128, return_sequences=True)(embeddings)
# Add dropout with a rate of 0.5
X1 = Dropout(0.5)(X)
# Second LSTM layer with a 128-dimensional hidden state
# return_sequences=False: the output is a single hidden state per sentence, not a sequence
X2 = LSTM(128, return_sequences=False)(X1)
# Add dropout with a rate of 0.5
X3 = Dropout(0.5)(X2)
# Dense layer with 5 units (no activation here), giving one score per class
X4 = Dense(5)(X3)
# Softmax activation turns the 5 scores into class probabilities
X5 = Activation('softmax')(X4)

# Create the Model instance that maps sentence_indices to X5
model = Model(inputs=sentence_indices, outputs=X5)
    

In [10]:
# Display the symbolic output tensor of the first LSTM layer (inspection only; not needed by later cells)
X

<tf.Tensor 'lstm_1/transpose_1:0' shape=(?, ?, 128) dtype=float32>

In [11]:
# Print the layer-by-layer overview of the model (output shapes and parameter counts)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645       
__________

In [12]:
### Compile the model: categorical cross-entropy loss (labels are fed as one-hot rows), Adam optimizer, accuracy metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
### Prepare the function

def sentences_to_indices(X, words_to_index, max_len):
    """
    Convert an array of sentences into a zero-padded matrix of word indices.

    Each sentence is lower-cased and split on whitespace; word j of sentence i is
    looked up in words_to_index and stored at X_indices[i, j].

    Note: this notebook defines a function with this name in an earlier cell as well;
    the definition executed last is the one that later cells call.

    Arguments:
    X -- numpy array of sentences (strings), of shape (m,)
    words_to_index -- dictionary mapping a word (looked up in lower case) to its index
    max_len -- number of index slots per sentence, i.e. the number of columns of the result

    Returns:
    X_indices -- numpy array of shape (m, max_len). Sentences shorter than max_len are
                 padded with 0 on the right; sentences longer than max_len are truncated
                 to their first max_len words; words missing from words_to_index are
                 encoded as 0 (the padding value) instead of raising KeyError.
    """
    m = X.shape[0]                                   # number of sentences

    # Zero-initialised so that unused trailing slots act as padding
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        # Keep at most max_len words so the row can never overflow (previously an IndexError)
        sentence_words = X[i].lower().split()[:max_len]

        for j, w in enumerate(sentence_words):
            # Unknown words fall back to 0, the same value used for padding
            X_indices[i, j] = words_to_index.get(w, 0)

    return X_indices

In [14]:
### Prepare the train data
# Sentences -> matrix of word indices with maxLen columns, one row per training sentence
X_train_indices = sentences_to_indices(X_train, words_to_index, maxLen)
# Integer labels -> one-hot rows, by picking rows of the 5x5 identity matrix (5 matches the Dense(5) output layer)
Y_train_oh = np.eye(5)[Y_train.reshape(-1)]

In [15]:
### Train the model
# 50 epochs over the training set, in mini-batches of 32 sentences, reshuffled every epoch
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x21820e48>

In [16]:
## Prepare test data
# Same encoding as the training data: index matrix with maxLen columns and one-hot labels over 5 classes
X_test_indices = sentences_to_indices(X_test, words_to_index, max_len = maxLen)
Y_test_oh = np.eye(5)[Y_test.reshape(-1)]
## Evaluate model with test data
# With metrics=['accuracy'] set at compile time, evaluate returns the loss followed by the accuracy
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.8214285799435207


In [17]:
# Class label (as a string key) -> emoji alias or unicode sequence passed to emoji.emojize
emoji_dictionary = {
    "0": "\u2764\uFE0F",    # heart; may render black instead of red depending on the font
    "1": ":baseball:",
    "2": ":smile:",
    "3": ":disappointed:",
    "4": ":fork_and_knife:",
}

def label_to_emoji(label):
    """
    Return the printable emoji string for a class label.

    Arguments:
    label -- class label, int or string; it is converted with str() before the lookup

    Returns:
    the result of emoji.emojize for the matching emoji_dictionary entry

    Raises:
    KeyError -- if str(label) is not a key of emoji_dictionary
    """
    # NOTE(review): `use_aliases` is believed to be unsupported by newer releases of the
    # emoji package - confirm against the installed version.
    emoji_code = emoji_dictionary[str(label)]
    return emoji.emojize(emoji_code, use_aliases=True)

In [18]:
### Find out mislabelled examples
# pred holds one row of class probabilities per test sentence
pred = model.predict(X_test_indices)
# Walk sentences, true labels and probability rows in parallel
# (the unused per-iteration assignment `x = X_test_indices` was dropped).
for sentence, expected, probabilities in zip(X_test, Y_test, pred):
    num = np.argmax(probabilities)           # predicted class = most probable one
    if num != expected:
        # Line layout: expected emoji, then the sentence, then the predicted emoji
        print('Expected emoji:'+ label_to_emoji(expected) + ' prediction: '+ sentence + label_to_emoji(num).strip())

Expected emoji:😄 prediction: he got a very nice raise	❤️
Expected emoji:😄 prediction: she got me a nice present	❤️
Expected emoji:😞 prediction: work is hard	😄
Expected emoji:😞 prediction: This girl is messing with me	❤️
Expected emoji:😞 prediction: work is horrible	😄
Expected emoji:🍴 prediction: any suggestions for dinner	😄
Expected emoji:😄 prediction: you brighten my day	❤️
Expected emoji:😞 prediction: she is a bully	😄
Expected emoji:😞 prediction: My life is so boring	❤️
Expected emoji:😄 prediction: will you be my valentine	❤️


In [19]:
### Predict the emoji for a single hand-written sentence: 'not feeling happy'
# Wrapped in a one-element array because sentences_to_indices reads X.shape[0]
x_test1 = np.array(['not feeling happy'])
X_test_indices1 = sentences_to_indices(x_test1, words_to_index, maxLen)
# argmax over the predicted probabilities gives the class label, which is then mapped to its emoji
print(x_test1[0] +' '+  label_to_emoji(np.argmax(model.predict(X_test_indices1))))

not feeling happy 😞
