In [1]:
import numpy as np
from emo_utils import *
import emoji
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

Using TensorFlow backend.


## Load Train/Test data

In [3]:
# Read the sentences (X) and their labels (Y) for the train and test splits.
X_train_raw, Y_train_raw = read_csv('data/train_emoji.csv')
X_test_raw, Y_test_raw = read_csv('data/tesss.csv')

# Sanity check: print the array shapes together with one sample sentence and the emoji of its label.
print(X_train_raw.shape,X_train_raw[1])
print(Y_train_raw.shape,label_to_emoji(Y_train_raw[1]))

(132,) I am proud of your achievements
(132,) 😄


## Word to vec Embedding Matrix

### Embedding Matrix

- It should be a matrix of shape `(vocab_len, vec_size)`, i.e. (400K + 1, 50) for our GloVe word vectors
- The row location of each word should be identified by using the `word_to_index` dictionary

In [4]:
def read_embedding_matrix(glove_path='data/glove.6B.50d.txt'):
    """
    Loads pre-trained GloVe vectors and packs them into a dense embedding matrix.

    Arguments:
    glove_path -- path of the GloVe text file handed to read_glove_vecs()
                  (defaults to the 50-dimensional file used by this notebook)

    Returns:
    emb_matrix -- numpy array of shape (len(word_to_index) + 1, emb_dim); row `index` holds the
                  vector of the word that word_to_index maps to `index`, every other row stays zero
    word_to_index -- dictionary mapping words to their row index in emb_matrix
    """
    word_to_index, _, word_to_vec_map = read_glove_vecs(glove_path)

    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)

    # Take the dimensionality from any stored vector instead of relying on one
    # particular word being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    return (emb_matrix, word_to_index)

In [5]:
# Build the embedding matrix once. emb_matrix, vocab_len and vec_len are module-level
# values that Emojify_Model reads later when it creates the embedding layer.
emb_matrix,word_to_index = read_embedding_matrix()
vocab_len, vec_len = emb_matrix.shape
print('matrix=',emb_matrix.shape,'word->indexmapper.len =',len(word_to_index))

matrix= (400001, 50) word->indexmapper.len = 400000


# Feature Engineering

### Transform Label ( To OneHot Encoding)

In [6]:
# One-hot encode the labels of both splits; C = 5 is the class count passed to the helper
# and matches the 5-unit Dense output of the model.
Y_train = convert_to_one_hot(Y_train_raw, C = 5)
Y_test = convert_to_one_hot(Y_test_raw, C = 5)
print('Y_train_raw.shape =', Y_train_raw.shape, 'Y_train.shape =', Y_train.shape)

Y_train_raw.shape = (132,) Y_train.shape = (132, 5)


### Transform Features (Words to indices which will be converted to Vec in embedding Layer)

In [8]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts a collection of sentences (strings) into an array of word indices, zero-padded
    on the right so that it can be given to `Embedding()`.

    Arguments:
    X -- sequence of m sentences (strings), e.g. a numpy array of shape (m,) or a list
    word_to_index -- a dictionary containing each word mapped to its index
    max_len -- number of index slots per sentence; words beyond max_len are dropped

    Returns:
    X_indices -- float array of shape (m, max_len); entry (i, j) is the index of the j-th
                 word of sentence i. Unused slots and words missing from word_to_index are 0.
    """
    m = len(X)                                       # number of examples

    X_indices = np.zeros((m, max_len))

    for i in range(m):                               # loop over examples
        # Lower-case and split on whitespace; truncate so that a sentence longer than
        # max_len cannot index past the end of the row.
        sentence_words = X[i].lower().split()[:max_len]

        for j, w in enumerate(sentence_words):
            # Unknown words fall back to 0 (the same value used for padding) instead of
            # aborting the whole conversion with a KeyError.
            X_indices[i, j] = word_to_index.get(w, 0)

    return X_indices

In [16]:
def find_max_length_of_words(x):
    """
    Returns the largest whitespace-separated word count among the sentences in x.

    Arguments:
    x -- iterable of sentences (strings)

    Returns:
    the word count of the longest sentence, or 0 when x is empty
    """
    # A generator avoids splitting the longest sentence twice, and default=0 keeps an
    # empty collection from raising ValueError.
    return max((len(sentence.split()) for sentence in x), default=0)

In [17]:
# Use the longest sentence across both splits as the common padded length, so that
# train and test index arrays share the same second dimension.
maxLen = max(find_max_length_of_words(X_train_raw),find_max_length_of_words(X_test_raw))
X_train = sentences_to_indices(X_train_raw,word_to_index,maxLen)
X_test = sentences_to_indices(X_test_raw,word_to_index,maxLen)

# Show the padded length and compare one raw sentence with its index encoding.
print('maxLen = ',maxLen)
print('X_train_raw.shape = ', X_train_raw.shape,'\n', 'X_train_raw[0] =', X_train_raw[0])
print('X_train.shape =',X_train.shape,'\n','X_train[0] = ',X_train[0])

maxLen =  10
X_train_raw.shape =  (132,) 
 X_train_raw[0] = never talk to me again
X_train.shape = (132, 10) 
 X_train[0] =  [259914. 352214. 360915. 239105.  47887.      0.      0.      0.      0.
      0.]


## Embedding Layer with pretrained weights

In [18]:
def pretrained_embedding_layer(emb_matrix,vocab_len,vec_len,trainable):
    """
    Creates a Keras Embedding() layer and loads emb_matrix into it as its weights.

    Arguments:
    emb_matrix -- weight matrix assigned to the layer (presumably of shape (vocab_len, vec_len))
    vocab_len -- input size handed to Embedding()
    vec_len -- output size handed to Embedding()
    trainable -- forwarded to Embedding(trainable=...)

    Returns:
    the built Embedding layer holding emb_matrix as its weights
    """
    layer = Embedding(vocab_len, vec_len, trainable=trainable)

    # The layer has to be built before weights can be assigned to it; keep the "None" as is.
    layer.build((None,))

    # From here on the layer is pretrained.
    layer.set_weights([emb_matrix])

    return layer


### LSTM

LSTM takes input shape as (m,Tx,Nx).
- `m` number of examples in a given batch
- `Tx` number of time steps (words) in each example. Pad all examples to the same length for easy vectorized processing. However, inference can be done on sequences of any length.
- `Nx` - Vector size.

Now when you make a single call `LSTM(...)(input_matrix)`, it does one full forward pass over all the words in the sequence. You have the option to get only the final output (A) of the Tx-th word, or to get the outputs of all time steps, by using the `return_sequences` argument.

You have to take the output from the LSTM (A) and use a Dense layer with a softmax activation to convert it into the output Y-hat.

## Model Creation

INPUT -> EMBEDDING -> LSTM -> DROPOUT -> LSTM ->DROPOUT -> DENSE -> SOFTMAX (OUTPUT) -> LOSS (FOR OPTIMIZER)

In [19]:
def Emojify_Model(input_shape):
    """
    Function creating the Emojify-v2 model's graph.

    The embedding layer is created from the module-level `emb_matrix`, `vocab_len` and
    `vec_len` values, so those must be defined before this function is called.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)

    Returns:
    model -- a model instance in Keras
    """

    # Create Input Tensor for the Model
    sentence_indices = Input(shape=input_shape, dtype='int32', name='main_input')

    # Create the embedding layer pretrained with GloVe vectors; it is frozen (trainable=False)
    X = pretrained_embedding_layer(emb_matrix, vocab_len, vec_len, trainable=False)(sentence_indices)

    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state.
    # Take the output of every time step so that it can feed another LSTM.
    # return_sequences is a boolean flag: the string 'True' only worked because any
    # non-empty string is truthy (the string 'False' would have enabled it as well).
    X = LSTM(128, return_sequences=True)(X)
    # Message text is kept exactly as in the recorded cell output.
    print('outout of LSTM =',X)

    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)

    # Propagate X through another LSTM layer with 128-dimensional hidden state.
    # Only the last output is kept here: a single hidden state, not a batch of sequences.
    X = LSTM(128)(X)

    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)

    # Propagate X through a Dense layer with 5 units ...
    X = Dense(5)(X)

    # ... with softmax activation to get back a batch of 5-dimensional vectors.
    X = Activation('softmax')(X)

    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=[sentence_indices], outputs=[X])

    return model

In [20]:
# Build the model for inputs of maxLen word indices, print its layer summary, then
# compile it with categorical cross-entropy (the labels are one-hot) and the Adam optimizer.
model = Emojify_Model((maxLen,))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


outout of LSTM = Tensor("lstm_1/transpose_1:0", shape=(?, ?, 128), dtype=float32)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
____________________________________________________________

In [21]:
# Train on the index-encoded sentences and one-hot labels: 50 epochs, shuffled batches of 32.
model.fit(X_train, Y_train, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x121c76b38>

In [22]:
# Change the sentences below to see your predictions. Make sure all the words are in the GloVe embeddings.
x_test = np.array(['I am not happy','I am happy','I like cricket'])
X_test_indices = sentences_to_indices(x_test, word_to_index, maxLen)

# Run the model once over the whole batch instead of once per sentence, then take the
# most probable class of each row.
predicted_labels = np.argmax(model.predict(X_test_indices), axis=1)
for sentence, label in zip(x_test, predicted_labels):
    print(sentence +' '+  label_to_emoji(label))

I am not happy 😞
I am happy 😄
I like cricket 🍴
