# Emojify

In [54]:
import numpy as np
from emo_utils import read_csv, read_glove_vecs, convert_to_one_hot, label_to_emoji
np.random.seed(0)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
np.random.seed(1)

## 1. Dataset

<img src="images/data_set.png" height=500/>

In [5]:
# Load the (already preprocessed) train and test splits: sentences in X_*, integer labels in Y_*.
X_train, Y_train = read_csv('data/train_emoji.csv')
# Recorded from a previous run:
# X_train =
# ['never talk to me again' 'I am proud of your achievements'
# 'It is the worst day in my life' 'Miss you so much' 'food is life'
# ...], len=132
# Y_train = [3 2 3 0 ...], len=132
X_test, Y_test = read_csv('data/tesss.csv') # recorded shapes: train arrays (132,), test arrays (56,)
# X_test =
# ['I want to eat\t' 'he did not answer\t' 'he got a very nice raise\t'
#  'she got me a nice present\t' 'ha ha ha it was so funny\t'
#  ...], len=56
# NOTE: the recorded test sentences carry a trailing tab character.
# Y_test = [4 3 2 2 ...], len=56


In [7]:
# Load the three GloVe lookup dictionaries (50-dimensional vectors).
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt') 
# word_to_index = {'!': 1, '!!': 2, ...}  (word -> index; indices start at 1)
# index_to_word = {1: '!', 2: '!!', ...}  (index -> word)
# word_to_vec_map = {'the': array([...]), ',': array([...]), ...}  (word -> 50-dimensional vector)
# NOTE(review): the original note recorded len=40000 and vector shape (50, 1); indices such as
# 360915 in later recorded output imply a vocabulary of roughly 400k words — TODO confirm both.


## 2. Tạo Pretrained Embeddings

In [21]:
# Convert an array of sentences into a zero-padded matrix of word indices of shape (m, max_len),
# where m is the number of sentences (e.g. (132, 10) for the training set in this notebook).
def sentences_to_indices(X, word_to_index, max_len):
    """
    Map each sentence to a fixed-length row of vocabulary indices.

    Each sentence is lower-cased and split on whitespace. Words found in
    `word_to_index` are written left to right into the row; words that are not
    in the vocabulary are skipped (they do not consume a slot). Rows shorter
    than `max_len` are right-padded with zeros, and rows with more than
    `max_len` known words are truncated to the first `max_len` of them instead
    of raising an IndexError.

    Arguments:
    X -- sequence (numpy array or list) of sentence strings, length m
    word_to_index -- dictionary mapping each word to its integer index
    max_len -- number of columns in the output, i.e. maximum words kept per sentence

    Returns:
    X_indices -- float numpy array of shape (m, max_len) holding the word indices
    """
    m = len(X)

    # Zero is the padding value, so start from an all-zero matrix.
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        # e.g. 'never talk to me again' -> ['never', 'talk', 'to', 'me', 'again']
        sentence_words = X[i].lower().split()

        # Keep only in-vocabulary words, then cap at max_len so the row cannot overflow.
        known_indices = [word_to_index[w] for w in sentence_words if w in word_to_index]
        known_indices = known_indices[:max_len]

        if known_indices:
            X_indices[i, :len(known_indices)] = known_indices

    return X_indices

In [27]:
# Padded sequence length = the largest WORD count of any training sentence.
# (Taking the longest sentence by character count and then counting its words can
# under-estimate this when a shorter string contains more, shorter words.)
maxLen = max(len(sentence.split()) for sentence in X_train) # =10
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
# shape=(132, 10) 
# [[259914. 352214. 360915. ...      0.      0.      0.]
#  [185457.  52943. 293982. ...      0.      0.      0.]
#  ...
# ]
# One-hot encode the integer labels over the 5 emoji classes.
Y_train_oh = convert_to_one_hot(Y_train, C = 5)
# shape=(132, 5)
# [[0. 0. 0. 1. 0.]
#  [0. 0. 1. 0. 0.]
#  ...
# ]

['never', 'talk', 'to', 'me', 'again']


<img src="images/embedding1.png" height=300/>

In [38]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Build a frozen Keras Embedding layer initialised with the pre-trained GloVe vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping each word to its GloVe vector
    word_to_index -- dictionary mapping each word to its integer index in the vocabulary

    Returns:
    embedding_layer -- non-trainable Keras Embedding instance whose weight matrix
                       holds, at row idx, the vector of the word with index idx
    """
    # One row more than the number of words, as required by the Keras Embedding layer.
    n_rows = len(word_to_index) + 1

    # The embedding dimensionality is read off the first vector in the map (50 for glove.6B.50d).
    first_word = list(word_to_vec_map)[0]
    n_cols = word_to_vec_map[first_word].shape[0]

    # Start from zeros; any row that no word maps to keeps its zero vector.
    # (Per this notebook's recorded data the indices start at 1, so row 0 stays zero.)
    glove_weights = np.zeros((n_rows, n_cols))

    # Copy each word's vector into the row given by that word's index.
    for token, row in word_to_index.items():
        glove_weights[row] = word_to_vec_map[token]

    # Frozen layer: the GloVe vectors are not updated during training.
    embedding_layer = Embedding(input_dim=n_rows, output_dim=n_cols, trainable=False)

    # The layer must be built before its weights can be assigned.
    embedding_layer.build((None,))

    # Load the GloVe matrix as the layer's weight matrix.
    embedding_layer.set_weights([glove_weights])

    return embedding_layer

## 3. Model

<img src="images/emojifier-v2.png" height=500/>

In [49]:
# UNQ_C5 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: Emojify_V2

def Emojify_V2(input_shape, word_to_vec_map, word_to_index):
    """
    Assemble the Emojify-V2 Keras model.

    Graph: Input -> frozen GloVe Embedding -> LSTM(128, full sequence) -> Dropout(0.5)
           -> LSTM(128, last state only) -> Dropout(0.5) -> Dense(5) -> softmax.

    Arguments:
    input_shape -- shape of one input example, e.g. (max_len,)
    word_to_vec_map -- dictionary mapping each word to its GloVe vector
    word_to_index -- dictionary mapping each word to its integer index

    Returns:
    model -- Keras Model mapping a batch of word-index sequences to 5 class probabilities
    """
    # 1. Input: word indices of each sentence, int32 because they are indices.
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # 2. Look up the pre-trained GloVe embedding of every index.
    glove_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = glove_layer(sentence_indices)

    # 3. First LSTM keeps the hidden state of every time step (return_sequences=True),
    #    so the second LSTM receives a full sequence.
    sequence_states = LSTM(units=128, return_sequences=True)(embeddings)
    sequence_states = Dropout(0.5)(sequence_states)

    # 4. Second LSTM returns only its last hidden state: one vector per sentence.
    last_state = LSTM(units=128, return_sequences=False)(sequence_states)
    last_state = Dropout(0.5)(last_state)

    # 5. Project to the 5 emoji classes and normalise with softmax.
    class_scores = Dense(5)(last_state)
    class_probabilities = Activation('softmax')(class_scores)

    # Tie input and output tensors together into a Model instance.
    model = Model(inputs=sentence_indices, outputs=class_probabilities)
    return model

In [50]:
# Build the model for sequences of maxLen word indices and print its layer summary.
model = Emojify_V2((maxLen,), word_to_vec_map, word_to_index)
model.summary()

In [51]:
# Categorical cross-entropy matches the one-hot labels and the softmax output.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_indices, Y_train_oh, epochs=100, batch_size=32, shuffle=True) # 132 examples / batch size 32 -> 5 batches per epoch

Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.2331 - loss: 1.5949
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3470 - loss: 1.5034
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3604 - loss: 1.4590
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4111 - loss: 1.3771
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5259 - loss: 1.2938 
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4824 - loss: 1.2412
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5580 - loss: 1.1143
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6585 - loss: 0.9621
Epoch 9/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7b40b8d5d090>

In [52]:
# Encode the test split exactly like the training split (same vocabulary, same padded length),
# then report loss and accuracy on it.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
Y_test_oh = convert_to_one_hot(Y_test, C = 5)
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print()
print("Test accuracy = ", acc)

['i', 'want', 'to', 'eat']
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7857 - loss: 0.9779  

Test accuracy =  0.8035714030265808


In [55]:
# Try the trained model on a single hand-written sentence.
# NOTE(review): this rebinds X_test_indices, overwriting the encoded test split from the
# evaluation cell; re-running that cell afterwards is required to get it back.
# NOTE(review): tokens are split on whitespace only, so 'eating?' (with the question mark)
# is presumably not in the vocabulary and is skipped by sentences_to_indices — verify.
x_test = np.array(["What are you eating?"])
X_test_indices = sentences_to_indices(x_test, word_to_index, maxLen)
print(x_test[0] +' '+  label_to_emoji(np.argmax(model.predict(X_test_indices))))

['what', 'are', 'you', 'eating?']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step
What are you eating? :disappointed:


### Notes
Model hiện tại không thực sự robust trong việc hiểu phủ định (negation, ví dụ “not happy”), vì training set khá nhỏ và không có nhiều ví dụ về negation.