In [1]:
import csv
import numpy as np
import emoji
import matplotlib.pyplot as plt

%matplotlib inline

Emoji Class

In [2]:
# Maps each class label (as a string) to the emoji alias code printed for it.
emoji_dictionary = {"0": ":heart_with_arrow:",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

def label_to_emoji(label):
    """
    Converts a label (int or string) into the corresponding emoji (string) ready to be printed.

    Arguments:
    label -- class label; it is looked up in `emoji_dictionary` through `str(label)`

    Returns:
    the value returned by `emoji.emojize` for the alias code of the label

    Raises:
    KeyError -- if `str(label)` is not a key of `emoji_dictionary`
    """
    code = emoji_dictionary[str(label)]
    try:
        return emoji.emojize(code, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 removed the `use_aliases` keyword; alias codes are
        # selected there with language='alias'.
        return emoji.emojize(code, language='alias')

<img src="images/data_set.png" style="width:700px;height:300px;">

Load Dataset

In [3]:
def read_csv(filename):
    """
    Reads a CSV file whose first column is a phrase and whose second column is its label.

    Arguments:
    filename -- path of the CSV file

    Returns:
    X -- numpy array holding column 0 (the phrases) of every kept row
    Y -- numpy array holding column 1 (the labels) of every kept row, converted to int
    """
    phrases = []
    labels = []
    with open(filename) as f:
        for row in csv.reader(f):
            # csv.reader yields [] for a blank line, and a row without a second
            # column has no label; skip both instead of raising IndexError.
            if len(row) < 2:
                continue
            phrases.append(row[0])
            labels.append(row[1])
    X = np.asarray(phrases)
    Y = np.asarray(labels, dtype=int)
    return X, Y
# Read the two CSV files into (phrases, labels) array pairs.
X_train, Y_train = read_csv('data/train_emoji.csv')
X_test, Y_test = read_csv('data/tesss.csv')
# Example: show one training phrase followed by the emoji of its label.
# Written as a single concatenated string (same form as the final cell) so the
# line prints the same text whether print is a statement or a function.
index = 10
print(X_train[index] + ' ' + label_to_emoji(Y_train[index]))

she did not answer my text  😞


Get Max Length

In [4]:
# Largest number of words in any training sentence. The count is taken per
# sentence: the sentence with the most characters (max(..., key=len)) is not
# necessarily the one with the most words.
max_len = max(len(sentence.split()) for sentence in X_train)
print('max length of sentences is  %d' % max_len)

max length of sentences is  10


Convert Labels to One-Hot Vectors

In [5]:
# Row k of the identity matrix is the one-hot vector of label k, so indexing
# the matrix with the label array encodes every label at once. The number of
# columns follows the number of entries in emoji_dictionary.
num_classes = len(emoji_dictionary)
y_train = np.eye(num_classes)[Y_train.reshape(-1)]
y_test = np.eye(num_classes)[Y_test.reshape(-1)]
# Example: show one label next to its one-hot encoding.
index = 50
print('%s is converted into one hot %s' % (Y_train[index], y_train[index]))

0 is converted into one hot [1. 0. 0. 0. 0.]


Load Word Vectors

In [6]:
def read_GloVe_vecs(glove_file):
    """
    Reads a text file in which every line holds a word followed by the
    components of its vector, separated by whitespace.

    Arguments:
    glove_file -- path of the word-vector text file

    Returns:
    words_to_index -- dictionary mapping each word to its position (starting at 1) in sorted order
    index_to_words -- dictionary mapping each of those indices back to its word
    word_to_vec_map -- dictionary mapping each word to its vector (numpy float64 array)
    """
    word_to_vec_map = {}
    with open(glove_file, 'r') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank line: there is no word to read (fields[0] would raise IndexError).
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # Numbering starts at 1, so index 0 is never assigned to a word.
    for i, w in enumerate(sorted(word_to_vec_map), 1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map
# Build the word <-> index lookup tables and the word -> vector map from the GloVe file.
word_to_index, index_to_word, word_to_vec_map = read_GloVe_vecs('data/glove.6B.50d.txt')

Average the word vectors of a sentence to obtain a single meaning vector. This can be used as the input of a naive neural network.

In [7]:
# def meaning_encoding(sentence, word_to_vec_map):
#     words = sentence.lower().split()
#     meaning_code = np.zeros(word_to_vec_map[words[0]].shape)
#     for word in words:
#         meaning_code += word_to_vec_map[word]
#     avg = meaning_code/len(words)
#     return avg
# #### Example
# avg = meaning_encoding("Morrocan couscous is my favorite dish", word_to_vec_map)
# print("avg = ", avg)
# ####

Advanced LSTM Model in Keras

In [8]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


<img src="images/embedding1.png" style="width:700px;height:250px;">

In [9]:
def to_indices(X,word_to_index,max_len):
    """
    Converts an array of sentences (strings) into an array of word indices,
    one row per sentence, padded with zeros on the right up to `max_len`.
    The output shape is such that it can be given to `Embedding()`.

    Arguments:
    X -- array of sentences (strings), of shape (m,)
    word_to_index -- dictionary mapping each known word to its index
    max_len -- number of columns of the output, i.e. the maximum number of words kept per sentence

    Returns:
    X_indices -- float array of shape (m, max_len). Each sentence is lower-cased and
                 split on whitespace; words beyond `max_len` are dropped and words
                 missing from `word_to_index` are encoded as 0.
    """
    m = X.shape[0]
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        # Truncate so a sentence longer than max_len cannot index past the last column.
        words = X[i].lower().split()[:max_len]
        for j, word in enumerate(words):
            # 0 is also the padding value, so an unknown word is treated like padding
            # (and keeps its position) instead of raising KeyError.
            X_indices[i, j] = word_to_index.get(word, 0)
    return X_indices
# Example: encode three short sentences into rows of five word indices each.
X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
X1_indices = to_indices(X1,word_to_index, max_len = 5)
print("X1 =", X1)
print("X1_indices =", X1_indices)   
###

('X1 =', array(['funny lol', 'lets play baseball', 'food is ready for you'],
      dtype='|S21'))
('X1_indices =', array([[155345., 225122.,      0.,      0.,      0.],
       [220930., 286375.,  69714.,      0.,      0.],
       [151204., 192973., 302254., 151349., 394475.]]))


In [10]:
def pretrained_embedding_layer(word_to_vec_map,word_to_index):
    """
    Creates a non-trainable Keras Embedding() layer and loads the pre-trained word vectors into it.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation (must not be empty).
    word_to_index -- dictionary mapping words to their indices in the vocabulary; each index
                     selects the row of the embedding matrix that receives the word's vector.

    Returns:
    embedding_layer -- Keras Embedding instance whose weights are set to the embedding matrix
    """
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    # Take the dimensionality from any stored vector instead of relying on one
    # particular word being present in the vocabulary.
    dim = next(iter(word_to_vec_map.values())).shape[0]
    ebd_matrix = np.zeros((vocab_len, dim))
    for word, index in word_to_index.items():
        ebd_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes
    embedding_layer = Embedding(vocab_len, dim, trainable=False)
    # Build the embedding layer, it is required before setting the weights of the embedding layer.
    embedding_layer.build((None,))
    # Set the weights of the embedding layer to the embedding matrix.
    embedding_layer.set_weights([ebd_matrix])

    return embedding_layer
# Example: build the layer and read back one entry of its weight matrix.
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])
###

('weights[0][1][3] =', -0.3403)


<img src="images/emojifier-v2.png" style="width:700px;height:400px;"> <br>

In [11]:
def Emojify(input_shape,word_to_vec_map,word_to_index,num_classes=5):
    """
    Function creating the Emojify model's graph.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary
    num_classes -- number of units of the final Dense layer, i.e. number of output classes
                   (defaults to 5, the value that was previously hard-coded)

    Returns:
    model -- a model instance in Keras
    """
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # Turn the rows of word indices into sequences of pre-trained word vectors.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # The first LSTM returns its full output sequence, which the second LSTM consumes.
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.5)(X)
    # The second LSTM returns only its last output.
    X = LSTM(128, return_sequences=False)(X)
    X = Dropout(0.5)(X)
    # One unit per class, followed by softmax.
    X = Dense(num_classes)(X)
    X = Activation('softmax')(X)

    model = Model(inputs=sentence_indices, outputs=X)
    return model

In [12]:
# Build the model for inputs of max_len word indices and print its layer summary.
model = Emojify((max_len,), word_to_vec_map, word_to_index)
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
den

In [13]:
# Configure training: categorical cross-entropy loss, Adam optimizer, accuracy as reported metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [14]:
# Encode the training sentences as rows of word indices, then fit the model
# on them against the one-hot labels.
X_train_indices = to_indices(X_train,word_to_index,max_len)
model.fit(X_train_indices, y_train, epochs = 100, batch_size = 32, shuffle=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x11fffd990>

In [15]:
# Encode the test sentences with the same max_len as the training data and
# evaluate the model on them.
X_test_indices = to_indices(X_test,word_to_index,max_len)
loss, acc = model.evaluate(X_test_indices, y_test)
print()
print("Test accuracy = ", acc)

('Test accuracy = ', 0.8750000085149493)


In [16]:
# This code allows you to see the mislabelled examples
pred = model.predict(X_test_indices)
for i in range(len(X_test)):
    # Predicted class = position of the largest value in the i-th prediction row.
    num = np.argmax(pred[i])
    if num != Y_test[i]:
        # The sentence, not the emoji, is what carries stray trailing whitespace,
        # so it is the sentence that gets stripped; a single space then separates
        # it from the predicted emoji.
        print('Expected emoji:'+ label_to_emoji(Y_test[i]) + ' prediction: '+ X_test[i].strip() + ' ' + label_to_emoji(num))

Expected emoji:😞 prediction: work is hard	😄
Expected emoji:😞 prediction: This girl is messing with me	💘
Expected emoji:💘 prediction: I love taking breaks	😞
Expected emoji:😄 prediction: you brighten my day	💘
Expected emoji:😞 prediction: she is a bully	💘
Expected emoji:😞 prediction: go away	🍴
Expected emoji:🍴 prediction: I did not have breakfast 💘


In [17]:
# Run the model on one new sentence and print the sentence followed by the
# emoji of the highest-scoring class.
test = np.array(['not feeling happy'])
test_indices = to_indices(test, word_to_index, max_len)
print(test[0] +' '+  label_to_emoji(np.argmax(model.predict(test_indices))))

not feeling happy 😞
