## **Data Pre-Processing**

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('train_emoji.csv',header=None)

In [3]:
df_train.shape

(132, 4)

In [4]:
df_train.head(10)

Unnamed: 0,0,1,2,3
0,never talk to me again,3,,
1,I am proud of your achievements,2,,
2,It is the worst day in my life,3,,
3,Miss you so much,0,,[0]
4,food is life,4,,
5,I love you mum,0,,
6,Stop saying bullshit,3,,
7,congratulations on your acceptance,2,,
8,The assignment is too long,3,,
9,I want to go play,1,,[3]


In [5]:
df_train[1].value_counts()

2    38
3    36
0    22
1    19
4    17
Name: 1, dtype: int64

In [6]:
df_test = pd.read_csv('test_emoji.csv',header=None)

In [7]:
df_test.shape

(56, 2)

In [8]:
df_test.head(10)

Unnamed: 0,0,1
0,I want to eat\t,4
1,he did not answer\t,3
2,he got a raise\t,2
3,she got me a present\t,0
4,ha ha ha it was so funny\t,2
5,he is a good friend\t,0
6,I am upset\t,0
7,We had such a lovely dinner tonight\t,0
8,where is the food\t,4
9,Stop making this joke ha ha ha\t,2


In [9]:
X_train = df_train[0]
y_train = df_train[1]

X_test = df_test[0]
y_test = df_test[1]

In [10]:
y_train

0      3
1      2
2      3
3      0
4      4
      ..
127    1
128    4
129    3
130    0
131    2
Name: 1, Length: 132, dtype: int64

In [11]:
X_train

0               never talk to me again
1      I am proud of your achievements
2       It is the worst day in my life
3                     Miss you so much
4                         food is life
                    ...               
127          he had to make a home run
128                 I am ordering food
129             What is wrong with you
130                         I love you
131                          great job
Name: 0, Length: 132, dtype: object

In [12]:
#figure out max length of a sentence in X_train and X_test
maxlen=0
for i in X_train:
    maxlen = max(maxlen , len(i.split()))
    #print(len(i.split()))

for i in X_test:
    maxlen = max(maxlen , len(i.split()))    

print(maxlen)

10


In [13]:
X_train.shape[0]

132

In [14]:
X_test.shape[0]

56

In [15]:
#tokenization of a sentence
for ix in range(X_train.shape[0]):
    fd = X_train[ix].split()
    print(fd)
    for ij in range(len(fd)):
        print(fd[ij])
    break

['never', 'talk', 'to', 'me', 'again']
never
talk
to
me
again


In [16]:
type(X_train)

pandas.core.series.Series

In [17]:
X_train[0]

'never talk to me again'

# Converting Sentences into embeddings 
# using Glove vector

In [18]:
f = open('glove.6B.50d.txt' , encoding='utf8')

In [19]:
#make our own word embedding dictionary
embeddings_idx={}

for line in f:
    values = line.split()
    #print(values[0])
    #print(values[1:])
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float')
    #print(word,coefs)
    embeddings_idx[word] = coefs

f.close()

In [20]:
embeddings_idx["vacation"].shape

(50,)

function to make word embeddings for each word in each sentence

In [21]:
#maxlen of a sentence
maxlen = 10

In [22]:
def embedding_output(X):
    
    #embedding dimension
    emb_dim = 50
    #batch size
    batch_size = X.shape[0]
    #output of the function
    embedding_out = np.zeros((batch_size , maxlen , emb_dim))
    
    for ix in range(X.shape[0]):
        #tokenize current sentence
        X[ix] = X[ix].split()
        
        for ij in range(len(X[ix])):
            #get current word's embedding from 'glove' embeddings iff that word is present 
            if X[ix][ij].lower() in embeddings_idx.keys() and ij < maxlen:
                embedding_out[ix][ij] = embeddings_idx[X[ix][ij].lower()]
    
    return embedding_out
    

In [23]:
embedding_matrix_train = embedding_output(X_train)
embedding_matrix_test = embedding_output(X_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[ix] = X[ix].split()


In [24]:
X_train[1]

['I', 'am', 'proud', 'of', 'your', 'achievements']

In [25]:
embedding_matrix_train.shape

(132, 10, 50)

In [26]:
embedding_matrix_test.shape

(56, 10, 50)

In [27]:
embedding_matrix_train[1]

array([[ 1.1891e-01,  1.5255e-01, -8.2073e-02, -7.4144e-01,  7.5917e-01,
        -4.8328e-01, -3.1009e-01,  5.1476e-01, -9.8708e-01,  6.1757e-04,
        -1.5043e-01,  8.3770e-01, -1.0797e+00, -5.1460e-01,  1.3188e+00,
         6.2007e-01,  1.3779e-01,  4.7108e-01, -7.2874e-02, -7.2675e-01,
        -7.4116e-01,  7.5263e-01,  8.8180e-01,  2.9561e-01,  1.3548e+00,
        -2.5701e+00, -1.3523e+00,  4.5880e-01,  1.0068e+00, -1.1856e+00,
         3.4737e+00,  7.7898e-01, -7.2929e-01,  2.5102e-01, -2.6156e-01,
        -3.4684e-01,  5.5841e-01,  7.5098e-01,  4.9830e-01, -2.6823e-01,
        -2.7443e-03, -1.8298e-02, -2.8096e-01,  5.5318e-01,  3.7706e-02,
         1.8555e-01, -1.5025e-01, -5.7512e-01, -2.6671e-01,  9.2121e-01],
       [ 3.4664e-01,  3.9805e-01,  4.8970e-01, -5.1421e-01,  5.4574e-01,
        -1.2005e+00,  3.2107e-01,  7.4004e-01, -1.4979e+00, -1.9651e-01,
        -1.2631e-01, -3.7703e-01, -6.2569e-01,  3.8792e-02,  1.0579e+00,
         7.7199e-01, -1.8589e-01,  1.3032e+00, -7.

convert y_train & y_test to one hot vectors

In [28]:
from keras.utils import to_categorical

In [29]:
y_train = to_categorical(y_train , num_classes=5)

In [30]:
y_test = to_categorical(y_test , num_classes=5)

In [31]:
y_train.shape , y_test.shape

((132, 5), (56, 5))

# Apply SMOTE to handle imbalanced dataset

In [32]:
from imblearn.over_sampling import SMOTE

In [33]:
embedding_matrix_train.shape

(132, 10, 50)

In [34]:
embedding_matrix_train = embedding_matrix_train.reshape(-1, maxlen*50)

In [35]:
embedding_matrix_train.shape

(132, 500)

In [36]:
oversample = SMOTE()
X, y = oversample.fit_resample(embedding_matrix_train, y_train)



In [37]:
X = X.reshape(-1,maxlen,50)
X.shape, y.shape

((190, 10, 50), (190, 5))

# Define the LSTM model



In [38]:
from keras.models import Sequential
from keras.layers import *

In [39]:
model = Sequential()
model.add(LSTM(units = 64 ,return_sequences=True, input_shape = (maxlen,50)))
model.add(Dropout(0.5))
model.add(LSTM(units = 64))
model.add(Dropout(0.5))
model.add(Dense(units=5 , activation = 'softmax'))
model.compile(loss='categorical_crossentropy' , optimizer='Adam' , metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 10, 64)            29440     
_________________________________________________________________
dropout (Dropout)            (None, 10, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 5)                 325       
Total params: 62,789
Trainable params: 62,789
Non-trainable params: 0
_________________________________________________________________


In [40]:
hist = model.fit(X,y,epochs=100,batch_size=64,shuffle=True,validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [41]:
model.evaluate(embedding_matrix_test,y_test)



[1.5119274854660034, 0.6964285969734192]

Preparing the emoji dictionary

In [42]:
import emoji

In [43]:
emoji_dictionary = {
                    "0": "\u2764\uFE0F",
                    "1": ":baseball:",
                    "2": ":grinning_face_with_big_eyes:",
                    "3": ":disappointed_face:",
                    "4": ":fork_and_knife:"
                    }

In [44]:
emoji.emojize(":baseball:")

'⚾'

Dispplay output on test dataset

In [45]:
y_pred = model.predict_classes(embedding_matrix_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [46]:
for i in range(len(y_pred)):
  print(' '.join(X_test[i]))
  print(emoji.emojize(emoji_dictionary[str(np.argmax(y_test[i]))]))
  print(emoji.emojize(emoji_dictionary[str(y_pred[i])]))

I want to eat
🍴
🍴
he did not answer
😞
😞
he got a raise
😃
😞
she got me a present
❤️
❤️
ha ha ha it was so funny
😃
😃
he is a good friend
❤️
😃
I am upset
❤️
😞
We had such a lovely dinner tonight
❤️
😃
where is the food
🍴
🍴
Stop making this joke ha ha ha
😃
😃
where is the ball
⚾
⚾
work is hard
😞
😞
This girl is messing with me
😞
❤️
are you serious ha ha
😃
😃
Let us go play baseball
⚾
⚾
This stupid grader is not working
😞
😞
work is horrible
😞
😞
Congratulation for having a baby
😃
😃
stop messing around
😞
⚾
any suggestions for dinner
🍴
🍴
I love taking breaks
❤️
❤️
you brighten my day
😃
😃
I boiled rice
🍴
🍴
she is a bully
😞
❤️
Why are you feeling bad
😞
😞
I am upset
😞
😞
I worked during my birthday
😞
😃
My grandmother is the love of my life
❤️
❤️
enjoy your break
😃
⚾
valentine day is near
❤️
😃
I miss you so much
❤️
❤️
throw the ball
⚾
⚾
My life is so boring
😞
❤️
she said yes
😃
😃
will you be my valentine
❤️
❤️
he can pitch really well
⚾
⚾
dance with me
😃
😃
I am starving
🍴
🍴
See you at the restaurant
🍴
🍴

# Saving the model

In [48]:
model.save('model.pkl')

INFO:tensorflow:Assets written to: model.pkl\assets


Load the model and make predictions

In [49]:
from tensorflow import keras
model = keras.models.load_model('model.pkl')

In [50]:
y_pred = model.predict_classes(embedding_matrix_test)

In [51]:
for i in range(len(y_pred)):
  print(' '.join(X_test[i]))
  print(emoji.emojize(emoji_dictionary[str(np.argmax(y_test[i]))]))
  print(emoji.emojize(emoji_dictionary[str(y_pred[i])]))

I want to eat
🍴
🍴
he did not answer
😞
😞
he got a raise
😃
😞
she got me a present
❤️
❤️
ha ha ha it was so funny
😃
😃
he is a good friend
❤️
😃
I am upset
❤️
😞
We had such a lovely dinner tonight
❤️
😃
where is the food
🍴
🍴
Stop making this joke ha ha ha
😃
😃
where is the ball
⚾
⚾
work is hard
😞
😞
This girl is messing with me
😞
❤️
are you serious ha ha
😃
😃
Let us go play baseball
⚾
⚾
This stupid grader is not working
😞
😞
work is horrible
😞
😞
Congratulation for having a baby
😃
😃
stop messing around
😞
⚾
any suggestions for dinner
🍴
🍴
I love taking breaks
❤️
❤️
you brighten my day
😃
😃
I boiled rice
🍴
🍴
she is a bully
😞
❤️
Why are you feeling bad
😞
😞
I am upset
😞
😞
I worked during my birthday
😞
😃
My grandmother is the love of my life
❤️
❤️
enjoy your break
😃
⚾
valentine day is near
❤️
😃
I miss you so much
❤️
❤️
throw the ball
⚾
⚾
My life is so boring
😞
❤️
she said yes
😃
😃
will you be my valentine
❤️
❤️
he can pitch really well
⚾
⚾
dance with me
😃
😃
I am starving
🍴
🍴
See you at the restaurant
🍴
🍴

In [60]:
text = "i want food"
t = pd.Series(text) 

In [61]:
emb_out = embedding_output(t)

In [62]:
emb_out

array([[[ 1.1891e-01,  1.5255e-01, -8.2073e-02, -7.4144e-01,
          7.5917e-01, -4.8328e-01, -3.1009e-01,  5.1476e-01,
         -9.8708e-01,  6.1757e-04, -1.5043e-01,  8.3770e-01,
         -1.0797e+00, -5.1460e-01,  1.3188e+00,  6.2007e-01,
          1.3779e-01,  4.7108e-01, -7.2874e-02, -7.2675e-01,
         -7.4116e-01,  7.5263e-01,  8.8180e-01,  2.9561e-01,
          1.3548e+00, -2.5701e+00, -1.3523e+00,  4.5880e-01,
          1.0068e+00, -1.1856e+00,  3.4737e+00,  7.7898e-01,
         -7.2929e-01,  2.5102e-01, -2.6156e-01, -3.4684e-01,
          5.5841e-01,  7.5098e-01,  4.9830e-01, -2.6823e-01,
         -2.7443e-03, -1.8298e-02, -2.8096e-01,  5.5318e-01,
          3.7706e-02,  1.8555e-01, -1.5025e-01, -5.7512e-01,
         -2.6671e-01,  9.2121e-01],
        [ 1.3627e-01, -5.4478e-02,  3.7030e-01, -4.1574e-01,
          6.0568e-01, -4.2729e-01, -5.0151e-01,  3.5923e-01,
         -4.9154e-01,  2.1827e-01, -1.5193e-01,  5.2536e-01,
         -2.4206e-01,  2.3875e-02,  8.2250e-01,  

In [63]:
p = model.predict_classes(emb_out)

In [64]:
for i in range(len(p)):
    print(emoji.emojize(emoji_dictionary[str(p[i])]))

🍴


In [68]:
type(emoji.emojize(emoji_dictionary[str(p[0])]))

str