In [1]:
import pandas as pd
import numpy as np

In [2]:
with open('../datasets/emoji/train_text.txt', 'r', encoding='utf-8') as f:
    x_train = f.read().splitlines()


In [3]:
with open('../datasets/emoji/train_labels.txt', 'r', encoding='utf-8') as f:
    y_train = f.read().splitlines()

In [4]:
emoji_map = {}
with open("../datasets/emoji/mapping.txt", 'r') as f:
    for line in f:
        label_id, emoji,desc = line.strip().split('\t')
        emoji_map[int(label_id)] = [emoji,desc]


In [5]:
emoji_map

{0: ['❤', '_red_heart_'],
 1: ['😍', '_smiling_face_with_hearteyes_'],
 2: ['😂', '_face_with_tears_of_joy_'],
 3: ['💕', '_two_hearts_'],
 4: ['🔥', '_fire_'],
 5: ['😊', '_smiling_face_with_smiling_eyes_'],
 6: ['😎', '_smiling_face_with_sunglasses_'],
 7: ['✨', '_sparkles_'],
 8: ['💙', '_blue_heart_'],
 9: ['😘', '_face_blowing_a_kiss_'],
 10: ['📷', '_camera_'],
 11: ['🇺🇸', '_United_States_'],
 12: ['☀', '_sun_'],
 13: ['💜', '_purple_heart_'],
 14: ['😉', '_winking_face_'],
 15: ['💯', '_hundred_points_'],
 16: ['😁', '_beaming_face_with_smiling_eyes_'],
 17: ['🎄', '_Christmas_tree_'],
 18: ['📸', '_camera_with_flash_'],
 19: ['😜', '_winking_face_with_tongue_']}

In [6]:
import re

In [7]:
def clean_text(text):
    
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text = re.sub(r"http\S+", "", text)  
    text = re.sub(r"@\w+", "", text)     
    text = re.sub(r"#", "", text)     
    return text.lower()

x_train = [clean_text(t) for t in x_train]


In [8]:
df=pd.DataFrame({'commnets':x_train,'labels':y_train})

In [9]:
df

Unnamed: 0,commnets,labels
0,sunday afternoon walking through venice in the...,12
1,time for some bbq and whiskey libations chomp ...,19
2,love love love all these people friends bff...,0
3,toysrus,0
4,man these are the funniest kids ever that face...,2
...,...,...
44995,here to celebrate the nunez wedding love my be...,0
44996,1 night in paris wait paris las vegas hotel a...,1
44997,be safe this weekend everyone happylaborday b...,11
44998,pizza five50 user in las vegas nv,1


In [10]:
df.shape

(45000, 2)

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(df['commnets'])

In [13]:
total_words=len(tokenizer.word_index)+1
total_words

56424

In [14]:
tokenizer.index_word

{1: 'the',
 2: 'user',
 3: 'to',
 4: 'my',
 5: 'a',
 6: 'i',
 7: 'in',
 8: 'and',
 9: 'you',
 10: 'with',
 11: 'of',
 12: 'for',
 13: 'this',
 14: 'california',
 15: 'at',
 16: 'love',
 17: 'is',
 18: 'on',
 19: 'so',
 20: 'me',
 21: 'it',
 22: 'amp',
 23: 'happy',
 24: 'los',
 25: 'day',
 26: 'san',
 27: 'we',
 28: 'angeles',
 29: 'beach',
 30: 'from',
 31: 'all',
 32: 'vegas',
 33: 'was',
 34: 'be',
 35: 'your',
 36: 'its',
 37: 'that',
 38: 'our',
 39: 'by',
 40: 'night',
 41: 'out',
 42: 'christmas',
 43: 'today',
 44: 'time',
 45: 'just',
 46: 'new',
 47: 'up',
 48: 'one',
 49: 'are',
 50: 'la',
 51: 'im',
 52: 'good',
 53: 'when',
 54: 'las',
 55: 'have',
 56: 'last',
 57: 'ca',
 58: 'like',
 59: 'beautiful',
 60: 'hollywood',
 61: 'best',
 62: 'park',
 63: 'these',
 64: 'what',
 65: 'but',
 66: 'get',
 67: 'thank',
 68: 'family',
 69: 'birthday',
 70: 'back',
 71: 'got',
 72: 'disneyland',
 73: 'some',
 74: 'see',
 75: 'little',
 76: 'here',
 77: 'fun',
 78: 'great',
 79: 'franc

In [15]:
tokenizer.word_counts

OrderedDict([('sunday', 332),
             ('afternoon', 99),
             ('walking', 56),
             ('through', 155),
             ('venice', 304),
             ('in', 6013),
             ('the', 13830),
             ('sun', 239),
             ('with', 5275),
             ('user', 12236),
             ('abbot', 15),
             ('kinney', 19),
             ('time', 1294),
             ('for', 4871),
             ('some', 799),
             ('bbq', 73),
             ('and', 5771),
             ('whiskey', 20),
             ('libations', 3),
             ('chomp', 2),
             ('belch', 1),
             ('lucilles', 4),
             ('smokehouse', 5),
             ('barbque', 4),
             ('love', 3530),
             ('all', 1622),
             ('these', 962),
             ('people', 388),
             ('friends', 582),
             ('bff', 57),
             ('celebrate', 72),
             ('blessed', 178),
             ('sundayfunday', 70),
             ('san', 1872),
    

In [16]:
tokenizer.texts_to_sequences([df['commnets'][0]])[0]


[177, 598, 965, 406, 197, 7, 1, 262, 10, 2, 2722, 2257, 197]

In [17]:
inputsequences=[]
for i in range(df.shape[0]):
    line=df['commnets'][i]
    token_list=tokenizer.texts_to_sequences([line])[0]
    for j in range(1,len(token_list)):
        n_gram_sequence=token_list[:j+1]
        n_gram_sequence.append(df['labels'][i])
        inputsequences.append(n_gram_sequence)

In [18]:
inputsequences

[[177, 598, '12'],
 [177, 598, 965, '12'],
 [177, 598, 965, 406, '12'],
 [177, 598, 965, 406, 197, '12'],
 [177, 598, 965, 406, 197, 7, '12'],
 [177, 598, 965, 406, 197, 7, 1, '12'],
 [177, 598, 965, 406, 197, 7, 1, 262, '12'],
 [177, 598, 965, 406, 197, 7, 1, 262, 10, '12'],
 [177, 598, 965, 406, 197, 7, 1, 262, 10, 2, '12'],
 [177, 598, 965, 406, 197, 7, 1, 262, 10, 2, 2722, '12'],
 [177, 598, 965, 406, 197, 7, 1, 262, 10, 2, 2722, 2257, '12'],
 [177, 598, 965, 406, 197, 7, 1, 262, 10, 2, 2722, 2257, 197, '12'],
 [44, 12, '19'],
 [44, 12, 73, '19'],
 [44, 12, 73, 773, '19'],
 [44, 12, 73, 773, 8, '19'],
 [44, 12, 73, 773, 8, 2161, '19'],
 [44, 12, 73, 773, 8, 2161, 8686, '19'],
 [44, 12, 73, 773, 8, 2161, 8686, 11272, '19'],
 [44, 12, 73, 773, 8, 2161, 8686, 11272, 17335, '19'],
 [44, 12, 73, 773, 8, 2161, 8686, 11272, 17335, 11272, '19'],
 [44, 12, 73, 773, 8, 2161, 8686, 11272, 17335, 11272, 7187, '19'],
 [44, 12, 73, 773, 8, 2161, 8686, 11272, 17335, 11272, 7187, 6145, '19'],
 [44

In [19]:
max_sequence_len1=max(len(x) for x in inputsequences)
max_sequence_len1

32

In [20]:
inputsequences=np.array(pad_sequences(inputsequences,maxlen=max_sequence_len1,padding='pre'))
inputsequences

array([[  0,   0,   0, ..., 177, 598,  12],
       [  0,   0,   0, ..., 598, 965,  12],
       [  0,   0,   0, ..., 965, 406,  12],
       ...,
       [  0,   0,   0, ...,  96,  54,  13],
       [  0,   0,   0, ...,  54,  32,  13],
       [  0,   0,   0, ...,  32, 221,  13]])

In [21]:
import tensorflow as tf

In [22]:
x,y=inputsequences[:,:-1],inputsequences[:,-1]

In [23]:
y=tf.keras.utils.to_categorical(y,num_classes=20)

In [24]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=34)

# LSTM

In [26]:
## train lstm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,Dropout,GRU

model1=Sequential()
model1.add(Embedding(total_words,100,input_length=max_sequence_len1-1))
model1.add(Dropout(0.2))
model1.add(LSTM(100))
model1.add(Dense(20,activation='softmax'))


model1.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model1.summary()



In [27]:
history=model1.fit(x_train,y_train,epochs=15,validation_data=(x_test,y_test),verbose=1)

Epoch 1/15


KeyboardInterrupt: 

In [28]:
from tensorflow.keras.models import load_model

In [None]:
#model1.save('emoji_predictor.h5')



In [31]:
model1=load_model('emoji_predictor.h5')



In [32]:
import pickle
with open('tokenizer.pickle','wb') as file:
    pickle.dump(tokenizer,file,protocol=pickle.HIGHEST_PROTOCOL)

In [33]:
def prdict_next_word(model,tokenizer,text,max_sequence_len):
    token_list=tokenizer.texts_to_sequences([text])[0]
    if len(token_list)>=max_sequence_len:
        token_list=token_list[-(max_sequence_len-1):]
    token_list=pad_sequences([token_list],maxlen=max_sequence_len-1,padding='pre')
    predicted=model.predict(token_list,verbose=0)
    predicted_word_index=np.argmax(predicted,axis=1)  ## This line means: which word have high probability take that index
    
    return predicted_word_index

In [34]:
model1.evaluate(x_test, y_test)

[1m2772/2772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 18ms/step - accuracy: 0.8484 - loss: 0.5244


[0.5187798738479614, 0.8494327664375305]

In [44]:
inp='Still bitch im trill never been no fiend :@vibesbygallo #mustard @ Connecticut'
print("op: ",prdict_next_word(model1,tokenizer,inp,max_sequence_len1))

op:  [15]


# bidirectional lstm

In [37]:
from keras.models import Sequential
from keras.layers import Dropout, Dense, LSTM, Bidirectional,Embedding, GlobalMaxPool1D
embed_len=128


model2 = Sequential()
model2.add(Embedding(total_words, embed_len, input_length=max_sequence_len1))
model2.add(Bidirectional(LSTM(80,return_sequences=True)))
model2.add(Bidirectional(LSTM(80,return_sequences=True)))
model2.add(GlobalMaxPool1D())
model2.add(Dropout(0.5))
model2.add(Dense(64, activation="relu"))
model2.add(Dropout(0.5))
model2.add(Dense(20, activation="softmax"))

model2.summary()



In [38]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=0,verbose=1)

model2.compile(loss = 'categorical_crossentropy',
             optimizer = "adam",
             metrics=["accuracy"])

In [None]:
batch_size=64
epochs = 15

model2.fit(X_train, y_train, batch_size=batch_size, epochs = epochs, 
          shuffle=True,validation_data=(X_test, y_test), callbacks=[early_stop])

In [None]:
#model2.save('bidirectionalRNN.h5')

In [39]:
model2=load_model('bidirectionRNN.h5')



In [41]:
model2.summary()

In [42]:
model2.evaluate(x_test, y_test)

[1m2772/2772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 34ms/step - accuracy: 0.8433 - loss: 0.5995


[0.5907636284828186, 0.8450461030006409]

In [43]:
inp='Still bitch im trill never been no fiend :@vibesbygallo #mustard @ Connecticut'
print("op: ",prdict_next_word(model2,tokenizer,inp,max_sequence_len1))

op:  [16]
