In [215]:

from google.colab import drive
 
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [216]:
from __future__ import division, print_function
import gensim
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

### Read data

In [217]:
#data = pd.read_csv('/content/gdrive/MyDrive/dataset/csv/mergr_draft.csv',header = None)
data = pd.read_csv('/content/merge.csv',header = None)


In [218]:
data.columns = ['Text', 'Label']

In [219]:
data.head()

Unnamed: 0,Text,Label
0,Super ganda sya and kasyang kasya i like it th...,2
1,Maganda naman siyaaaaa and ang bilis magship n...,2
2,OKS LANG BAGAY KAY BFâ¤ï¸\nThank you â¤ï¸,2
3,The items were delivered securely! Although it...,2
4,maganda highly recommend\nokay nmn worth it d...,2


In [220]:
data.Label.unique()

array([2, 1, 0])

In [221]:
data.shape
len(data)

3002

In [222]:
pos = []
neg = []
neu = []

for l in data.Label:
    if l == 0:
        pos.append(0)
        neg.append(1)
        neu.append(0)
    elif l == 1:
        pos.append(0)
        neg.append(0)
        neu.append(1)
    elif l == 2:
        pos.append(1)
        neg.append(0)
        neu.append(0)

In [223]:
len(neu)

3002

In [224]:
data['Pos']= pos
data['Neg']= neg
data['Neu'] = neu

In [225]:
data.head()

Unnamed: 0,Text,Label,Pos,Neg,Neu
0,Super ganda sya and kasyang kasya i like it th...,2,1,0,0
1,Maganda naman siyaaaaa and ang bilis magship n...,2,1,0,0
2,OKS LANG BAGAY KAY BFâ¤ï¸\nThank you â¤ï¸,2,1,0,0
3,The items were delivered securely! Although it...,2,1,0,0
4,maganda highly recommend\nokay nmn worth it d...,2,1,0,0


### Clean data

In [226]:
def remove_punct(text):
    text_nopunct = ''
    text_nopunct = re.sub('['+string.punctuation+']', '', text)
    return text_nopunct

data['Text_Clean'] = data['Text'].apply(lambda x: remove_punct(x))

In [227]:
from nltk import word_tokenize, WordNetLemmatizer
import nltk
nltk.download('punkt')
tokens = [word_tokenize(tweet) for tweet in data.Text_Clean]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [228]:
def lowerStemmer(tokens): 
    return [w.lower() for w in tokens]    
    
lower_tokens = [lowerStemmer(token) for token in tokens]

In [229]:
stoplist = ""
with open('/content/gdrive/MyDrive/dataset/stopword/stopwords.txt') as f:
  contents = f.read()
  stoplist = stoplist + contents
stopwords = word_tokenize(stoplist)
print(stoplist)

i
me
my
myself
we
our
ours
ourselves
you
you're
you've
you'll
you'd
your
yours
yourself
yourselves
he
him
his
himself
she
she's
her
hers
herself
it
it's
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
that'll
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
if
or
as
until
while
of
at
by
for
with
about
between
into
through
during
before
after
to
from
in
out
further
then
once
here
there
when
where
why
how
all
any
both
each
other
some
such
own
so
than
s
t
can
will
just
don
should
should've
now
d
ll
m
o
re
ve
y
ain
aren
couldn
didn
doesn
hadn
hasn
haven
isn
ma
mightn
mustn
needn
shan
shan't
shouldn
wasn
weren
won
wouldn
akin
aking
ako
alin
am
amin
aming
ang
ano
anumang
apat
at
atin
ating
ay
bakit
bawat
bilang
dahil
din
dito
doon
gagawin
gayunman
ginagawa
ginawa
ginawang
gumawa
habang
hanggang
iba
ibabaw
ibig
ikaw
ilagay
ilalim
ilan
inyong
isang
ito
iyo
iyon
iyong
ka
kailanman
kami
kanila
kanilang
kanino
kanya
kanyang

In [230]:
def removeStopWords(tokens): 
    return [word for word in tokens if word not in stoplist]

In [231]:
filtered_words = [removeStopWords(sen) for sen in lower_tokens]
print(filtered_words)

[['super', 'ganda', 'sya', 'kasyang', 'kasya', 'like', 'thank', 'seller', 'araw', 'dumating', 'agad', 'super', 'ganda', 'sya', 'kasyang', 'kasya', 'like', 'thank', 'seller', 'araw', 'dumating', 'agad', 'super', 'ganda', 'sya', 'kasyang', 'kasya', 'like', 'thank', 'seller', 'araw', 'dumating', 'agad', 'super', 'ganda', 'sya'], ['maganda', 'naman', 'siyaaaaa', 'bilis', 'magship', 'item'], ['oks', 'bagay', 'kay', 'bfâ\x9d¤ï¸\x8f', 'thank', 'â\x9d¤ï¸\x8f'], ['items', 'delivered', 'securely', 'although', 'takes', 'couple', 'days', 'received', 'order', 'items', 'complete', 'look', 'okay', 'anyways', 'thank', 'seller'], ['maganda', 'highly', 'recommend', 'okay', 'nmn', 'worth', 'nmn', 'survived', 'maganda', 'tsaka', 'wort', 'dear', 'seller', 'wanted', 'tell', 'satisfied', 'greatful', 'product'], ['thank', 'thank', 'ðÿ', '’', '•', 'god', 'blessed', 'ðÿ˜‡', 'next', 'trasaction', 'ðÿ¤—ðÿ˜˜ðÿ–¤ðÿ–¤ðÿ–¤thank', 'thank', 'ðÿ', '’', '•', 'god', 'blessed', 'ðÿ˜‡', 'next', 'trasaction', 'ðÿ¤—ðÿ˜˜ðÿ–¤ðÿ

In [232]:
result = [' '.join(sen) for sen in filtered_words]

In [233]:
data['Text_Final'] = result

In [234]:
data['tokens'] = filtered_words

In [235]:
len(data['tokens'])

3002

In [236]:
data = data[['Text_Final', 'tokens', 'Label', 'Pos','Neu', 'Neg']]

In [237]:
data[:2]

Unnamed: 0,Text_Final,tokens,Label,Pos,Neu,Neg
0,super ganda sya kasyang kasya like thank selle...,"[super, ganda, sya, kasyang, kasya, like, than...",2,1,0,0
1,maganda naman siyaaaaa bilis magship item,"[maganda, naman, siyaaaaa, bilis, magship, item]",2,1,0,0


### Split data into test and train

In [238]:
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

In [239]:
data_train['Label']

2902    0
73      2
1537    1
2638    0
621     2
       ..
1638    1
1095    1
1130    1
1294    1
860     2
Name: Label, Length: 2701, dtype: int64

In [240]:
data_test[:5]

Unnamed: 0,Text_Final,tokens,Label,Pos,Neu,Neg
2786,incomplete packageitem linaw linaw ayos makipa...,"[incomplete, packageitem, linaw, linaw, ayos, ...",0,0,0,1
2148,same right sounds fck,"[same, right, sounds, fck]",0,0,0,1
1410,okay naman pero sure ginagalaw mouse tsaka lal...,"[okay, naman, pero, sure, ginagalaw, mouse, ts...",1,0,1,0
251,maganda subraaa kaso rider napakasungit yung n...,"[maganda, subraaa, kaso, rider, napakasungit, ...",2,1,0,0
2506,nadismaya akod ganun kaganda,"[nadismaya, akod, ganun, kaganda]",0,0,0,1


In [241]:
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

38569 words total, with a vocabulary size of 5916
Max sentence length is 199


In [242]:
all_test_words = [word for tokens in data_test["tokens"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

4194 words total, with a vocabulary size of 1442
Max sentence length is 43


### Load Google News Word2Vec model

In [243]:
'''
#tagalog model embeddings
tagalog_model = gensim.models.KeyedVectors.load_word2vec_format('/content/gdrive/MyDrive/dataset/tagalog/cc.tl.300.vec')

#emoji model 
emoji_model = gensim.models.KeyedVectors.load_word2vec_format('/content/gdrive/MyDrive/dataset/emoji/emoji2vec.bin', binary=True)

#word2vec_path = '/content/gdrive/MyDrive/dataset/e/GoogleNews-vectors-negative300.bin.gz'
english_model = gensim.models.KeyedVectors.load_word2vec_format('/content/gdrive/MyDrive/dataset/e/GoogleNews-vectors-negative300.bin.gz', binary=True)
'''

"\n#tagalog model embeddings\ntagalog_model = gensim.models.KeyedVectors.load_word2vec_format('/content/gdrive/MyDrive/dataset/tagalog/cc.tl.300.vec')\n\n#emoji model \nemoji_model = gensim.models.KeyedVectors.load_word2vec_format('/content/gdrive/MyDrive/dataset/emoji/emoji2vec.bin', binary=True)\n\n#word2vec_path = '/content/gdrive/MyDrive/dataset/e/GoogleNews-vectors-negative300.bin.gz'\nenglish_model = gensim.models.KeyedVectors.load_word2vec_format('/content/gdrive/MyDrive/dataset/e/GoogleNews-vectors-negative300.bin.gz', binary=True)\n"

In [244]:
i = 0;

In [245]:
def embeddings(words):

  vectorized = []
  v = []

  for word in words:
    v.append(word)

    if(word in emoji_model):
      #try emoji embeddings first 
      vector = emoji_model[word]
      vectorized.append(vector)
      #print(vector)    
    else:
      if(word in tagalog_model):
        #try tagalog embeddings
        vector = tagalog_model[word]
        vectorized.append(vector)
        #print(vector)

      else: 
        if(word in english_model):
          #ilagay dito ang english embeddings
          vector = english_model[word]
          vectorized.append(vector)
          #print(vector)
        

  length = len(vectorized)

  summed = np.sum(vectorized, axis=0)
  averaged = np.divide(summed, length)
  return averaged

GAWA GAWAN EMBEDDINGS

### Get Embeddings

In [246]:
data_train['tokens']

2902    [lng, binalot, bubble, wrap, basang, basa, mad...
73      [ðÿ˜, over, ok, lahat, nung, item, sobrang, s...
1537    [okay, sana, kaso, sira, yung, tip, tapos, mak...
2638              [parehas, dun, pic, liit, nung, tinted]
621     [but, thing, ok, naman, sya, well, pack, kaya,...
                              ...                        
1638                                        [ok, quality]
1095    [okey, sya, medyo, mahina, nga, lng, sakto, pe...
1130    [maganda, tela, but, print, not, kasi, madali,...
1294    [maganda, naman, kaso, nagustuhan, yung, tela,...
860     [super, gandaaaaaaaaaaa, safe, mabilis, dumati...
Name: tokens, Length: 2701, dtype: object

In [247]:
len(data_train)

2701

In [248]:
training_embeddings = [embeddings(sen) for sen in data_train['tokens']]
len(training_embeddings)



2701

Get vectors value

In [249]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

Get vectors

In [250]:
def getVector(word):

  if(word in emoji_model):
    #try emoji embeddings first 
    vector = emoji_model[word]
      
  else:
    if(word in tagalog_model):
      #try tagalog embeddings
      vector = tagalog_model[word]
      

    else: 
      if(word in english_model):
        #ilagay dito ang english embeddings
        vector = english_model[word]
        
      else:
         vector = np.random.rand(EMBEDDING_DIM)
             
 
  length = len(vector)
  summed = np.sum(vector, axis=0)
  averaged = np.divide(summed, length)
  #return averaged
  return vector

### Tokenize and Pad sequences

In [251]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 5916 unique tokens.


In [252]:

#import pickle
'''
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

'''

"\nwith open('tokenizer.pickle', 'wb') as handle:\n    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n\n"

In [253]:
#training_sequences

In [254]:
'''
with open('/content/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
'''

"\nwith open('/content/tokenizer.pickle', 'rb') as handle:\n    tokenizer = pickle.load(handle)\n"

In [255]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [256]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
#for word,index in train_word_index.items():
  #train_embedding_weights[index,:] =
  #print(word)

print(len(train_embedding_weights[0]))

300


In [257]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
  train_embedding_weights[index,:] = getVector(word)
print(train_embedding_weights.shape)

(5917, 300)


In [258]:
len(train_embedding_weights)

5917

In [259]:
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [260]:
print(len(data_test["Text_Final"]))

301


### Define CNN

In [261]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [262]:
label_names = ['Pos', 'Neg','Neu']

In [263]:
y_train = data_train[label_names].values

In [264]:
x_train = train_cnn_data
y_tr = y_train

In [None]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

### Train CNN

In [266]:
len(x_train)

2701

In [267]:
num_epochs = 25
batch_size = 34

In [268]:
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)
#hist = model.fit(train_cnn_data, y_tr, epochs=num_epochs,validation_split=0.1 ,shuffle=True, batch_size=batch_size)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [269]:
MAX_SEQUENCE_LENGTH

50

### Test CNN

In [270]:
test_cnn_data[3]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,   13,   16,   95,    1, 1734,
        445,    1,  168,    5,   13,    2], dtype=int32)

In [271]:
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [272]:
labels = [2,0,1]

In [273]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [None]:
data_test.Label

In [275]:
sum(data_test.Label==prediction_labels)/len(prediction_labels)

0.8205980066445183

In [276]:
pos = 2
neg = 0
neu = 1

for label in prediction_labels:
  if(label==0):
    neg+=1

  if(label==1):
    neu+=1

  if(label==2):
    pos+=1

print('Predicted sentiments count:')
print('2 :',pos)
print('0 :',neg)
print('1 :',neu)

Predicted sentiments count:
2 : 113
0 : 133
1 : 58


In [277]:
print('Test datasets count')
data_test.Label.value_counts()

Test datasets count


2    108
0    105
1     88
Name: Label, dtype: int64

In [278]:
len(prediction_labels)

301

In [279]:
prediction_labels

[0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 2,
 2,
 1,
 0,
 2,
 1,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 2,
 0,
 2,
 0,
 0,
 0,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 1,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 1,
 0,
 1,
 2,
 1,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 2,
 1,
 0,
 1,
 0,
 2,
 1,
 0,
 2,
 1,
 1,
 2,
 2,
 0,
 1,
 2,
 0,
 2,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 2,
 2,
 2,
 0,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 2,
 0,
 2,
 0,
 2,
 1,
 0,
 2,
 0,
 2,
 0,
 2,
 0,
 1,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 0,
 0,
 2,
 1,
 2,
 2,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 2,
 2,
 2,
 0,
 2,
 1,
 0,
 2,
 2,
 2,
 2,
 1,
 1,
 0,
 1,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 2,
 0,
 2,
 1,
 0,
 2,
 0,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 0,


In [280]:
model.save('tag-lish_cnn.h5')
import keras

In [281]:
m = keras.models.load_model('/content/tag-lish_cnn.h5')

In [282]:
predictions = m.predict(test_cnn_data, batch_size=1024, verbose=1)



In [283]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [284]:
prediction_labels

[0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 2,
 2,
 1,
 0,
 2,
 1,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 2,
 0,
 2,
 0,
 0,
 0,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 1,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 1,
 0,
 1,
 2,
 1,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 2,
 1,
 0,
 1,
 0,
 2,
 1,
 0,
 2,
 1,
 1,
 2,
 2,
 0,
 1,
 2,
 0,
 2,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 2,
 2,
 2,
 0,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 2,
 0,
 2,
 0,
 2,
 1,
 0,
 2,
 0,
 2,
 0,
 2,
 0,
 1,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 0,
 0,
 2,
 1,
 2,
 2,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 2,
 2,
 2,
 0,
 2,
 1,
 0,
 2,
 2,
 2,
 2,
 1,
 1,
 0,
 1,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 2,
 0,
 2,
 1,
 0,
 2,
 0,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 0,


pang ayos ng input

In [285]:
input = "Sayang 30 pang load din sana"


In [286]:
def listToString(s): 
    
    # initialize an empty string
    str1 = " " 
    
    # return string  
    return (str1.join(s))

In [287]:
clean = remove_punct(input)
clean= word_tokenize(clean)
clean = lowerStemmer(clean)
clean = removeStopWords(clean)
clean = removeTagalogStopWords(clean)
print('clean',clean)
input = listToString(clean)
print(input)

data['input'] = input

clean_sequences  = tokenizer.texts_to_sequences(data['input'].tolist())
clean_input = pad_sequences(clean_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(clean_input[0:1])

NameError: ignored

PREDICT

In [None]:
input_predictions = model.predict(clean_input[0:1], batch_size=1024, verbose=1)

In [None]:
input_prediction_labels=[]
for p in input_predictions:
  print(p)
  input_prediction_labels.append(labels[np.argmax(p)])

In [None]:
print(input_prediction_labels)

Tagalog-English Stopwords

In [None]:
len(TRAINING_VOCAB)

In [None]:
#!pip install emoji

In [None]:

from emoji import UNICODE_EMOJI

# search your emoji
def is_emoji(s):
    return s in UNICODE_EMOJI['en']


In [None]:
import re

string = "at gh❤️ time❤️ 🥰🤩🥰  what time ?"
string_token = word_tokenize(string)

tokens = []

for token in string_token:
  tokens.append(token)
  for i in range(len(token)):
    if(is_emoji(token[i])):
      tokens.append(token[i])


print(tokens)