In [19]:
import numpy as np
import pandas as pd
import emoji
import re
from textblob import TextBlob

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SimpleRNN, Embedding

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [20]:
data = pd.read_csv('../Data/Train.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,TEXT,Label
0,0,Vacation wasted ! #vacation2017 #photobomb #ti...,0
1,1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1
2,2,Been friends since 7th grade. Look at us now w...,2
3,3,This is what it looks like when someone loves ...,3
4,4,RT @user this white family was invited to a Bl...,3


In [21]:
data['Label'].value_counts().nunique()

20

In [22]:
data['TEXT']=data['TEXT'].str.lower()

In [23]:
pd.set_option('display.max_columns',None)

In [24]:
from string import punctuation
exclude=punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [25]:
data['TEXT'][1]


'oh wynwood, you’re so funny! : @user #wynwood #art #itwas3am #flowers #vibes @ wynwood…\n'

In [26]:
def remove_punctuations(text):
    for char in exclude:
        text=text.replace(char,'')
    return text

In [27]:
data['TEXT']=data['TEXT'].apply(remove_punctuations)

In [28]:
data['TEXT'][0]

'vacation wasted  vacation2017 photobomb tired vacationwasted mcgar30 miami  port of…\n'

In [29]:
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [30]:
import nltk
nltk.download('stopwords')

def preprocess(text):
    # กรองเฉพาะตัวอักษรภาษาอังกฤษ
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # แก้คำผิด (ช้าในข้อมูลเยอะ)
    try:
        textBlb = TextBlob(text)
        text = textBlb.correct().string
    except Exception as e:
        print("Spell correction error:", e)

    # Tokenize ด้วย split แทน word_tokenize
    tokens = text.lower().split()

    # ลบ stopwords และ stem คำ
    filtered_words = [
        stemmer.stem(word) for word in tokens
        if word not in stopwords.words('english')
    ]

    # รวมคำกลับเป็นข้อความ
    text = ' '.join(filtered_words)

    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\usEr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

[]


In [32]:
data['TEXT']= data['TEXT'].apply(preprocess)

KeyboardInterrupt: 

In [None]:
data['TEXT'][0]

'vacation wasted  vacation2017 photobomb tired vacationwasted mcgar30 miami  port of…\n'

In [None]:
emoji_map=pd.read_csv("../Data/Mapping.csv")

emoji_map.drop(columns=["Unnamed: 0"],inplace=True)
emoji_map.head()

Unnamed: 0,emoticons,number
0,😜,0
1,📸,1
2,😍,2
3,😂,3
4,😉,4


In [None]:
emoji_map.set_index('number').to_dict()

{'emoticons': {0: '😜',
  1: '📸',
  2: '😍',
  3: '😂',
  4: '😉',
  5: '🎄',
  6: '📷',
  7: '🔥',
  8: '😘',
  9: '❤',
  10: '😁',
  11: '🇺🇸',
  12: '☀',
  13: '✨',
  14: '💙',
  15: '💕',
  16: '😎',
  17: '😊',
  18: '💜',
  19: '💯'}}

In [None]:
X = data['TEXT'].values
Y = data['Label'].values

In [None]:
file = open('../Data/glove.6B.100d.txt', 'r', encoding = 'utf8')
content = file.readlines()
file.close()

In [None]:
embeddings = {}

for line in content:
    line = line.split()
    embeddings[line[0]] = np.array(line[1:], dtype = float)

In [None]:
embeddings['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [None]:
def get_maxlen(data):
    maxlen = 0
    for sent in data:
        maxlen = max(maxlen, len(sent))
    return maxlen

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word2index = tokenizer.word_index

In [None]:
word2index['the']

1

In [None]:
word2index['and']

7

In [None]:
Xtokens = tokenizer.texts_to_sequences(X)
maxlen = get_maxlen(Xtokens)
print(maxlen)
Xtrain = pad_sequences(Xtokens, maxlen = maxlen,  padding = 'post', truncating = 'post')

34


In [None]:
Xtokens[:2]

[[637, 6307, 15797, 5332, 1762, 24343, 24344, 149, 1384, 294],
 [215, 1361, 1148, 20, 621, 2, 1361, 208, 24345, 1404, 488, 8656]]

In [None]:
Y[1]

np.int64(1)

In [None]:
# change to onehotencoding
Ytrain = to_categorical(Y)
Ytrain.shape

(70000, 20)

In [None]:
word2index['i']

6

In [None]:
embed_size = 100
# index: total number of words and column will be the embed size to make embeded matrix
embedding_matrix = np.zeros((len(word2index)+1, embed_size))

for word, i in word2index.items():
    if word in embeddings:
#         embedding_matrix[i] = embeddings[word]
#         print(word)
        embedding_matrix[i] = embeddings[word]
        embed_vector = embeddings[word]
        embedding_matrix[i] = embed_vector
    else:
        print(word)
        embedding_matrix[i] = np.random.normal(0, 1, embed_size)

️
the…
it’s
and…
of…
to…
i’m
selfie
️…
•
a…
you…
for…
i…
amp…
at…
my…
bday
in…
by…
nofilter
don’t
with…
this…
new…
can’t
love…
bestie
lmao
latergram
on…
so…
is…
beach…
merrychristmas
bestfriend
our…
me…
your…
lasvegas
you’re
sundayfunday
newyorkcity
be…
christmastree
we…
family…
san…
out…
it…
besties
happy…
from…
familytime
north…
y’all
instagood
youve
foodporn
i’ve
was…
we’re
i’ll
los…
4thofjuly
selfies
vsco
throwbackthursday
angeles…
i️
music…
datenight
that…
state…
south…
regram
ootd
vscocam
all…
washingtondc
day…
nyc…
happybirthday
dogsofinstagram
an…
are…
photooftheday
happyholidays
nyfw
downtown…
high…
hbd
one…
lake…
york…
beautiful…
get…
atampt
fourthofjuly
city…
she’s
happynewyear
hoco
latepost
magickingdom
let’s
park…
favs
university…
amazing…
great…
chicago…
balayage
linkinbio
here’s
thanks…
centralpark
mylove
tonights
didn’t
repost…
that’s
picoftheday
fun…
idk
dtla
ivoted
us…
waltdisneyworld
friyay
litty
more…
miami…
he’s
birthday…
southbeach
see…
now…
happythanksgiving
tist

In [None]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487   ,  0.72812   , ..., -0.1459    ,
         0.8278    ,  0.27062   ],
       [-0.53907   ,  0.033098  ,  0.52285   , ..., -0.87803   ,
         0.53809   , -0.29624   ],
       ...,
       [ 1.10904876, -0.64099899,  0.00752356, ..., -0.92447647,
         0.55117331,  1.38853636],
       [ 0.14505376,  0.40668914, -1.29351867, ...,  0.67521599,
         0.77311842, -1.50665193],
       [ 0.97447066,  0.96112082, -0.53779363, ..., -1.03920264,
        -0.41775421,  0.38666563]])

In [None]:
model = Sequential([
    Embedding(input_dim = len(word2index) + 1,
              output_dim = embed_size,
              input_length = maxlen,
              weights = [embedding_matrix],
              trainable = False
             ),
    
    LSTM(units = 16, return_sequences = True),
    LSTM(units = 4),
    Dense(20, activation = 'softmax')
])

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])



In [None]:
Xtrain.shape

(70000, 34)

In [None]:
Ytrain.shape

(70000, 20)

In [None]:
model.fit(Xtrain, Ytrain, epochs = 25)

Epoch 1/25
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 7ms/step - accuracy: 0.2098 - loss: 2.7777
Epoch 2/25
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - accuracy: 0.2292 - loss: 2.5892
Epoch 3/25
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step - accuracy: 0.2529 - loss: 2.4806
Epoch 4/25
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - accuracy: 0.2745 - loss: 2.4316
Epoch 5/25
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.2824 - loss: 2.3932
Epoch 6/25
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.2869 - loss: 2.3728
Epoch 7/25
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.2921 - loss: 2.3532
Epoch 8/25
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.2974 - loss: 2.3410
Epoch 9/25
[1m2

<keras.src.callbacks.history.History at 0x1975539bd40>

In [None]:
import pickle
with open('../src/model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [None]:
test = ["I feel good", "I feel not good", "lets eat dinner"]

test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen = maxlen, padding = 'post', truncating = 'post')

y_pred = model.predict(Xtest)
y_pred = np.argmax(y_pred, axis = 1)
y_pred

for i in range(len(test)):
    print(test[i], emoji_map['emoticons'][i])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
I feel good 😜
I feel not good 📸
lets eat dinner 😍
