In [430]:
import numpy as np
import pandas as pd
import emoji
import re
from textblob import TextBlob

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SimpleRNN, Embedding

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [431]:
data = pd.read_csv('../Data/Train.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,TEXT,Label
0,0,Vacation wasted ! #vacation2017 #photobomb #ti...,0
1,1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1
2,2,Been friends since 7th grade. Look at us now w...,2
3,3,This is what it looks like when someone loves ...,3
4,4,RT @user this white family was invited to a Bl...,3


In [432]:
data['Label'].value_counts()

Label
9     15091
2      7076
3      6842
7      4363
15     3843
13     3250
16     2832
17     2751
1      2592
11     2434
14     2342
6      2083
8      1992
5      1977
12     1909
4      1878
19     1745
18     1722
10     1721
0      1557
Name: count, dtype: int64

In [433]:
Xtest = data['TEXT'].tolist()

In [434]:
Xtest = Xtest[60000:]

In [435]:
Xtest

['Pac Man @ Otay Ranch Town Center\n',
 'Sleepin at Sassafras Ridge @ A Day At The Farm\n',
 "Pudding'd &amp; Therapied !!! Waiting for Radiation Ride to kick off the weekend. @ St. Luke's-Elks Rehab\n",
 'TONIGHT!!!! Yall already know where to be. YACHT NOT LIT Yall heard djyoungchow , yall…\n',
 "Cuteness. My cup runneth over. ️ @ Long's Pool &amp; Deck Bar\n",
 '@user was so lit last night, you do not want to see the after pictures @ Sound Academy\n',
 'New music this week. #venus #rnbhouston #rnblovers @user @ San Diego, California\n',
 'Forever an infrastructure nerd, I ️ the various creatures that patrol these depths. This is the…\n',
 'We choose to be @ VXI Global Solutions\n',
 'Killing some nail are here for thessaclarke16 this shape is on her hands…\n',
 'Fjeld Duet By Christopher House #dancerlife @ The Winchester…\n',
 'A picture of my being super touristy. : taylorantle #chicago #thebean…\n',
 'Before and After magical chin rubs... #ginger #persian #happysunday @ South of 

In [436]:
data['Label'].value_counts().nunique()

20

🔹 Step 1: Lowercase all text

In [437]:
data['TEXT'] = data['TEXT'].str.lower()

🔹 Step 2: Remove URLs, mentions, hashtags, HTML

In [438]:
import re

def clean_text(text):
    text = re.sub(r"http\\S+|www\\S+", "", text)  # URLs
    text = re.sub(r"@\\w+", "", text)            # Mentions
    text = re.sub(r"#\\w+", "", text)            # Hashtags
    text = re.sub(r"<.*?>", "", text)            # HTML tags
    text = re.sub(r"[^a-zA-Z\\s]", "", text)     # Special chars
    return text.strip()

data['TEXT'] = data['TEXT'].apply(clean_text)

🔹 Step 3: Fix spacing and repeat letters

In [439]:
def fix_repeat(text):
    return re.sub(r'(.)\\1{2,}', r'\\1\\1', text)

data['TEXT'] = data['TEXT'].apply(fix_repeat)

🔹 Step 4: Remove stopwords

In [440]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = text.split()
    return ' '.join([word for word in tokens if word not in stop_words])

data['TEXT'] = data['TEXT'].apply(remove_stopwords)

In [441]:
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [442]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from autocorrect import Speller

spell = Speller(lang='en')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def full_clean(text):
    tokens = text.split()
    tokens = [w for w in tokens if w not in stopwords.words('english')]
    tokens = [stemmer.stem(w) for w in tokens]
    return ' '.join(tokens)

data['TEXT'] = data['TEXT'].apply(full_clean)

In [443]:
emoji_map=pd.read_csv("../Data/Mapping.csv")

emoji_map.drop(columns=["Unnamed: 0"],inplace=True)
emoji_map.head()

Unnamed: 0,emoticons,number
0,😜,0
1,📸,1
2,😍,2
3,😂,3
4,😉,4


In [444]:
emoji_map.set_index('number').to_dict()

{'emoticons': {0: '😜',
  1: '📸',
  2: '😍',
  3: '😂',
  4: '😉',
  5: '🎄',
  6: '📷',
  7: '🔥',
  8: '😘',
  9: '🥰',
  10: '😁',
  11: '🇺🇸',
  12: '☀',
  13: '✨',
  14: '💙',
  15: '💕',
  16: '😎',
  17: '😊',
  18: '💜',
  19: '💯'}}

In [445]:
X = data['TEXT'].values
Y = data['Label'].values

In [446]:
file = open('../Data/glove.6B.100d.txt', 'r', encoding = 'utf8')
content = file.readlines()
file.close()

In [447]:
embeddings = {}

for line in content:
    line = line.split()
    embeddings[line[0]] = np.array(line[1:], dtype = float)

In [448]:
embeddings['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [449]:
data

Unnamed: 0.1,Unnamed: 0,TEXT,Label
0,0,vacationwastedvacationphotobombtiredvacationwa...,0
1,1,ohwynwoodyouresofunnyuserwynwoodartitwasamflow...,1
2,2,beenfriendssincethgradelookatusnowweallfollowi...,2
3,3,thisiswhatitlookslikewhensomeonelovesyouuncond...,3
4,4,rtuserthiswhitefamilywasinvitedtoablackbarbecu...,3
...,...,...,...
69995,69995,yesicallgalinamybubiegofollowmybeautifulfriend...,3
69996,69996,iseayouseattleballardseafoodfestiv,16
69997,69997,ifoneofmydaughtersiswearingthisandasksmeforice...,2
69998,69998,guesswhowhooppeopleontheirhomecomingasuramsatl...,3


In [450]:
def get_maxlen(data):
    maxlen = 0
    for sent in data:
        maxlen = max(maxlen, len(sent))
    return maxlen

In [451]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

import pickle
with open('../src/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)
word2index = tokenizer.word_index

In [452]:
Xtokens = tokenizer.texts_to_sequences(X)
maxlen = get_maxlen(Xtokens)
print(maxlen)
Xtrain = pad_sequences(Xtokens, maxlen = maxlen,  padding = 'post', truncating = 'post')

2


In [453]:
# change to onehotencoding
Ytrain = to_categorical(Y)
Ytrain.shape

(70000, 20)

In [454]:
# โหลดไฟล์ GloVe
glove_path = '../Data/glove.6B.100d.txt'

embedding_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeffs

In [455]:
embedding_matrix = np.zeros((len(word2index) + 1, 100))

for word, i in word2index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [463]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense


model = Sequential([
    Embedding(input_dim = len(word2index) + 1,
              output_dim = 100,  # dimension of the embedding
              input_length = 21,   # maxlen 
              weights = [embedding_matrix],
              trainable = True
             ),
    
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    LSTM(32),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(20, activation='softmax')
])

model.compile(optimizer = 'SGD', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [464]:
Ytrain

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [485]:
Xtrain.shape

(60000, 2)

In [486]:
Ytrain.shape

(60000, 20)

In [487]:
Ytest  = Ytrain[60000:]
Xtrain  = Xtrain[:60000]
Ytrain  = Ytrain[:60000]


In [488]:
Ytest

array([], shape=(0, 20), dtype=float64)

In [489]:
# import numpy as np

# y_train_classes = np.argmax(Ytrain, axis=1)

In [490]:
# from sklearn.utils.class_weight import compute_class_weight

# classes = np.unique(y_train_classes)
# weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_classes)
# class_weights = dict(zip(classes, weights))

In [491]:
class_weights 

{np.int64(0): np.float64(2.2189349112426036),
 np.int64(1): np.float64(1.3464991023339319),
 np.int64(2): np.float64(0.49480455220188024),
 np.int64(3): np.float64(0.510899182561308),
 np.int64(4): np.float64(1.893939393939394),
 np.int64(5): np.float64(1.7678255745433118),
 np.int64(6): np.float64(1.7035775127768313),
 np.int64(7): np.float64(0.7882291119285338),
 np.int64(8): np.float64(1.772002362669817),
 np.int64(9): np.float64(0.23237800154918667),
 np.int64(10): np.float64(2.027027027027027),
 np.int64(11): np.float64(1.4478764478764479),
 np.int64(12): np.float64(1.8359853121175032),
 np.int64(13): np.float64(1.0676156583629892),
 np.int64(14): np.float64(1.480019733596448),
 np.int64(15): np.float64(0.9179926560587516),
 np.int64(16): np.float64(1.2350761630300535),
 np.int64(17): np.float64(1.274426508071368),
 np.int64(18): np.float64(2.0394289598912305),
 np.int64(19): np.float64(2.014775016789792)}

In [492]:
model.fit(Xtrain, Ytrain, epochs = 10, batch_size = 32)

Epoch 1/10


[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.2076 - loss: 2.8817
Epoch 2/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.2171 - loss: 2.7378
Epoch 3/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.2147 - loss: 2.7357
Epoch 4/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.2160 - loss: 2.7333
Epoch 5/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.2156 - loss: 2.7314
Epoch 6/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.2144 - loss: 2.7397
Epoch 7/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.2159 - loss: 2.7334
Epoch 8/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.2144 - loss: 2.7381
Epoch 9/10
[1m1875/1875[0

<keras.src.callbacks.history.History at 0x1d37bd5ddc0>

In [493]:
import pickle
model.save("../src/model.keras")

In [494]:
from sklearn.metrics import classification_report



In [495]:
Xtest

['Pac Man @ Otay Ranch Town Center\n',
 'Sleepin at Sassafras Ridge @ A Day At The Farm\n',
 "Pudding'd &amp; Therapied !!! Waiting for Radiation Ride to kick off the weekend. @ St. Luke's-Elks Rehab\n",
 'TONIGHT!!!! Yall already know where to be. YACHT NOT LIT Yall heard djyoungchow , yall…\n',
 "Cuteness. My cup runneth over. ️ @ Long's Pool &amp; Deck Bar\n",
 '@user was so lit last night, you do not want to see the after pictures @ Sound Academy\n',
 'New music this week. #venus #rnbhouston #rnblovers @user @ San Diego, California\n',
 'Forever an infrastructure nerd, I ️ the various creatures that patrol these depths. This is the…\n',
 'We choose to be @ VXI Global Solutions\n',
 'Killing some nail are here for thessaclarke16 this shape is on her hands…\n',
 'Fjeld Duet By Christopher House #dancerlife @ The Winchester…\n',
 'A picture of my being super touristy. : taylorantle #chicago #thebean…\n',
 'Before and After magical chin rubs... #ginger #persian #happysunday @ South of 

In [496]:
#test = ["I love this product! It's amazing and works great!"," I hate this product! It's terrible and doesn't work at all!","This is the best purchase I've ever made!","I will never buy this again, it's awful!"]

test_seq = tokenizer.texts_to_sequences(Xtest)
X_test = pad_sequences(test_seq, maxlen = maxlen, padding = 'post', truncating = 'post')





In [497]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis = 1)
y_pred 

for i in range(len(Xtest)):
    print(Xtest[i], emoji_map['emoticons'][y_pred[i]])

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Pac Man @ Otay Ranch Town Center
 🥰
Sleepin at Sassafras Ridge @ A Day At The Farm
 🥰
Pudding'd &amp; Therapied !!! Waiting for Radiation Ride to kick off the weekend. @ St. Luke's-Elks Rehab
 🥰
TONIGHT!!!! Yall already know where to be. YACHT NOT LIT Yall heard djyoungchow , yall…
 🥰
Cuteness. My cup runneth over. ️ @ Long's Pool &amp; Deck Bar
 🥰
@user was so lit last night, you do not want to see the after pictures @ Sound Academy
 🥰
New music this week. #venus #rnbhouston #rnblovers @user @ San Diego, California
 🥰
Forever an infrastructure nerd, I ️ the various creatures that patrol these depths. This is the…
 🥰
We choose to be @ VXI Global Solutions
 🥰
Killing some nail are here for thessaclarke16 this shape is on her hands…
 🥰
Fjeld Duet By Christopher House #dancerlife @ The Winchester…
 🥰
A picture of my being super touristy. : taylorantle #chicago #thebean…
 🥰
Before and After magical chin rubs... #gin

In [498]:
y_pred

array([9, 9, 9, ..., 9, 9, 9])

In [499]:
y_true = np.argmax(Ytest, axis=1)

In [500]:
y_true

array([], dtype=int64)

In [501]:
y_pred

array([9, 9, 9, ..., 9, 9, 9])

In [502]:
import numpy as np

values, counts = np.unique(y_pred, return_counts=True)
print(dict(zip(values, counts)))

{np.int64(9): np.int64(10000)}


In [503]:

values, counts = np.unique(y_pred, return_counts=True)
print(dict(zip(values, counts)))

{np.int64(9): np.int64(10000)}
