[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/hamidrg/Textual_Emotion_detect/blob/master/RNNs/LSTM.ipynb)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
import kagglehub
path = kagglehub.dataset_download("akhilvibhakar/isear-raw")

df = pd.read_csv(path + "/ISEAR-raw-utf8.csv")
df.head()

Downloading from https://www.kaggle.com/api/v1/datasets/download/akhilvibhakar/isear-raw?dataset_version_number=1...


100%|██████████| 507k/507k [00:00<00:00, 668kB/s]

Extracting files...





Unnamed: 0,ID,CITY,COUN,SUBJ,SEX,AGE,RELI,PRAC,FOCC,MOCC,...,SELF,RELA,VERBAL,NEUTRO,Field1,Field3,Field2,MYKEY,SIT,STATE
0,11001,1,1,1,1,33,1,2,6,1,...,3,3,2,0,joy,4,3,110011,"During the period of falling in love, each tim...",1
1,11001,1,1,1,1,33,1,2,6,1,...,2,2,0,0,fear,3,2,110012,When I was involved in a traffic accident.,1
2,11001,1,1,1,1,33,1,2,6,1,...,2,1,0,0,anger,1,3,110013,When I was driving home after several days of...,1
3,11001,1,1,1,1,33,1,2,6,1,...,1,1,0,2,sadness,4,4,110014,When I lost the person who meant the most to me.,1
4,11001,1,1,1,1,33,1,2,6,1,...,0,2,0,0,disgust,4,4,110015,The time I knocked a deer down - the sight of ...,1


In [None]:
import spacy

# Load English model
nlp = spacy.load("en_core_web_sm")

In [None]:
import re

def clean_text_spacy(text):
    # Remove user handles and URLs
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r"http\S+|www.\S+", '', text)

    doc = nlp(text.lower())

    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop               # remove stopwords
        and not token.is_punct             # remove punctuation
        and not token.like_url             # remove urls
        and not token.like_email           # remove emails
        and not token.is_space             # remove white space
    ]

    return " ".join(tokens)

In [None]:
df['Clean_Text'] = df['SIT'].apply(clean_text_spacy)

In [None]:
df.head()

Unnamed: 0,ID,CITY,COUN,SUBJ,SEX,AGE,RELI,PRAC,FOCC,MOCC,...,RELA,VERBAL,NEUTRO,Field1,Field3,Field2,MYKEY,SIT,STATE,Clean_Text
0,11001,1,1,1,1,33,1,2,6,1,...,3,2,0,joy,4,3,110011,"During the period of falling in love, each tim...",1,period fall love time meet á especially meet l...
1,11001,1,1,1,1,33,1,2,6,1,...,2,0,0,fear,3,2,110012,When I was involved in a traffic accident.,1,involve traffic accident
2,11001,1,1,1,1,33,1,2,6,1,...,1,0,0,anger,1,3,110013,When I was driving home after several days of...,1,drive home day hard work á motorist ahead driv...
3,11001,1,1,1,1,33,1,2,6,1,...,1,0,2,sadness,4,4,110014,When I lost the person who meant the most to me.,1,lose person mean
4,11001,1,1,1,1,33,1,2,6,1,...,2,0,0,disgust,4,4,110015,The time I knocked a deer down - the sight of ...,1,time knock deer sight animal á injury helpless...


In [None]:
X = df['Clean_Text'].values
y = df['Field1'].values


le = LabelEncoder()
y_encoded = le.fit_transform(y)


tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)


maxlen = 400
X_padded = pad_sequences(sequences, maxlen=maxlen, padding='post')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

In [None]:
from sklearn.utils import class_weight


class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)


class_weights_dict = dict(enumerate(class_weights))
print(class_weights_dict)

{0: np.float64(1.0034364261168385), 1: np.float64(1.0045871559633028), 2: np.float64(0.9864864864864865), 3: np.float64(0.9864864864864865), 4: np.float64(0.9988597491448119), 5: np.float64(1.0138888888888888), 6: np.float64(1.006896551724138)}


In [None]:

model = Sequential([
    Embedding(input_dim=8273, output_dim=300, input_length=maxlen),
    LSTM(128, return_sequences=True),
    GlobalMaxPooling1D(),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])
model.build(input_shape=(None, maxlen))
model.summary()



In [None]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
from tensorflow.keras.callbacks import ReduceLROnPlateau

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',     # What to monitor
    factor=0.02,              # Reduce by half
    patience=2,              # If no improvement for 2 epochs
    min_lr=1e-6,             # Lower bound on learning rate
    verbose=1                # Print updates
)

history = model.fit(
    X_train, y_train,
    epochs=10,
    validation_data=(X_test, y_test),
    batch_size=128,
    callbacks=[early_stop, reduce_lr],
    class_weight=class_weights_dict
)

Epoch 1/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 56ms/step - accuracy: 0.2080 - loss: 1.9316 - val_accuracy: 0.4615 - val_loss: 1.7443 - learning_rate: 0.0010
Epoch 2/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.5406 - loss: 1.5087 - val_accuracy: 0.5665 - val_loss: 1.2548 - learning_rate: 0.0010
Epoch 3/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.7205 - loss: 0.9143 - val_accuracy: 0.5671 - val_loss: 1.2368 - learning_rate: 0.0010
Epoch 4/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.8034 - loss: 0.6291 - val_accuracy: 0.5665 - val_loss: 1.3288 - learning_rate: 0.0010
Epoch 5/10
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.8581 - loss: 0.4628
Epoch 5: ReduceLROnPlateau reducing learning rate to 2.0000000949949027e-05.
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 

In [None]:
model.save("/content/drive/MyDrive/End2End-NLP-Project-LSTM-ISEAR.h5")

# from tensorflow.keras.models import load_model
# loaded_model = load_model("emotion_model.h5")



In [None]:
loss, acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", acc)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5660 - loss: 1.2381
Test Accuracy: 0.5671446919441223


In [None]:
def predict_emotion(text):
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=maxlen, padding='post')
    pred = model.predict(padded)
    class_idx = np.argmax(pred)
    return le.inverse_transform([class_idx])[0]

In [None]:
print(predict_emotion("I'm so excited to watch this movie!"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
disgust


In [None]:
from sklearn.metrics import classification_report

# 1. Predict class probabilities
y_pred_probs = model.predict(X_test)

# 2. Convert probabilities to class labels
y_pred = y_pred_probs.argmax(axis=1)

# 3. Print classification report (Precision, Recall, F1, Accuracy per class)
print(classification_report(y_test, y_pred, digits=4))

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
              precision    recall  f1-score   support

           0     0.4385    0.5112    0.4720       223
           1     0.5538    0.6205    0.5853       224
           2     0.6900    0.6667    0.6781       207
           3     0.4540    0.3854    0.4169       205
           4     0.6637    0.6912    0.6772       217
           5     0.6623    0.6595    0.6609       232
           6     0.5052    0.4292    0.4641       226

    accuracy                         0.5671      1534
   macro avg     0.5668    0.5662    0.5649      1534
weighted avg     0.5669    0.5671    0.5654      1534

