# Emotion Classification with FastText and LSTM

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import load_model

import zipfile


# Data Preparation

In [None]:
# All three splits share one layout: ';'-separated, no header row, with the
# text in the first column and the emotion label in the second.
train_df, val_df, test_df = [
    pd.read_csv(split_path, sep=';', header=None, names=['text', 'label'])
    for split_path in (
        '/content/drive/MyDrive/Colab Notebooks/emotion_dataset/train.txt',
        '/content/drive/MyDrive/Colab Notebooks/emotion_dataset/val.txt',
        '/content/drive/MyDrive/Colab Notebooks/emotion_dataset/test.txt',
    )
]

# Download the NLTK stopword corpus, then keep the English list as a set so
# clean_text can do fast membership checks.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Return a normalised copy of ``text`` for tokenisation.

    Steps, in order: drop every character that is neither a word character
    nor whitespace, lowercase, drop digit runs, remove tokens found in the
    module-level ``stop_words`` set, and collapse whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string with tokens separated by single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    lowered = without_punctuation.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    kept_tokens = [token for token in without_digits.split() if token not in stop_words]
    return re.sub(r'\s+', ' ', ' '.join(kept_tokens)).strip()


# Apply the same cleaning function to the text column of every split.
train_df['text'] = train_df['text'].apply(clean_text)
val_df['text'] = val_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

# Integer id for each of the six emotion names.
label_to_int = {'joy': 0, 'sadness': 1, 'anger': 2, 'fear': 3, 'love': 4, 'surprise': 5}

# Replace the label strings with their integer ids, in place.
# NOTE(review): a label missing from label_to_int would presumably become NaN
# under Series.map — confirm the files contain only these six names.
train_df['label'] = train_df['label'].map(label_to_int)
val_df['label'] = val_df['label'].map(label_to_int)
test_df['label'] = test_df['label'].map(label_to_int)

# Vocabulary cap handed to the tokenizer.
max_words = 10000

# The vocabulary is fitted on the training text only; validation and test
# text are converted with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['text'])

# Each text becomes a list of integer word indices.
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_val = tokenizer.texts_to_sequences(val_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])

# Bring every sequence to the same length (pad_sequences is used with its
# default padding/truncating behaviour).
max_len = 100
X_train = pad_sequences(X_train, maxlen=max_len)
X_val = pad_sequences(X_val, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

# Targets are the integer-encoded label columns (pandas Series).
y_train = train_df['label']
y_val = val_df['label']
y_test = test_df['label']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Download FastText

In [None]:
# https://fasttext.cc/docs/en/english-vectors.html

In [None]:
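# NOTE: the commands below fetch the Common Crawl vectors (cc.en.300), whereas
# the following cells load wiki-news-300d-1M.vec.zip from Google Drive — a
# different FastText release (presumably obtained from the page linked above).
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
# !gunzip cc.en.300.vec.gz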

In [None]:
# Location of the zipped FastText vectors on the mounted Google Drive.
fasttext_file_path = '/content/drive/MyDrive/Colab Notebooks/wiki-news-300d-1M.vec.zip'

In [None]:
# Extract the whole archive into /content/fasttext; the context manager
# closes the zip file afterwards.
with zipfile.ZipFile(fasttext_file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/fasttext')

In [None]:
# Path of the text-format vector file inside the extraction directory.
fasttext_vec_path = '/content/fasttext/wiki-news-300d-1M.vec'

# Load FastText Vectors and Create Embedding Matrix

In [None]:
def load_fasttext_embeddings(file_path):
    """Parse a FastText text-format (.vec) file into a word -> vector mapping.

    Every data line holds a token followed by its space-separated float
    components.  Files in this format normally start with a
    "<vocab_size> <dimension>" header line; it is detected and skipped so it
    is not stored as a bogus one-component "word".

    Args:
        file_path: Path to the uncompressed .vec file.  It is read as UTF-8
            and undecodable bytes are ignored.

    Returns:
        dict mapping each token (str) to a 1-D float32 numpy array.  When a
        header is present, only rows with exactly <dimension> components are
        kept, so all returned vectors have the same length.
    """
    embeddings_index = {}
    expected_dim = None  # only known when the file starts with a header line
    with open(file_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for line_number, line in enumerate(f):
            line = line.rstrip()
            if not line:
                continue  # blank line: nothing to parse

            if line_number == 0:
                header = line.split()
                if len(header) == 2 and all(part.isdecimal() for part in header):
                    expected_dim = int(header[1])
                    continue

            if expected_dim is None:
                values = line.split(' ')
            else:
                # Split from the right so a token that itself contains a
                # space cannot shift the numeric columns.
                values = line.rsplit(' ', expected_dim)
                if len(values) != expected_dim + 1:
                    continue  # truncated or malformed row

            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Read the extracted vector file into an in-memory word -> vector dict.
embeddings_index = load_fasttext_embeddings(fasttext_vec_path)

In [None]:
embedding_dim = 300
# One row per tokenizer index below max_words.  A row stays all-zero when its
# index has no word or the word has no pretrained vector.
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue  # index falls outside the matrix
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue  # no pretrained vector for this word
    embedding_matrix[i] = embedding_vector

# Modeling

In [None]:
# Frozen pretrained embeddings -> single LSTM -> batch norm -> dropout ->
# softmax over the six emotion classes.
# NOTE(review): `input_length` is reportedly deprecated in Keras 3 — confirm
# against the installed Keras version before upgrading.
model = Sequential([
    Embedding(input_dim=max_words,
              output_dim=embedding_dim,
              input_length=max_len,
              weights=[embedding_matrix],
              trainable=False, name='embedding_layer'),
    LSTM(128),
    BatchNormalization(),
    Dropout(0.2),
    Dense(6, activation='softmax')
])

# The targets are integer class ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy'])

model.summary()

# Stop when the validation loss has not improved for 10 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    restore_best_weights=True)

# Up to 100 epochs; early stopping decides the actual number.
history = model.fit(X_train, y_train,
                    epochs=100,
                    validation_data=(X_val, y_val),
                    batch_size=32,
                    verbose=1,
                    callbacks=[early_stopping])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding  (None, 100, 300)          3000000   
 )                                                               
                                                                 
 lstm (LSTM)                 (None, 128)               219648    
                                                                 
 batch_normalization (Batch  (None, 128)               512       
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 6)                 774       
                                                                 
Total params: 3220934 (12.29 MB)
Trainable params: 220678

# Model Performance

In [None]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

# Report both figures for the validation and the test split.
print(f"Val Loss: {val_loss}", f"Val Accuracy: {val_accuracy}")
print(f"Test Loss: {test_loss}", f"Test Accuracy: {test_accuracy}")

Val Loss: 0.16910257935523987 Val Accuracy: 0.9340000152587891
Test Loss: 0.1719626933336258 Test Accuracy: 0.9265000224113464


In [None]:
# Class scores for the test split; the predicted class is the position of
# the largest score in each row.
y_pred = model.predict(X_test, verbose=0)
y_pred_classes = np.argmax(y_pred, axis=-1)

# Overall accuracy and the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred_classes)
classification_rep = classification_report(y_test, y_pred_classes)

print(accuracy)
print(classification_rep)

0.9265
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       695
           1       0.97      0.96      0.96       581
           2       0.91      0.94      0.93       275
           3       0.89      0.91      0.90       224
           4       0.87      0.77      0.82       159
           5       0.88      0.64      0.74        66

    accuracy                           0.93      2000
   macro avg       0.91      0.86      0.88      2000
weighted avg       0.93      0.93      0.93      2000



# Prediction

In [None]:
# Fix the NumPy seed so the same sample rows are drawn on every run.
np.random.seed(42)

# Ten distinct row positions from the test split.
random_indices = np.random.choice(X_test.shape[0], size=10, replace=False)

X_random_test = X_test[random_indices]

# y_test is a pandas Series, so select by position to stay aligned with X_test.
y_random_test = y_test.iloc[random_indices]


In [None]:
# Class scores for the sampled test rows.
y_random_pred = model.predict(X_random_test, verbose=0)

In [None]:
# Predicted class id = index of the highest score in each row.
y_random_pred_classes = y_random_pred.argmax(axis=-1)

In [None]:
# Side-by-side table of the sampled (cleaned) texts with their true and
# predicted class ids; .values drops the original index so rows line up.
results = pd.DataFrame({
    'text': test_df['text'].iloc[random_indices].values,
    'actual_label': y_random_test.values,
    'predicted_label': y_random_pred_classes
})

In [None]:
# Reverse lookup: class id -> emotion name.
int_to_label = {class_id: emotion for emotion, class_id in label_to_int.items()}

# Swap the numeric ids in both label columns for readable emotion names.
results['actual_label'] = results['actual_label'].map(int_to_label)
results['predicted_label'] = results['predicted_label'].map(int_to_label)

In [None]:
# Lift the column-width limit and widen the display so the text column is
# printed in full.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 500)

print(results)

                                                                                                                                                text actual_label predicted_label
0  feel dirty spending day mk show buddy decided would get two player starter us luckily us liked everblight liked circle maybe tad much worked well      sadness         sadness
1                                                                                                                could feel breath smell sweet scent          joy            love
2                                                                                                                                    want feel loved         love            love
3                                           felt need write sometimes anxious feelings impatient thoughts lists things still could done baby arrives        anger           anger
4                                                                                                             

# Prediction for New Data

In [None]:
# Persist the trained model to a .keras file in the working directory.
model.save('emotion_lstm_model.keras')

In [None]:
# Reload the saved file; the predictions below use this reloaded copy.
loaded_model = load_model('emotion_lstm_model.keras')

In [None]:
# Example sentence to classify.
new_text = "I feel very happy and joyful today!"

# Run it through the same steps as the training data: clean, convert to
# word indices with the fitted tokenizer, pad to max_len.
cleaned_text = clean_text(new_text)

tokenized_text = tokenizer.texts_to_sequences([cleaned_text])

padded_text = pad_sequences(tokenized_text, maxlen=max_len)


In [None]:
# One row of class scores for the single input text.
predicted_probabilities = loaded_model.predict(padded_text, verbose=0)

# Index of the highest score; [0] picks the only row.
predicted_class = predicted_probabilities.argmax(axis=-1)[0]

In [None]:
# Translate the class id back to its emotion name.
predicted_label = int_to_label[predicted_class]

In [None]:
# Show the original (uncleaned) text next to the predicted emotion.
print(f"Text: {new_text}")
print(f"Predicted Label: {predicted_label}")

Text: I feel very happy and joyful today!
Predicted Label: joy


In [None]:
def predict_emotion(text):
    """Print the emotion label the reloaded model assigns to ``text``.

    The text is cleaned with ``clean_text``, converted to word indices with
    the fitted ``tokenizer`` and padded to ``max_len``; the class with the
    highest score from ``loaded_model`` is then mapped to its name through
    ``int_to_label``.

    Args:
        text: Raw sentence to classify.

    Returns:
        None.  The original text and the predicted label are printed.
    """
    model_input = pad_sequences(
        tokenizer.texts_to_sequences([clean_text(text)]),
        maxlen=max_len,
    )
    class_scores = loaded_model.predict(model_input, verbose=0)
    best_class = class_scores.argmax(axis=-1)[0]
    print(text, "Emotion:", int_to_label[best_class])


In [None]:
# Same example sentence as above, this time through the helper.
predict_emotion(new_text)

I feel very happy and joyful today! Emotion: joy


In [None]:
# Hand-written sample sentences used to spot-check the model on unseen text.
messages = [
    "I had such a wonderful day with you today!",
    "I'm feeling really sad and lonely right now.",
    "I'm so angry that you didn't call me back!",
    "I'm terrified of what might happen next.",
    "I love you more than words can express.",
    "I'm so surprised by the gift you sent me!",
    "I feel so joyful when we are together.",
    "I'm feeling really anxious about tomorrow.",
    "I can't believe you remembered our anniversary!",
    "I feel so much love for you every day."
]

In [None]:
# Print the predicted emotion for each sample sentence in turn.
for message in messages:
    predict_emotion(message)

I had such a wonderful day with you today! Emotion: anger
I'm feeling really sad and lonely right now. Emotion: sadness
I'm so angry that you didn't call me back! Emotion: anger
I'm terrified of what might happen next. Emotion: fear
I love you more than words can express. Emotion: anger
I'm so surprised by the gift you sent me! Emotion: anger
I feel so joyful when we are together. Emotion: joy
I'm feeling really anxious about tomorrow. Emotion: fear
I can't believe you remembered our anniversary! Emotion: anger
I feel so much love for you every day. Emotion: joy
