In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from keras.regularizers import l2

### Data

In [2]:
train_df = pd.read_csv('preprocessed_train.csv')
test_df = pd.read_csv('preprocessed_test.csv')
val_df = pd.read_csv('preprocessed_val.csv')

In [3]:
train_df.head()

Unnamed: 0,text,sentiment
0,fuck bayless isoing,anger
1,make feel threatened,fear
2,dirty southern wanker,anger
3,omg peyton good enough help u playoff dumbass ...,surprise
4,need board create bit space name we’ll good,joy


In [4]:
test_df.head()

Unnamed: 0,text,sentiment
0,i’m really sorry situation although love name ...,sadness
1,king fan here good luck guy interesting game w...,joy
2,i’m sorry hear friend it’s best likely didn’t ...,sadness
3,girlfriend weak well jump pathetic,sadness
4,name towed line dark side cross something like...,anger


In [5]:
val_df.head()

Unnamed: 0,text,sentiment
0,i ve never sad life,sadness
1,could easily taken real camera legitimate sour...,joy
2,wah mum people call bullshit can t ban go side...,anger
3,least name time gain confidence,joy
4,good want thrash liberal offspring world,anger


In [6]:
print(train_df['text'].apply(type).value_counts())

text
<class 'str'>      22829
<class 'float'>        2
Name: count, dtype: int64


In [7]:
train_df = train_df[train_df['text'].apply(type) != float]
train_df.reset_index(drop=True, inplace=True)
print(train_df['text'].apply(type).value_counts())

text
<class 'str'>    22829
Name: count, dtype: int64


In [8]:
test_df = test_df[test_df['text'].apply(type) != float]
test_df.reset_index(drop=True, inplace=True)
print(test_df['text'].apply(type).value_counts())

text
<class 'str'>    2859
Name: count, dtype: int64


In [9]:
val_df = val_df[val_df['text'].apply(type) != float]
val_df.reset_index(drop=True, inplace=True)
print(val_df['text'].apply(type).value_counts())

text
<class 'str'>    2852
Name: count, dtype: int64


### Data processing for LSTM model

In [10]:
X_train = train_df['text']
y_train = train_df['sentiment']

X_test = test_df['text']
y_test = test_df['sentiment']

X_val = val_df['text']
y_val = val_df['sentiment']

In [11]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_val = le.transform(y_val)

In [12]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
y_val = to_categorical(y_val)

In [13]:
tokenizer = Tokenizer(oov_token='UNK')
tokenizer.fit_on_texts(pd.concat([X_train, X_test], axis=0))

In [14]:
tokenizer.texts_to_sequences(X_train[0].split())

[[66], [8991], [8992]]

In [15]:
tokenizer.texts_to_matrix(X_train[0].split()).shape

(3, 16864)

In [16]:
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
sequences_val = tokenizer.texts_to_sequences(X_val)

In [17]:
max_len = max([len(text) for text in train_df['text']])
max_len

145

In [18]:
X_train = pad_sequences(sequences_train, maxlen=max_len, truncating='pre')
X_test = pad_sequences(sequences_test, maxlen=max_len, truncating='pre')
X_val = pad_sequences(sequences_val, maxlen=max_len, truncating='pre')

In [19]:
vocab_size = len(tokenizer.index_word) + 1

In [20]:
num_tokens = vocab_size
embedding_dim = 100
hits = 0
misses = 0
embedding_index = {}

In [21]:
base_path = os.getcwd() 
glove_path = os.path.join(base_path, 'glove.6B.100d.txt')

In [22]:
# Load the Glove embeddings
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Split the line into word and coefficients
        values = line.strip().split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [23]:
embedding_matrix = np.zeros((num_tokens, embedding_dim))

# Map the embeddings to your vocabulary
for word, i in tokenizer.word_index.items():
    if i >= num_tokens:
        continue  # Skip words beyond the vocabulary size
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        # Words found in embedding index
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in embedding index
        misses += 1

print(f"Converted {hits} words. Missed {misses} words.")

Converted 14543 words. Missed 2320 words.


In [24]:
num_classes = y_train.shape[1]
adam = Adam(learning_rate=0.005)

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=X_train.shape[1], weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(256, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(num_classes, activation='softmax'))

model.build(
    input_shape=(None, X_train.shape[1])
)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()



In [25]:
# Ensure both X_train and y_train have the same number of samples
X_train = X_train[:len(y_train)]
y_train = y_train[:len(X_train)]

# Check again to confirm
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (22829, 145)
y_train shape: (22829, 6)


In [26]:
# Callbacks to prevent overfitting
callbacks = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

In [None]:
# Train the model
history = model.fit(
    X_train, 
    y_train,
    validation_data=(X_val, y_val),
    verbose=True,
    batch_size=256,
    epochs=30,
    callbacks=[callbacks]
)

Epoch 1/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 2s/step - accuracy: 0.4024 - loss: 1.4654 - val_accuracy: 0.5698 - val_loss: 1.1450
Epoch 2/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 2s/step - accuracy: 0.5639 - loss: 1.1424 - val_accuracy: 0.5929 - val_loss: 1.0664
Epoch 3/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 2s/step - accuracy: 0.5988 - loss: 1.0600 - val_accuracy: 0.6048 - val_loss: 1.0466
Epoch 4/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 2s/step - accuracy: 0.6153 - loss: 1.0038 - val_accuracy: 0.5799 - val_loss: 1.0877
Epoch 5/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 3s/step - accuracy: 0.6077 - loss: 1.0256 - val_accuracy: 0.6346 - val_loss: 0.9911
Epoch 6/30
[1m85/90[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m13s[0m 3s/step - accuracy: 0.6431 - loss: 0.9460

In [None]:
model.evaluate(X_val, y_val, verbose=1)

In [None]:
predicted = model.predict(X_test)
y_pred = predicted.argmax(axis=-1)

print(classification_report(le.transform(test_df['sentiment']), y_pred))

In [None]:
# Plot training and validation loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot training and validation accuracy
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Second model
model2 = Sequential()
model2.add(Embedding(vocab_size, 100, input_length=X_train.shape[1], weights=[embedding_matrix], trainable=False))
model2.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)))
model2.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))
model2.add(Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.01)))

# Compile the model
adam = Adam(learning_rate=0.001)
model2.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

model2.summary()

In [None]:
# Train the model
history = model2.fit(
    X_train, 
    y_train,
    validation_data=(X_val, y_val),
    batch_size=256,
    epochs=30,
    callbacks=[callbacks],
    verbose=True
)

In [None]:
model2.evaluate(X_val, y_val, verbose=1)

In [None]:
predicted = model2.predict(X_test)
y_pred = predicted.argmax(axis=-1)

print(classification_report(le.transform(test_df['sentiment']), y_pred))

In [None]:
# Plot training and validation loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot training and validation accuracy
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
texts = [
    "OMG! I just got a job offer today. I am so happy! Today is the best day", #joy
    "I never make her separate from me because i don t ever want her to feel like i m ashamed with her", #sadness
    "I cant walk into a shop anywhere where i do not feel uncomfortable", #fear
    "I am feeling outraged it shows everywhere. I can not belive he did that to me. This makes me so angry and frustrated.", #anger
]

for text in texts:
    print("Text: " + text)
    text = tokenizer.texts_to_sequences([text])
    text = pad_sequences(text, maxlen=max_len, truncating='pre')
    emotion = le.inverse_transform(np.argmax(model2.predict(text), axis=-1))[0]
    
    print(emotion)