# Import and Setup

In [1]:
import numpy as np 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore") 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('dataset/tweet_emotions.csv')
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


# EDA

## Removing ID, duplicate and Null Values

In [3]:
#If there is any null value throughout the row, remove it.
data = data.dropna()
data = data.reset_index(drop=True)
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


It was noticed that there are no null values present, therefore no values were dropped

In [4]:
# Remove the id column
data = data.drop(['tweet_id'], axis=1)
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [5]:
# Remove duplciate rows if they have the same "content" value. Also print the number of removed rows.
print("Number of duplicate rows before removing: ", data.duplicated().sum())
data = data.drop_duplicates(subset='content')
data = data.reset_index(drop=True)
print("Number of duplicate rows after removing: ", data.duplicated().sum())

Number of duplicate rows before removing:  91
Number of duplicate rows after removing:  0


## Cleaning Text

The text cleaning function is designed to preprocess and clean the text data in the dataset. It performs the following operations:

1. **Remove URLs**: Eliminates any URLs from the text.
2. **Remove Non-Word Characters**: Replaces non-word characters with spaces.
3. **Remove @Mentions**: Removes mentions (e.g., `@username`).
4. **Remove Hashtags**: Removes the `#` symbol from hashtags.
5. **Remove Non-ASCII Characters**: Removes any non-ASCII characters.
6. **Remove Digits**: Eliminates digits from the text.
7. **Fix Multiple Spaces**: Replaces multiple spaces with a single space.
8. **Trim Spaces**: Removes leading and trailing spaces.

The function is applied to the `content` column of the dataset to standardize the text format.

### Text Cleaning Function Overview


In [6]:
import re

# Function to clean text
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\W', ' ', text)
    #remove any @mentions
    text = re.sub(r'@\w+', '', text)
    #remove # from #hashtags
    text = re.sub(r'#', '', text)
    #remove any non-ascii characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    #remove any digits
    text = re.sub(r'\d', '', text)
    
    
    #Fix double or multiple spacing cause from removal
    text = re.sub(r'\s+', ' ', text)
    #Remove any leading or trailing spaces
    text = re.sub(r'^\s+|\s+?$', '', text)
    return text

# Apply the function to the "content" column
data['content'] = data['content'].apply(clean_text)

# Display the cleaned data
print(data.head())


    sentiment                                            content
0       empty  tiffanylue i know i was listenin to bad habit ...
1     sadness  Layin n bed with a headache ughhhh waitin on y...
2     sadness                     Funeral ceremony gloomy friday
3  enthusiasm                wants to hang out with friends SOON
4     neutral  dannycastillo We want to trade with someone wh...


Saving the cleaned data to a new file

In [7]:
data.to_csv('dataset/cleaned_tweet_emotions.csv', index=False)

In [8]:
df = pd.read_csv('dataset/cleaned_tweet_emotions.csv')
df.head()

Unnamed: 0,sentiment,content
0,empty,tiffanylue i know i was listenin to bad habit ...
1,sadness,Layin n bed with a headache ughhhh waitin on y...
2,sadness,Funeral ceremony gloomy friday
3,enthusiasm,wants to hang out with friends SOON
4,neutral,dannycastillo We want to trade with someone wh...


In [9]:
print(df.shape)
df = df.dropna()
df.isnull().sum()

(39827, 2)


sentiment    0
content      0
dtype: int64

In [10]:
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.sequence import pad_sequences

# Load Pre-trained GloVe Embeddings (200d)
def load_glove_embeddings(filepath):
    glove_dict = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_dict[word] = vector
    return glove_dict

# Convert tweets to TF-IDF weighted GloVe embeddings
def get_tweet_embedding(tweet, glove_dict, tfidf, feature_names):
    words = tweet.split()
    tweet_vector = np.zeros(200)  # GloVe 200d
    word_count = 0

    for word in words:
        if word in glove_dict and word in feature_names:
            weight = tfidf.get(word, 1)  # Default to 1 if word not in TF-IDF dict
            tweet_vector += weight * glove_dict[word]
            word_count += weight

    return tweet_vector / word_count if word_count != 0 else tweet_vector

# Load GloVe Embeddings
glove_path = "glove.twitter.27B.200d.txt"
glove_embeddings = load_glove_embeddings(glove_path)

# TF-IDF Vectorizer (Fitted on Your Dataset)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer.fit(df['content'])
feature_names = set(tfidf_vectorizer.get_feature_names_out())

# Get TF-IDF Scores
idf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

# Convert Tweets to GloVe Embeddings
df['embedding'] = df['content'].apply(lambda x: get_tweet_embedding(x, glove_embeddings, idf_scores, feature_names))

# Check the output
print(df.head())

    sentiment                                            content  \
0       empty  tiffanylue i know i was listenin to bad habit ...   
1     sadness  Layin n bed with a headache ughhhh waitin on y...   
2     sadness                     Funeral ceremony gloomy friday   
3  enthusiasm                wants to hang out with friends SOON   
4     neutral  dannycastillo We want to trade with someone wh...   

                                           embedding  
0  [-0.009687531020362074, -0.11519767683321691, ...  
1  [0.02141141140250999, -0.05418356123417788, -0...  
2  [-0.042901569450643766, -0.05677819936404877, ...  
3  [0.03292089124552941, 0.22727344006666284, 0.1...  
4  [0.08450802176830757, 0.374923210176829, 0.003...  


### Label encoding for the sentiment

In [11]:
#print all unique values in the sentiment column
print(df['sentiment'].unique())

['empty' 'sadness' 'enthusiasm' 'neutral' 'worry' 'surprise' 'love' 'fun'
 'hate' 'happiness' 'boredom' 'relief' 'anger']


In [12]:
#Perform label encoding in a new colomn. [empty, sadness, worry, hate, boredom, anger] is the first label which is negative. [neutral] label is neutral. [enthusiasm, love, fun, happiness, relief] is positive label. Create a new colomn for encoding.
df['sentiment_encoded'] = df['sentiment'].apply(lambda x: 0 if x in ['empty', 'sadness', 'worry', 'hate', 'boredom', 'anger'] else 1 if x in ['neutral'] else 2)
print(df.head())

    sentiment                                            content  \
0       empty  tiffanylue i know i was listenin to bad habit ...   
1     sadness  Layin n bed with a headache ughhhh waitin on y...   
2     sadness                     Funeral ceremony gloomy friday   
3  enthusiasm                wants to hang out with friends SOON   
4     neutral  dannycastillo We want to trade with someone wh...   

                                           embedding  sentiment_encoded  
0  [-0.009687531020362074, -0.11519767683321691, ...                  0  
1  [0.02141141140250999, -0.05418356123417788, -0...                  0  
2  [-0.042901569450643766, -0.05677819936404877, ...                  0  
3  [0.03292089124552941, 0.22727344006666284, 0.1...                  2  
4  [0.08450802176830757, 0.374923210176829, 0.003...                  1  


In [13]:
print(df.shape)
print(df.dtypes)

(39826, 4)
sentiment            object
content              object
embedding            object
sentiment_encoded     int64
dtype: object


In [14]:
import ast
import numpy as np

def parse_embedding(embedding):
    # 1) If it's already a NumPy array, just ensure dtype float32.
    if isinstance(embedding, np.ndarray):
        return embedding.astype(np.float32)
    
    # 2) If it's a string that looks like "array([-0.07, 0.08, ...])", remove "array(" and trailing ")".
    if isinstance(embedding, str):
        if embedding.startswith("array(") and embedding.endswith(")"):
            # remove the leading array( and trailing )
            embedding = embedding[len("array("):-1]  # everything inside the parentheses

        # now it should look like "[-0.07, 0.08, ...]"
        python_list = ast.literal_eval(embedding)  # parse as Python list
        return np.array(python_list, dtype=np.float32)
    
    # 3) Otherwise, try to convert it to float32 array anyway (covers lists or other formats).
    return np.array(embedding, dtype=np.float32)

# Now apply
df['embedding'] = df['embedding'].apply(parse_embedding)


In [15]:
print(df['embedding'].iloc[0])
print(type(df['embedding'].iloc[0]))  # <class 'numpy.ndarray'>
print(df['embedding'].iloc[0].dtype)  # float32 (or float64, depending on your code)


[-9.68753081e-03 -1.15197673e-01  1.60114467e-03  6.72760904e-02
 -7.20403269e-02  1.47297859e-01  5.40728629e-01 -1.48637025e-02
 -3.08873057e-01 -3.68432611e-01  2.64210701e-02 -8.24295804e-02
 -6.73609018e-01 -1.20922513e-01  1.40795037e-02 -2.12514296e-01
  1.42123729e-01  1.22511707e-01 -1.10287197e-01 -6.84935153e-02
  8.92048776e-02  3.88010554e-02  3.59237716e-02 -2.04882324e-01
 -1.55432457e-02  1.09560859e+00 -9.63690355e-02  5.08775190e-02
 -1.70597211e-02 -1.00725733e-01 -8.15619975e-02 -6.00672141e-02
 -3.71983171e-01 -2.07319250e-03 -2.55327523e-01  1.18449517e-01
  8.96067172e-03 -1.71255633e-01  1.96225077e-01  2.61133909e-02
  5.39022923e-01  2.40764380e-01  1.49123102e-01 -6.02549203e-02
 -1.87833413e-01 -1.30441096e-02  4.63306576e-01 -1.86550915e-02
 -5.30624390e-02  9.78403836e-02  7.81050473e-02 -3.36621135e-01
 -8.14983472e-02 -1.78257227e-01 -3.73955537e-03  3.29251401e-02
 -6.43697083e-02 -1.70929253e-01  1.30399451e-01 -1.61359191e-01
  1.14719747e-02  2.67389

In [16]:
#Store only the embedding colomn and the sentiment_encoded colomn in a new dataframe.
df = df[['embedding', 'sentiment_encoded']]
print(df.head())

                                           embedding  sentiment_encoded
0  [-0.009687531, -0.11519767, 0.0016011447, 0.06...                  0
1  [0.021411411, -0.05418356, -0.049089134, -0.20...                  0
2  [-0.042901568, -0.0567782, 0.06105573, 0.10129...                  0
3  [0.03292089, 0.22727343, 0.10493607, -0.464905...                  2
4  [0.084508024, 0.3749232, 0.0037533038, 0.09657...                  1


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the dataset into train (70%) and temp (30%)
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)

# Split the temp dataset into validation (20% of original) and test (10% of original)
val_df, test_df = train_test_split(temp_df, test_size=1/3, random_state=42)

# Print the sizes of the splits to verify
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Train set size: 27878
Validation set size: 7965
Test set size: 3983


In [18]:
print(df['embedding'].iloc[0])
print(type(df['embedding'].iloc[0]))  # <class 'numpy.ndarray'>
print(df['embedding'].iloc[0].dtype)  # float32 (or float64, depending on your code)


[-9.68753081e-03 -1.15197673e-01  1.60114467e-03  6.72760904e-02
 -7.20403269e-02  1.47297859e-01  5.40728629e-01 -1.48637025e-02
 -3.08873057e-01 -3.68432611e-01  2.64210701e-02 -8.24295804e-02
 -6.73609018e-01 -1.20922513e-01  1.40795037e-02 -2.12514296e-01
  1.42123729e-01  1.22511707e-01 -1.10287197e-01 -6.84935153e-02
  8.92048776e-02  3.88010554e-02  3.59237716e-02 -2.04882324e-01
 -1.55432457e-02  1.09560859e+00 -9.63690355e-02  5.08775190e-02
 -1.70597211e-02 -1.00725733e-01 -8.15619975e-02 -6.00672141e-02
 -3.71983171e-01 -2.07319250e-03 -2.55327523e-01  1.18449517e-01
  8.96067172e-03 -1.71255633e-01  1.96225077e-01  2.61133909e-02
  5.39022923e-01  2.40764380e-01  1.49123102e-01 -6.02549203e-02
 -1.87833413e-01 -1.30441096e-02  4.63306576e-01 -1.86550915e-02
 -5.30624390e-02  9.78403836e-02  7.81050473e-02 -3.36621135e-01
 -8.14983472e-02 -1.78257227e-01 -3.73955537e-03  3.29251401e-02
 -6.43697083e-02 -1.70929253e-01  1.30399451e-01 -1.61359191e-01
  1.14719747e-02  2.67389

In [20]:
import numpy as np
import pandas as pd

def prepare_lstm_data(df, label_col='sentiment', embed_col='embedding'):
    """
    df:       DataFrame with at least 2 columns: [label_col, embed_col]
    label_col: name of the sentiment/label column
    embed_col: name of the embedding column (a numerical vector or numeric data)
    """
    # 1) Extract labels
    y = df[label_col].values  # shape -> (num_samples,)

    # 2) Extract numeric features (assuming 'embedding' column contains numeric vectors)
    #    If 'embedding' is already stored as a vector (list/np.array) per row, convert each row to np.array:
    X = np.array(df[embed_col].tolist())  # shape -> (num_samples, embedding_dim)

    # 3) Reshape to 3D for LSTM: (samples, timesteps=1, features=embedding_dim)
    #    If each row is just one “step” with that embedding:
    X = X.reshape((X.shape[0], 1, X.shape[1]))

    return X, y

# -------------------------------------------------------
# Example usage with train_df and val_df
# -------------------------------------------------------
import numpy as np
import pandas as pd

# Keras / TensorFlow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Sklearn for additional metrics
from sklearn.metrics import classification_report, confusion_matrix
X_train, y_train = prepare_lstm_data(train_df,
                                     label_col='sentiment_encoded',
                                     embed_col='embedding')

X_val, y_val = prepare_lstm_data(val_df,
                                 label_col='sentiment_encoded',
                                 embed_col='embedding')

print("X_train shape:", X_train.shape)  # (28000, 1, embedding_dim) for example
print("y_train shape:", y_train.shape)  # (28000,)

print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)


X_train shape: (27878, 1, 200)
y_train shape: (27878,)
X_val shape: (7965, 1, 200)
y_val shape: (7965,)


In [None]:
def build_lstm_model(input_shape):
    model = Sequential()
    
    # LSTM layer
    model.add(LSTM(units=64, return_sequences=False, input_shape=input_shape))
    # Optional Dropout
    model.add(Dropout(0.2))
    
    # Dense output layer (binary classification -> 1 output neuron with sigmoid)
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile the model
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy']
    )
    return model


In [22]:
# Note: X_train.shape[1:] is (timesteps, features)
model = build_lstm_model(X_train.shape[1:])
model.summary()


In [23]:
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=1
)


Epoch 1/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2410 - loss: -0.6799 - val_accuracy: 0.2939 - val_loss: -4.6501
Epoch 2/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.2864 - loss: -5.8441 - val_accuracy: 0.3140 - val_loss: -9.4798
Epoch 3/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.2976 - loss: -10.2061 - val_accuracy: 0.3254 - val_loss: -14.1459
Epoch 4/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3031 - loss: -14.6552 - val_accuracy: 0.3041 - val_loss: -18.7950
Epoch 5/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3015 - loss: -19.1884 - val_accuracy: 0.3053 - val_loss: -23.3421
Epoch 6/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.3011 - loss: -25.2534 - val_accuracy: 0.3358 - val_loss: -27.6855
Epoch 7/