
# **Emotion Detection from Text**


# **Inference with a pretrained classifier**

In [None]:
# Install/upgrade KerasNLP. The %pip magic installs into the environment of the
# running kernel, whereas `!pip` runs in a subshell that may point elsewhere.
%pip install --upgrade keras-nlp

In [None]:
# Install the Kaggle CLI used below to download the dataset. The %pip magic
# installs into the environment of the running kernel (unlike `!pip`).
%pip install kaggle

In [None]:
# Colab-only step: upload a file from the local machine into the runtime's
# working directory. The following cell expects the upload to be named
# kaggle.json.
# NOTE(review): kaggle.json presumably holds Kaggle API credentials; avoid
# committing it or sharing outputs that reveal its contents.
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
# Copy the uploaded token to ~/.kaggle/kaggle.json (presumably where the
# Kaggle CLI looks for it); -p makes mkdir a no-op if the folder exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive (emotion-detection-from-text.zip) with the
# Kaggle CLI; it is saved in the current working directory.
!kaggle datasets download -d pashupatigupta/emotion-detection-from-text


Downloading emotion-detection-from-text.zip to /content
  0% 0.00/1.56M [00:00<?, ?B/s]
100% 1.56M/1.56M [00:00<00:00, 113MB/s]


In [None]:
# Extract tweet_emotions.csv. -o overwrites existing files without asking, so
# re-running the cell does not stall on unzip's interactive "replace?" prompt.
!unzip -o emotion-detection-from-text.zip


Archive:  emotion-detection-from-text.zip
  inflating: tweet_emotions.csv      


In [None]:
import pandas as pd

# Read the extracted CSV into `df`, the raw tweet table used by the rest of
# the notebook.
df = pd.read_csv(filepath_or_buffer='tweet_emotions.csv')

# Preview the top five rows (head's default) to see the available columns.
print(df.head(n=5))


     tweet_id   sentiment                                            content
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...
2  1956967696     sadness                Funeral ceremony...gloomy friday...
3  1956967789  enthusiasm               wants to hang out with friends SOON!
4  1956968416     neutral  @dannycastillo We want to trade with someone w...


In [None]:
# Size of the table as (rows, columns).
print("Dataset shape:", df.shape)

# Column dtypes, non-null counts and memory usage; info() prints on its own.
df.info()

# Summary statistics as computed by describe().
print(df.describe())

# Number of missing values in each column.
print(df.isna().sum())

# How many tweets carry each sentiment label.
print(df.sentiment.value_counts())



Dataset shape: (40000, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB
           tweet_id
count  4.000000e+04
mean   1.845184e+09
std    1.188579e+08
min    1.693956e+09
25%    1.751431e+09
50%    1.855443e+09
75%    1.962781e+09
max    1.966441e+09
tweet_id     0
sentiment    0
content      0
dtype: int64
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
import keras_nlp
from sklearn.model_selection import train_test_split

# **Process NLP Dataset**

In [None]:
# Features are the raw tweet text, targets are the sentiment strings.
# Note: X, y and the four split variables are re-assigned by the later
# two-class (happiness vs. sadness) cell of this notebook.
X, y = df['content'], df['sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.20,
)

# Report how many rows ended up in each partition.
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 32000
Testing set size: 8000


**Restrict the task to two emotions (`happiness` vs. `sadness`), then convert the labels to the categorical format expected by Keras.**

In [None]:
# Keep only the rows labelled 'happiness' or 'sadness' for the binary task.
filtered_df = df.loc[df['sentiment'].isin(['happiness', 'sadness'])]

# Confirm the size of the subset and how many rows each class has.
print("Filtered dataset shape:", filtered_df.shape)
print(filtered_df.sentiment.value_counts())


Filtered dataset shape: (10374, 3)
happiness    5209
sadness      5165
Name: sentiment, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Inputs are the tweet texts of the two-class subset; targets are their
# sentiment strings.
X, y = filtered_df['content'], filtered_df['sentiment']

# Fit the encoder on the two labels, then map every label to its integer id.
# LabelEncoder numbers classes in sorted order: 'happiness' -> 0, 'sadness' -> 1.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# 80/20 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.2,
)


In [None]:
import keras_nlp
import tensorflow as tf

# Two target classes: happiness and sadness.
num_classes = 2

# KerasNLP preset to start from; swap in another preset name if needed.
model_name = "bert_tiny_en_uncased_sst2"

# Build the classifier from the preset. With activation='sigmoid' each of the
# two outputs is squashed into [0, 1] independently of the other.
classifier = keras_nlp.models.BertClassifier.from_preset(
    model_name,
    activation='sigmoid',
    num_classes=num_classes,
)


Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_tiny_en_uncased_sst2/3/download/config.json...
100%|██████████| 2.14k/2.14k [00:00<00:00, 4.06MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_tiny_en_uncased_sst2/3/download/assets/tokenizer/vocabulary.txt...
100%|██████████| 226k/226k [00:00<00:00, 1.13MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_tiny_en_uncased_sst2/3/download/model.weights.h5...
100%|██████████| 16.8M/16.8M [00:00<00:00, 23.1MB/s]


In [None]:
# One-hot encode the integer class ids into two columns, one per class, for
# both the training and the test labels.
y_train_categorical, y_test_categorical = (
    tf.keras.utils.to_categorical(labels, num_classes=2)
    for labels in (y_train, y_test)
)


In [None]:
import tensorflow as tf

# Assuming 'keras' is correctly imported from 'tensorflow' as follows:
from tensorflow import keras

# Freeze the pretrained BERT backbone so only the classification head is
# trained. This must happen BEFORE compile(): Keras recommends (re)compiling
# after any change to `trainable` so the change is reliably taken into account.
classifier.backbone.trainable = False

# Compile the classifier.
classifier.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(),
    jit_compile=True,  # This accelerates training if your environment supports it
    metrics=["accuracy"],
)

# Train the head on the raw tweets, validating on the held-out split.
classifier.fit(
    x=X_train,
    y=y_train_categorical,  # Use the categorical (one-hot) labels
    validation_data=(X_test, y_test_categorical),
    batch_size=32,  # Adjust the batch size as necessary
    epochs=3
)



Epoch 1/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 1s/step - accuracy: 0.5635 - loss: 0.7130 - val_accuracy: 0.7190 - val_loss: 0.5499
Epoch 2/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m 1s/step - accuracy: 0.7063 - loss: 0.5689 - val_accuracy: 0.7272 - val_loss: 0.5418
Epoch 3/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 1s/step - accuracy: 0.7207 - loss: 0.5560 - val_accuracy: 0.7301 - val_loss: 0.5366


<keras.src.callbacks.history.History at 0x7c600c6f2e00>

In [None]:
# Score the classifier on the held-out tweets against the one-hot labels.
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
eval_result = classifier.evaluate(
    x=X_test,
    y=y_test_categorical,
    batch_size=32,
)
print(f"Test loss: {eval_result[0]} - Test accuracy: {eval_result[1]}")


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 577ms/step - accuracy: 0.7336 - loss: 0.5291
Test loss: 0.5366498827934265 - Test accuracy: 0.7301204800605774


In [None]:
# Example tweets with clear emotional content
example_tweets = [
    "Just saw the most uplifting concert ever. Left feeling overjoyed and full of energy!", # Likely to convey happiness
    "Feeling down after hearing today's news. It's hard to stay positive in times like these." # Likely to convey sadness
]

# Predict emotions for these example tweets
predictions = classifier.predict(example_tweets)

# The classifier was created with activation='sigmoid', so its outputs are
# already per-class scores in [0, 1]. Applying softmax on top of them would
# distort the reported confidence, so the outputs are used as they are.
probabilities = np.asarray(predictions)

# Take the class names from the fitted LabelEncoder so that index i matches
# encoded label i. The encoder sorts its classes ('happiness' -> 0,
# 'sadness' -> 1); a hand-written list can silently swap the two.
class_names = list(label_encoder.classes_)

# Report the highest-scoring class and its score for every tweet
for tweet, tweet_probabilities in zip(example_tweets, probabilities):
    predicted_index = int(np.argmax(tweet_probabilities))
    predicted_class = class_names[predicted_index]
    confidence = float(tweet_probabilities[predicted_index])
    print(f"Tweet: '{tweet}'\nPredicted emotion: {predicted_class} with a {100 * confidence:.2f}% confidence.\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Tweet: 'Just saw the most uplifting concert ever. Left feeling overjoyed and full of energy!'
Predicted emotion: sadness with a 60.79% confidence.

Tweet: 'Feeling down after hearing today's news. It's hard to stay positive in times like these.'
Predicted emotion: sadness with a 50.37% confidence.



# **Fine-tuning BERT for Emotion Classification**

In [None]:
# Preprocessor matching the chosen preset; inputs are padded/truncated to
# 128 tokens.
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
    preset=model_name,
    sequence_length=128,
)


Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_tiny_en_uncased_sst2/3/download/tokenizer.json...
100%|██████████| 547/547 [00:00<00:00, 435kB/s]


In [None]:
import tensorflow as tf

def tf_clean_text(text):
    """Lower-case string tensor(s) with ``tf.strings.lower``.

    Args:
        text: Value accepted by ``tf.strings.lower``.

    Returns:
        The result of ``tf.strings.lower(text)``.
    """
    lowered = tf.strings.lower(text)
    return lowered


In [None]:
import re

def custom_preprocess(text):
    """Lower-case ``text`` and strip its punctuation.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed; whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    return re.sub(r'[^\w\s]', '', text.lower())


In [None]:
# Clean every tweet of both splits (lower-casing + punctuation removal),
# keeping the results as plain Python lists of strings.
X_train_preprocessed = list(map(custom_preprocess, X_train))
X_test_preprocessed = list(map(custom_preprocess, X_test))


In [None]:
# Ensure classifier is compiled as previously
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit the classifier on the preprocessed training data
classifier.fit(
    x=np.array(X_train_preprocessed),  # Convert to NumPy array for TensorFlow compatibility
    y=y_train_categorical,
    validation_data=(np.array(X_test_preprocessed), y_test_categorical),
    batch_size=32,
    epochs=3
)


Epoch 1/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m915s[0m 3s/step - accuracy: 0.6545 - loss: 0.6328 - val_accuracy: 0.7614 - val_loss: 0.4952
Epoch 2/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m926s[0m 4s/step - accuracy: 0.7995 - loss: 0.4663 - val_accuracy: 0.7735 - val_loss: 0.5047
Epoch 3/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m920s[0m 4s/step - accuracy: 0.8363 - loss: 0.3904 - val_accuracy: 0.7639 - val_loss: 0.5211


<keras.src.callbacks.history.History at 0x7c600180d270>

# **Fine-tuning a pretrained backbone**

In [None]:
import tensorflow as tf
import keras_nlp
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Use the two-class subset (`filtered_df`) built earlier. The full `df` still
# holds all 13 sentiment labels, which would one-hot encode to 13 columns and
# not match the 2-class classifier below.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(filtered_df['sentiment'])
# Pin the width of the one-hot labels to the number of classifier outputs.
y_categorical = to_categorical(encoded_labels, num_classes=2)

# 80/20 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    filtered_df['content'], y_categorical, test_size=0.2, random_state=42
)

# Load the BERT classifier on top of the plain pretrained backbone.
# Without an `activation` the classifier returns raw logits; "softmax" makes it
# return class probabilities, which is what the string loss
# 'binary_crossentropy' used in the compile cell expects.
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_tiny_en_uncased",
    num_classes=2,
    activation="softmax",
)


Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_tiny_en_uncased/2/download/config.json...
100%|██████████| 507/507 [00:00<00:00, 292kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_tiny_en_uncased/2/download/model.weights.h5...
100%|██████████| 16.8M/16.8M [00:00<00:00, 26.5MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_tiny_en_uncased/2/download/tokenizer.json...
100%|██████████| 547/547 [00:00<00:00, 312kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bert/keras/bert_tiny_en_uncased/2/download/assets/tokenizer/vocabulary.txt...
100%|██████████| 226k/226k [00:00<00:00, 1.21MB/s]


In [None]:
# Compile for fine-tuning with a small Adam learning rate (5e-5).
# NOTE(review): the string loss 'binary_crossentropy' assumes the model outputs
# probabilities rather than logits; confirm the classifier's output activation.
classifier.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
)


In [None]:
# Train for three epochs in batches of 32, validating on the held-out split
# after each epoch.
classifier.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=3,
    validation_data=(X_test, y_test),
)


Epoch 1/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m882s[0m 3s/step - accuracy: 0.5650 - loss: 0.7282 - val_accuracy: 0.7152 - val_loss: 0.5749
Epoch 2/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m889s[0m 3s/step - accuracy: 0.7271 - loss: 0.5656 - val_accuracy: 0.7981 - val_loss: 0.4535
Epoch 3/3
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m888s[0m 3s/step - accuracy: 0.8000 - loss: 0.4605 - val_accuracy: 0.8193 - val_loss: 0.4395


<keras.src.callbacks.history.History at 0x7c6008f35f90>