In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, GlobalMaxPooling1D, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from bnlp import CleanText

# Load and clean data
df = pd.read_csv("Bangla Emotion Dataset.csv")

clean_text = CleanText(
   fix_unicode=True,
   unicode_norm=True,
   unicode_norm_form="NFKC",
   remove_url=True,
   remove_email=True,
   remove_emoji=True,
   remove_number=True,
   remove_digits=True,
   remove_punct=True,
   replace_with_url="",
   replace_with_email="",
   replace_with_number="",
   replace_with_digit="",
   replace_with_punct = ""
)

df['clean_description'] = df['Data'].apply(clean_text)

# Prepare target variables
emotions = ['Love', 'Joy', 'Surprise', 'Anger', 'Sadness', 'Fear']
y_emotions = df[emotions].values

# Prepare topic labels
mlb = MultiLabelBinarizer()
y_topic = mlb.fit_transform(df['Topic'].str.split(','))

# Tokenize the text
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, lower=True, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_description'])
X = tokenizer.texts_to_sequences(df['clean_description'])
X = pad_sequences(X, maxlen=max_len)

# Split the data
X_train, X_test, y_emotions_train, y_emotions_test, y_topic_train, y_topic_test = train_test_split(
    X, y_emotions, y_topic, test_size=0.2, random_state=42)

# Build the model
def build_model(vocab_size, max_len, num_emotions, num_topics):
    input_layer = Input(shape=(max_len,))
    embedding_layer = Embedding(vocab_size, 100, input_length=max_len)(input_layer)
    lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
    attention_layer = Attention()([lstm_layer, lstm_layer])
    pooling_layer = GlobalMaxPooling1D()(attention_layer)
    
    emotions_output = Dense(num_emotions, activation='sigmoid', name='emotions')(pooling_layer)
    topics_output = Dense(num_topics, activation='sigmoid', name='topics')(pooling_layer)
    
    model = Model(inputs=input_layer, outputs=[emotions_output, topics_output])
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss={'emotions': 'binary_crossentropy', 'topics': 'binary_crossentropy'},
                  metrics={'emotions': 'accuracy', 'topics': 'accuracy'})
    return model

# Train the model
model = build_model(max_words, max_len, len(emotions), y_topic.shape[1])
history = model.fit(X_train, {'emotions': y_emotions_train, 'topics': y_topic_train},
                    validation_data=(X_test, {'emotions': y_emotions_test, 'topics': y_topic_test}),
                    epochs=10, batch_size=32)

# Evaluate the model
eval_results = model.evaluate(X_test, {'emotions': y_emotions_test, 'topics': y_topic_test})
print("Emotions Loss:", eval_results[1])
print("Emotions Accuracy:", eval_results[3])
print("Topics Loss:", eval_results[2])
print("Topics Accuracy:", eval_results[4])

# Function to predict emotions and topic for new data
def predict_emotion_and_topic(text):
    cleaned_text = clean_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    
    emotions_pred, topics_pred = model.predict(padded_sequence)
    
    emotions_result = dict(zip(emotions, emotions_pred[0]))
    
    # Apply threshold to topic predictions
    topics_binary = (topics_pred > 0.5).astype(int)
    topics_result = mlb.inverse_transform(topics_binary)[0]
    
    return {
        'Emotions': emotions_result,
        'Topics': topics_result
    }

# Example usage
new_text = "চমক ভাই সত্যিই একটা চমক"
results = predict_emotion_and_topic(new_text)
print("\nPredictions for new text:")
print(results)

# Evaluation function
def evaluate_predictions(y_true, y_pred, task):
    accuracy = np.mean(np.all(y_true == y_pred, axis=1))
    hamming_loss = np.mean(np.sum(np.abs(y_true - y_pred), axis=1) / y_true.shape[1])
    print(f"{task} Classification:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Hamming Loss: {hamming_loss:.4f}")

# Evaluate the model
emotions_pred, topics_pred = model.predict(X_test)
emotions_pred_binary = (emotions_pred > 0.5).astype(int)
topics_pred_binary = (topics_pred > 0.5).astype(int)

evaluate_predictions(y_emotions_test, emotions_pred_binary, "Emotions")
evaluate_predictions(y_topic_test, topics_pred_binary, "Topics")

2024-09-26 12:11:51.611301: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2024-09-26 12:11:51.611322: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2024-09-26 12:11:51.611336: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (abir-ThinkPad): /proc/driver/nvidia/version does not exist
2024-09-26 12:11:51.611482: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10


2024-09-26 12:11:54.474034: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 2099 num_cores: 8 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 8388608 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




2024-09-26 12:12:28.170198: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 2099 num_cores: 8 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 8388608 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Emotions Loss: 0.44261908531188965
Emotions Accuracy: 0.5596997141838074
Topics Loss: 0.20831391215324402
Topics Accuracy: 0.45555824041366577


2024-09-26 12:24:07.523028: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 2099 num_cores: 8 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 8388608 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }



Predictions for new text:
{'Emotions': {'Love': 0.91592854, 'Joy': 0.55088073, 'Surprise': 0.004137116, 'Anger': 0.0028719532, 'Sadness': 0.00292659, 'Fear': 0.0014635733}, 'Topics': ()}
Emotions Classification:
  Accuracy: 0.4858
  Hamming Loss: 0.1425
Topics Classification:
  Accuracy: 0.3209
  Hamming Loss: 0.0704
