# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from bnlp import CleanText

# Read the raw dataset. The steps below use its 'Data' and 'Topic' columns
# plus one column per entry of `emotions`.
df = pd.read_csv("Bangla Emotion Dataset.csv")

# Text normaliser. Each remove_* switch is listed next to the (empty)
# replacement string it substitutes for the matched spans.
clean_text = CleanText(
    fix_unicode=True,
    unicode_norm=True,
    unicode_norm_form="NFKC",
    remove_url=True,
    replace_with_url="",
    remove_email=True,
    replace_with_email="",
    remove_number=True,
    replace_with_number="",
    remove_digits=True,
    replace_with_digit="",
    remove_punct=True,
    replace_with_punct="",
    remove_emoji=True,
)
df['clean_description'] = df['Data'].apply(clean_text)

# Emotion targets: the emotion columns taken together as a 2-D array.
emotions = ['Love', 'Joy', 'Surprise', 'Anger', 'Sadness', 'Fear']
y_emotions = df[emotions].to_numpy()

# Topic targets: comma-separated topic strings -> binary indicator matrix.
# `mlb` is kept so predictions can be mapped back to topic names later.
mlb = MultiLabelBinarizer()
y_topic = mlb.fit_transform(df['Topic'].str.split(','))

# Pretrained BanglaBERT tokenizer and encoder (from_pt=True: the TensorFlow
# model is loaded from the PyTorch checkpoint).
model_name = "csebuetnlp/banglabert_small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = TFAutoModel.from_pretrained(model_name, from_pt=True)

# Fixed sequence length shared by tokenisation and the model inputs
max_len = 128


def tokenize_function(examples):
    """Encode `examples` with the module-level tokenizer as TensorFlow tensors.

    Every sequence is padded or truncated to exactly `max_len` tokens.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "tf",
    }
    return tokenizer(examples, **encoding_options)

# Hold out 20% of the rows. Texts and both label matrices go through the same
# split call so they stay row-aligned; the seed is fixed for repeatability.
(
    X_train,
    X_test,
    y_emotions_train,
    y_emotions_test,
    y_topic_train,
    y_topic_test,
) = train_test_split(
    df['clean_description'].tolist(),
    y_emotions,
    y_topic,
    test_size=0.2,
    random_state=42,
)

# Encode each split separately (train first, then test)
X_train_tokenized, X_test_tokenized = (
    tokenize_function(X_train),
    tokenize_function(X_test),
)

# Pair the encoded features (as a plain dict) with a dict of targets keyed
# by 'emotions' / 'topics'.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (
        dict(X_train_tokenized),
        {'emotions': y_emotions_train, 'topics': y_topic_train},
    )
)

test_dataset = tf.data.Dataset.from_tensor_slices(
    (
        dict(X_test_tokenized),
        {'emotions': y_emotions_test, 'topics': y_topic_test},
    )
)

# Build the model
def build_model(num_emotions, num_topics):
    """Build and compile a two-headed classifier on top of `bert_model`.

    Args:
        num_emotions: Number of units in the 'emotions' output layer.
        num_topics: Number of units in the 'topics' output layer.

    Returns:
        A compiled Keras ``Model`` with inputs ``input_ids`` and
        ``attention_mask`` and outputs ``[emotions, topics]``, both sigmoid.
    """
    # Symbolic inputs of length `max_len`
    ids_in = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    mask_in = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # First element of the encoder output, then position 0 of every sequence
    # (the representation both heads are built on).
    sequence_states = bert_model(ids_in, attention_mask=mask_in)[0]
    first_token_state = sequence_states[:, 0, :]

    # Two independent sigmoid heads sharing the same encoder features
    heads = [
        Dense(num_emotions, activation='sigmoid', name='emotions')(first_token_state),
        Dense(num_topics, activation='sigmoid', name='topics')(first_token_state),
    ]

    classifier = Model(inputs=[ids_in, mask_in], outputs=heads)
    classifier.compile(
        optimizer=Adam(learning_rate=2e-5),
        loss={'emotions': 'binary_crossentropy', 'topics': 'binary_crossentropy'},
        metrics={'emotions': 'accuracy', 'topics': 'accuracy'},
    )
    return classifier

# Train the model
# Head sizes: one unit per emotion column and one per column of the topic
# indicator matrix.
model = build_model(len(emotions), y_topic.shape[1])
# NOTE(review): the held-out test split is also passed as validation data, so
# the per-epoch validation metrics and the final evaluation below are computed
# on the same rows.
history = model.fit(
    train_dataset.batch(16),
    validation_data=test_dataset.batch(16),
    epochs=3
)

# Evaluate the model
eval_results = model.evaluate(test_dataset.batch(16))
# The indices below assume evaluate() returns
# [total loss, emotions loss, topics loss, emotions accuracy, topics accuracy]
# -- TODO confirm against model.metrics_names for the installed Keras version.
print("Emotions Loss:", eval_results[1])
print("Emotions Accuracy:", eval_results[3])
print("Topics Loss:", eval_results[2])
print("Topics Accuracy:", eval_results[4])

# Function to predict emotions and topic for new data
def predict_emotion_and_topic(text):
    """Predict emotion scores and topic labels for a single raw text.

    The text is cleaned with `clean_text`, encoded with `tokenize_function`
    (same padding/truncation settings as the training data) and passed
    through the trained `model`.

    Args:
        text: Raw input string.

    Returns:
        A dict with two entries:
          * 'Emotions': mapping of each name in `emotions` to its sigmoid
            score from the model.
          * 'Topics': the topic labels whose score is above 0.5, decoded
            with `mlb` (empty when no score passes the threshold).
    """
    cleaned_text = clean_text(text)
    # Reuse the shared helper so inference is encoded exactly like training.
    inputs = tokenize_function(cleaned_text)

    # Hand Keras a plain dict, as the dataset construction does: the tokenizer
    # returns a BatchEncoding wrapper, which not every Keras version accepts
    # as model input.
    emotions_pred, topics_pred = model.predict(dict(inputs))

    emotions_result = dict(zip(emotions, emotions_pred[0]))

    # Apply threshold to topic predictions
    topics_binary = (topics_pred > 0.5).astype(int)
    topics_result = mlb.inverse_transform(topics_binary)[0]

    return {
        'Emotions': emotions_result,
        'Topics': topics_result
    }

# Demo: run the predictor on one Bangla sentence and show the result
new_text = "চমক ভাই সত্যিই একটা চমক"
results = predict_emotion_and_topic(new_text)
print("\nPredictions for new text:", results, sep="\n")

# Evaluation function
def evaluate_predictions(y_true, y_pred, task):
    """Print exact-match accuracy and Hamming loss for multi-label predictions.

    Both metrics are derived from an element-wise mismatch mask rather than
    from arithmetic on the labels, so bool, unsigned-integer and plain nested
    list inputs are handled the same way as signed integer arrays.

    Args:
        y_true: 2-D array-like of ground-truth labels, one row per sample and
            one column per label.
        y_pred: 2-D array-like of predicted labels with the same shape.
        task: Name printed in the report header.

    Raises:
        ValueError: If `y_true` and `y_pred` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )

    mismatches = y_true != y_pred
    # A sample counts as correct only when every one of its labels matches.
    accuracy = np.mean(~np.any(mismatches, axis=1))
    # Fraction of individual label slots that are wrong.
    hamming_loss = np.mean(mismatches)
    print(f"{task} Classification:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Hamming Loss: {hamming_loss:.4f}")

# Score the whole held-out split with evaluate_predictions
test_predictions = model.predict(test_dataset.batch(16))
emotions_pred, topics_pred = test_predictions[:2]

# Binarise both sets of sigmoid scores at 0.5
emotions_pred_binary, topics_pred_binary = (
    (scores > 0.5).astype(int) for scores in (emotions_pred, topics_pred)
)

evaluate_predictions(y_emotions_test, emotions_pred_binary, "Emotions")
evaluate_predictions(y_topic_test, topics_pred_binary, "Topics")