In [1]:
# importing Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from tokenizers import Tokenizer
from xgboost import XGBClassifier

In [2]:
# loading and reading dataset
df = pd.read_csv("Bangla Emotion Dataset.csv")
df.head()

Unnamed: 0,ID,Data,Love,Joy,Surprise,Anger,Sadness,Fear,Topic,Domain
0,19667,ভালবাসা আরো হাগার গুণ বেড়ে গেল,1,1,0,0,0,0,Personal,Youtube
1,19120,ভালোবাসার আরেক নাম প্রিয় নবী হযরত মুহাম্মদ ( স...,1,0,0,0,0,0,Music,Youtube
2,10666,দেশ থেকে চলে এসে এখন আমি যে তার পাশে দাঁড়িয়ে...,1,1,0,0,0,0,Personal,Facebook
3,17250,অন্তরে শুধু তুমি,1,0,0,0,0,0,Personal,Youtube
4,21877,ভালবেসে বিয়ে করেছেন একজন এসিড দগ্ধ নারীকে। সম্...,1,1,0,0,0,0,Education,Facebook


In [3]:
df.shape

(20642, 10)

In [4]:
df.describe()

Unnamed: 0,ID,Love,Joy,Surprise,Anger,Sadness,Fear
count,20642.0,20642.0,20642.0,20642.0,20642.0,20642.0,20642.0
mean,14127.935374,0.20187,0.443416,0.048106,0.197413,0.250412,0.017731
std,8007.245026,0.401405,0.4968,0.213995,0.398056,0.433261,0.131975
min,8.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7362.25,0.0,0.0,0.0,0.0,0.0,0.0
50%,14313.5,0.0,0.0,0.0,0.0,0.0,0.0
75%,20789.75,0.0,1.0,0.0,0.0,1.0,0.0
max,28164.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Clean text using bnlp library
from bnlp import CleanText

clean_text = CleanText(
   fix_unicode=True,
   unicode_norm=True,
   unicode_norm_form="NFKC",
   remove_url=True,
   remove_email=True,
   remove_emoji=True,
   remove_number=True,
   remove_digits=True,
   remove_punct=True,
   replace_with_url="",
   replace_with_email="",
   replace_with_number="",
   replace_with_digit="",
   replace_with_punct = ""
)

df['clean_description'] = df['Data'].apply(clean_text)
print("Clean descriptions:")
print(df['clean_description'].head(15))

Clean descriptions:
0                       ভালবাসা আরো হাগার গুণ বেড়ে গেল
1       ভালোবাসার আরেক নাম প্রিয় নবী হযরত মুহাম্মদ  সা
2     দেশ থেকে চলে এসে এখন আমি যে তার পাশে দাঁড়িয়ে...
3                                      অন্তরে শুধু তুমি
4     ভালবেসে বিয়ে করেছেন একজন এসিড দগ্ধ নারীকে সম্...
5      নোয়াখালী এসে ব্লগ করর অনুরদ রইলো আফ্রিদি ভাইয়া
6                                 সুন্দর একটা ব্লগ ছিলো
7     মানুষের দোয়াভালোবাসা নিয়ে সাফল্যের হিমালয় প...
8                              চমক ভাই সত্যিই একটা চমক 
9                                      অনেক দারুন লাগলো
10                               জীবনে আপনার মত হতে চাই
11                                    অনেক অনেক ধন্যবাদ
12        দারুন বস সামনে আরো নতুন অনেক চমক দিবে আসা করি
13     অসাধারণ পারভেজ ভাই অসাধারণ কম্বিনেশন ডিজে রাহাত 
14                            বড় মানুষের মনটাও বড় হয়
Name: clean_description, dtype: object


In [6]:
# Prepare features
X = df['clean_description']

emotions = ['Love', 'Joy', 'Surprise', 'Anger', 'Sadness', 'Fear']
y_emotions = df[emotions].values

# Prepare topic labels
mlb = MultiLabelBinarizer()
y_topic = mlb.fit_transform(df['Topic'].str.split(','))

# TF-IDF Vectorization with n-grams
tfidf = TfidfVectorizer(max_features=None, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(X)

# Split the data
X_train, X_test, y_emotions_train, y_emotions_test, y_topic_train, y_topic_test = train_test_split(
    X_tfidf, y_emotions, y_topic, test_size=0.2, random_state=42)


In [7]:
# XGBoost for Emotion Classification
xgb_emotions = MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
xgb_emotions.fit(X_train, y_emotions_train)

# XGBoost for Topic Classification
xgb_topic = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_topic.fit(X_train, y_topic_train)

In [8]:
from sklearn.metrics import accuracy_score, hamming_loss
# Evaluation function
def evaluate_model(model, X_test, y_test, model_name, task):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    hl = hamming_loss(y_test, y_pred)
    print(f"{model_name} - {task.capitalize()} Classification:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Hamming Loss: {hl:.4f}")
    
# Evaluate models
evaluate_model(xgb_emotions, X_test, y_emotions_test, "XGBoost", "emotions")
evaluate_model(xgb_topic, X_test, y_topic_test, "XGBoost", "topic")


XGBoost - Emotions Classification:
  Accuracy: 0.2490
  Hamming Loss: 0.1621
XGBoost - Topic Classification:
  Accuracy: 0.2037
  Hamming Loss: 0.0723


In [9]:
# Function to predict emotions and topic for new data
def predict_emotion_and_topic(text):
    cleaned_text = clean_text(text)
    input_tfidf = tfidf.transform([cleaned_text])
    
    emotions_pred = xgb_emotions.predict(input_tfidf)[0]
    topic_pred = xgb_topic.predict(input_tfidf)[0]
    
    return {
        'Emotions': dict(zip(emotions, emotions_pred)),
        'Topics': mlb.inverse_transform(topic_pred.reshape(1, -1))[0]
    }

# Example usage
new_text = "চমক ভাই সত্যিই একটা চমক"
results = predict_emotion_and_topic(new_text)
print("\nPredictions for new text:")
print(results)


Predictions for new text:
{'Emotions': {'Love': 1, 'Joy': 1, 'Surprise': 0, 'Anger': 0, 'Sadness': 0, 'Fear': 0}, 'Topics': ('Education',)}
