In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import random
from collections import defaultdict
nltk.download('stopwords')
from nltk.corpus import stopwords


# Integer class ids (as found in the dataset's `label` column) -> emotion name.
label_mapping = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise',
}

# Instrumental music suggestions keyed by the emotion names used in
# label_mapping.
music_recommendations = {
    'sadness': ['Melancholic Piano', 'Sad Violin Music'],
    'joy': ['Happy Acoustic Guitar', 'Uplifting Piano'],
    'love': ['Romantic Piano', 'Love Songs Instrumental'],
    'anger': ['Intense Rock Instrumental', 'Heavy Metal Instrumental'],
    'fear': ['Dark Cinematic Music', 'Tense Ambient Soundscapes'],
    'surprise': ['Energetic Orchestral Music', 'Exciting Electronic Beats'],
}

# The `text` and `label` columns of this CSV are read further down the file.
data = pd.read_csv('emotions.csv')

# Upper bound on the number of rows used for training.
sample_size = 1000

# A fixed seed keeps the subset reproducible. The requested size is clamped to
# the number of available rows because DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
data_sample = data.sample(
    n=min(sample_size, len(data)),
    random_state=42,
).reset_index(drop=True)

# English stop words, stored as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))

# Apostrophe-less contractions and their expansions.
# NOTE: "were" is deliberately NOT mapped to "we are": it is far more often the
# ordinary past-tense verb, and rewriting it also mangled the expansion of
# "werent" ("were not" became "we are not").
_CONTRACTIONS = {
    "dont": "do not",
    "cant": "cannot",
    "wont": "will not",
    "im": "i am",
    "ive": "i have",
    "id": "i would",
    "youre": "you are",
    "isnt": "is not",
    "wasnt": "was not",
    "shouldnt": "should not",
    "couldnt": "could not",
    "doesnt": "does not",
    "havent": "have not",
    "hasnt": "has not",
    "hadnt": "had not",
    "arent": "are not",
    "werent": "were not",
    "wouldnt": "would not",
    "mustnt": "must not",
    "mightnt": "might not",
    "didnt": "did not",
    "neednt": "need not",
    "oughtnt": "ought not",
    "hes": "he is",
    "shes": "she is",
    "its": "it is",
    "thats": "that is",
    "theres": "there is",
    "whats": "what is",
    "wheres": "where is",
    "whos": "who is",
    "theyre": "they are",
    "weve": "we have",
}

# A single whole-word alternation, compiled once, replaces one re.sub pass per
# table entry on every call. No expansion contains another table key, so one
# pass gives the same result as applying the entries sequentially.
_CONTRACTION_RE = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, _CONTRACTIONS)) + r')\b'
)
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def preprocess_text(text):
    """Turn raw text into a list of normalised tokens.

    Steps, in order:
      1. lower-case the text;
      2. expand apostrophe-less contractions (e.g. "dont" -> "do not");
      3. split on whitespace and strip every non-letter character from each
         token, discarding tokens that become empty;
      4. fold negations into the following token via handle_negations
         (e.g. "not", "good" -> "not_good");
      5. drop tokens that are in stop_words.

    Non-letters are stripped *before* negation handling so that the "not_"
    marker survives (otherwise "not able" would collapse to "notable") and so
    that punctuation or digits cannot hide a negation word or get negated.

    Args:
        text: The input string.

    Returns:
        A list of token strings.
    """
    text = text.lower()
    text = _CONTRACTION_RE.sub(lambda m: _CONTRACTIONS[m.group(0)], text)

    tokens = [_NON_LETTER_RE.sub('', token) for token in text.split()]
    tokens = [token for token in tokens if token]

    tokens = handle_negations(tokens)

    return [token for token in tokens if token and token not in stop_words]

def handle_negations(tokens):
    """Fold each negation word into the token that follows it.

    A negation word is removed from the output and the next non-negation
    token is emitted with a ``not_`` prefix. Consecutive negation words count
    as a single negation, and a negation with nothing after it is dropped.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list of tokens with negations applied.
    """
    negators = {'no', 'not', 'never', 'none', 'cannot', 'dont', 'do not'}
    result = []
    pending = False
    for word in tokens:
        if word in negators:
            # Remember the negation; the word itself is not emitted.
            pending = True
            continue
        result.append('not_' + word if pending else word)
        pending = False
    return result


# Tokenise every sampled document once.
documents = data_sample['text'].tolist()
processed_docs = [preprocess_text(doc) for doc in documents]

# Build the vocabulary in sorted order so that word ids are reproducible
# between interpreter runs; iterating a set of strings has no stable order.
vocab = sorted({word for doc in processed_docs for word in doc})
vocab_to_id = {word: idx for idx, word in enumerate(vocab)}
id_to_vocab = dict(enumerate(vocab))

# Each document as a list of word ids. Every token is in the vocabulary by
# construction, so no membership filter is needed.
documents_word_ids = [[vocab_to_id[word] for word in doc] for doc in processed_docs]

# Number of topics.
K = 10

# Smoothing constants added to the document-topic and topic-word counts.
alpha = 0.1
beta = 0.1

D = len(documents_word_ids)  # number of documents
V = len(vocab)               # vocabulary size

# Count tables, pre-loaded with their smoothing terms so the sampler can use
# them directly as (unnormalised) probabilities:
#   N_dk[d, k]  tokens of document d assigned to topic k   (+ alpha)
#   N_kw[k, w]  tokens of word w assigned to topic k       (+ beta)
#   N_k[k]      tokens assigned to topic k overall         (+ V * beta)
N_dk = np.zeros((D, K)) + alpha
N_kw = np.zeros((K, V)) + beta
N_k = np.zeros(K) + V * beta

# Give every token a uniformly random starting topic and record it in the
# count tables; topic_assignments[d][i] is the topic of token i of document d.
topic_assignments = []
for d, doc in enumerate(documents_word_ids):
    current_doc_topics = []
    for w in doc:
        k = random.randint(0, K - 1)
        N_dk[d, k] += 1
        N_kw[k, w] += 1
        N_k[k] += 1
        current_doc_topics.append(k)
    topic_assignments.append(current_doc_topics)


def gibbs_sampling(iterations):
    """Resample the topic of every token for ``iterations`` full sweeps.

    Operates in place on the module-level count tables (N_dk, N_kw, N_k) and
    on topic_assignments; nothing is returned. A progress line is printed
    after every tenth sweep.

    Args:
        iterations: Number of passes over the whole corpus.
    """
    for sweep in range(1, iterations + 1):
        for doc_idx, word_ids in enumerate(documents_word_ids):
            doc_topics = topic_assignments[doc_idx]
            doc_counts = N_dk[doc_idx]  # row view: writes go through to N_dk

            for pos, word in enumerate(word_ids):
                old_topic = doc_topics[pos]

                # Take the token out of the counts before scoring topics.
                doc_counts[old_topic] -= 1
                N_kw[old_topic, word] -= 1
                N_k[old_topic] -= 1

                # Per-topic weight: (share of the topic's tokens that are this
                # word) x (share of this document's tokens in the topic).
                word_term = N_kw[:, word] / N_k
                doc_term = doc_counts / np.sum(doc_counts)
                weights = word_term * doc_term
                weights /= np.sum(weights)

                sampled = np.random.choice(np.arange(K), p=weights)

                # Put the token back under its newly drawn topic.
                doc_counts[sampled] += 1
                N_kw[sampled, word] += 1
                N_k[sampled] += 1
                doc_topics[pos] = sampled

        if sweep % 10 == 0:
            print(f"Iteration {sweep} completed.")

# Train the topic model.
iterations = 100
gibbs_sampling(iterations)

# Dominant topic of every document, computed once up front rather than
# re-running argmax over all documents for each of the K topics.
dominant_topics = np.argmax(N_dk, axis=1)

# Label each topic with the majority emotion among the documents it dominates;
# topics that dominate no document are labelled 'unknown'.
topic_to_emotion = {}
for k in range(K):
    topic_docs = [d for d in range(D) if dominant_topics[d] == k]
    labels = data_sample.iloc[topic_docs]['label'].tolist()
    if labels:
        labels_counts = np.bincount(labels)
        majority_label = np.argmax(labels_counts)
        topic_to_emotion[k] = label_mapping[majority_label]
    else:
        topic_to_emotion[k] = 'unknown'

print("Topic to Emotion Mapping:")
for k in range(K):
    print(f"Topic {k}: {topic_to_emotion[k]}")

def predict_emotion(text, iterations_new=20):
    """Predict an emotion name for a new piece of text.

    The text is preprocessed, reduced to the word ids known to the trained
    vocabulary, and its tokens are Gibbs-sampled against the trained
    topic-word counts (N_kw and N_k are only read, never updated). The
    document's most frequent topic is then translated through
    topic_to_emotion.

    Args:
        text: Raw input string.
        iterations_new: Number of sampling sweeps over the text's tokens.

    Returns:
        The emotion name of the dominant topic, or 'unknown' when the text
        contains no in-vocabulary words or the topic has no emotion mapping.
    """
    tokens = preprocess_text(text)
    word_ids = [vocab_to_id[word] for word in tokens if word in vocab_to_id]

    # With no known words every topic count equals alpha, so argmax would
    # always pick topic 0 and return an arbitrary emotion. Say so instead.
    if not word_ids:
        return 'unknown'

    # Topic counts for this document only, pre-loaded with the alpha smoothing.
    N_dk_new = np.zeros(K) + alpha

    # Random initial topic per token.
    topic_assignments_new = []
    for _ in word_ids:
        k = random.randint(0, K - 1)
        N_dk_new[k] += 1
        topic_assignments_new.append(k)

    for _ in range(iterations_new):
        for i, w in enumerate(word_ids):
            k = topic_assignments_new[i]

            # Remove the token before scoring topics for it.
            N_dk_new[k] -= 1

            left = N_kw[:, w] / N_k
            right = N_dk_new / np.sum(N_dk_new)
            p_k = left * right
            p_k /= np.sum(p_k)

            new_k = np.random.choice(np.arange(K), p=p_k)

            N_dk_new[new_k] += 1
            topic_assignments_new[i] = new_k

    theta_new = N_dk_new / np.sum(N_dk_new)

    pred_topic = int(np.argmax(theta_new))
    return topic_to_emotion.get(pred_topic, 'unknown')

def compute_accuracy():
    """Print the share of sampled documents whose dominant topic maps to
    their true emotion.

    For each document the topic with the largest entry in N_dk is translated
    through topic_to_emotion and compared with the emotion name of the
    document's `label` value. The result is printed with four decimals;
    nothing is returned.
    """
    hits = sum(
        topic_to_emotion.get(np.argmax(N_dk[row, :]), 'unknown')
        == label_mapping[data_sample.iloc[row]['label']]
        for row in range(D)
    )
    accuracy = hits / D
    print(f"Accuracy: {accuracy:.4f}")

# Print how often a document's dominant topic maps to its true emotion label.
compute_accuracy()
