TFIDF + NAIVE BAYES

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB


# Load the pre-processed corpus; rows without text become empty documents.
data = pd.read_csv("processed_emotions_dataset_2.csv")
data['preprocessed_text'] = data['preprocessed_text'].fillna("")

# Encode each distinct label as an integer, numbered in order of first appearance.
label_mapping = {label: idx for idx, label in enumerate(pd.unique(data['label']))}
data['label'] = data['label'].map(label_mapping)

X, y = data['preprocessed_text'], data['label']

# Fixed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The TF-IDF vocabulary (capped at 5000 terms) is learned on the training
# split only and then re-used to transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Multinomial Naive Bayes on the TF-IDF features (fit returns the estimator).
model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 80.80%


In [None]:
import numpy as np
import gensim
from gensim.models import KeyedVectors
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score





# Make sure the NLTK resources used in this cell are available locally.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Pre-trained embeddings fetched through gensim's downloader API.
model = api.load("word2vec-google-news-300")

# English stop words (as a set) and the tokens treated as negation markers
# by preprocess_text.
stop_words = {*stopwords.words('english')}
negation_words = [
    'not',
    "don't",
    'no',
    'never',
    "can't",
    "won't",
]

def preprocess_text(text):
    """Tokenise ``text`` into lower-cased content words with negation marking.

    A negation marker toggles a pending-negation flag and is itself dropped.
    The next kept word (alphabetic and not a stop word) gets a ``not_``
    prefix if the flag is set, after which the flag is cleared.  Two
    consecutive markers therefore cancel each other out.

    Args:
        text: Raw sentence.  Non-string values (e.g. ``None`` or a NaN coming
            from a missing CSV cell) are treated as an empty sentence.

    Returns:
        list[str]: The kept tokens, in order, some prefixed with ``not_``.
    """
    if not isinstance(text, str):
        return []

    tokens = word_tokenize(text.lower())
    filtered_tokens = []
    negation = False
    for word in tokens:
        # word_tokenize splits contractions ("don't" -> "do" + "n't"), so the
        # contracted forms listed in negation_words never appear as a single
        # token; the clitic "n't" has to be recognised explicitly.
        if word == "n't" or word in negation_words:
            negation = not negation
            continue
        if word.isalpha() and word not in stop_words:
            if negation:
                word = 'not_' + word
            filtered_tokens.append(word)
            negation = False
    return filtered_tokens

def get_sentence_vector(tokens):
    """Average the embeddings of ``tokens`` into one sentence vector.

    A token starting with ``not_`` is looked up without that prefix and its
    embedding is sign-flipped.  Tokens for which ``model`` raises ``KeyError``
    are skipped.

    Returns:
        The element-wise mean of the collected embeddings, or a zero vector
        of length ``model.vector_size`` when no token could be looked up.
    """
    collected = []
    for token in tokens:
        is_negated = token.startswith('not_')
        lookup = token[4:] if is_negated else token
        try:
            embedding = model[lookup]
        except KeyError:
            continue
        collected.append(-embedding if is_negated else embedding)

    if not collected:
        return np.zeros(model.vector_size)
    return np.mean(collected, axis=0)



# Integer label id -> emotion name, and the inverse lookup built from it.
numeric_to_string_mapping = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}
label_mapping = {
    emotion: idx for idx, emotion in enumerate(numeric_to_string_mapping.values())
}

# (sentence, emotion name) pairs read from the raw dataset.
data = pd.read_csv("emotions.csv")
train_data = [
    (sentence, numeric_to_string_mapping[label])
    for sentence, label in zip(data["text"], data["label"])
]

# One averaged embedding per sentence, with the matching integer class id.
X = np.array(
    [get_sentence_vector(preprocess_text(sentence)) for sentence, _ in train_data]
)
y = np.array([label_mapping[emotion] for _, emotion in train_data])

# Fixed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



# Fisher linear discriminant analysis fitted on the training split, followed
# by nearest-centroid classification in the projected space.
n_features = model.vector_size
classes = np.unique(y_train)

# Per-class mean embedding, in the (sorted) order of `classes`.
mean_vectors = [np.mean(X_train[y_train == cl], axis=0) for cl in classes]
overall_mean = np.mean(X_train, axis=0).reshape(n_features, 1)

# Within-class (S_W) and between-class (S_B) scatter matrices.
# Rows are selected by the class *label* (not by its position in the list),
# and the sum of per-row outer products is computed as one matrix product.
S_W = np.zeros((n_features, n_features))
S_B = np.zeros((n_features, n_features))
for cl, mv in zip(classes, mean_vectors):
    class_rows = X_train[y_train == cl]
    # Accumulate in float64 even if the embeddings are stored as float32.
    centered = (class_rows - mv).astype(np.float64, copy=False)
    S_W += centered.T.dot(centered)
    mean_diff = mv.reshape(n_features, 1) - overall_mean
    S_B += class_rows.shape[0] * mean_diff.dot(mean_diff.T)

# Discriminant directions: eigenvectors of pinv(S_W) @ S_B, ranked by the
# magnitude of their eigenvalue.
eig_vals, eig_vecs = np.linalg.eig(np.linalg.pinv(S_W).dot(S_B))
eig_pairs = sorted(
    ((np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))),
    key=lambda pair: pair[0],
    reverse=True,
)

# Keep (number of classes - 1) directions.  np.linalg.eig returns complex
# arrays whenever round-off yields complex pairs anywhere in the spectrum;
# only the real part is kept so the projections stay real-valued.
# NOTE(review): assumes the leading k eigenvectors are real up to round-off.
k = len(classes) - 1
W = np.real(np.hstack([eig_pairs[i][1].reshape(n_features, 1) for i in range(k)]))

X_train_lda = X_train.dot(W)
X_test_lda = X_test.dot(W)

# Class centroids in the projected space, same order as `classes`.
mean_vectors_lda = [np.mean(X_train_lda[y_train == cl], axis=0) for cl in classes]

# Assign each test sample the label of its nearest centroid; the argmin
# position is translated back into the actual class label.
y_pred = []
for sample in X_test_lda:
    distances = [np.linalg.norm(sample - centroid) for centroid in mean_vectors_lda]
    y_pred.append(classes[np.argmin(distances)])


accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Accuracy: 69.31%


In [None]:
import numpy as np
import pandas as pd
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Pre-trained embeddings fetched through gensim's downloader API.
model = api.load("word2vec-google-news-300")

# Keep only the two columns used below.  `.assign` builds a new frame, so the
# text clean-up is not a write into a slice of the frame returned by read_csv
# (which is what triggers pandas' SettingWithCopyWarning).
data = (
    pd.read_csv("processed_emotions_dataset_2.csv")[['preprocessed_text', 'label']]
    .assign(
        preprocessed_text=lambda frame: frame['preprocessed_text'].fillna('').astype(str)
    )
)

# One sentence vector per row: the mean embedding of the whitespace-separated
# tokens known to the model, or a zero vector when none are known.
X = []
for sentence in data['preprocessed_text']:
    vectors = [model[word] for word in sentence.split() if word in model]
    if vectors:
        X.append(np.mean(vectors, axis=0))
    else:
        X.append(np.zeros(model.vector_size))

X = np.array(X)
y = data['label'].to_numpy()

# Fixed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Fisher linear discriminant analysis fitted on the training split, followed
# by nearest-centroid classification in the projected space.
n_features = model.vector_size
classes = np.unique(y_train)

# Per-class mean embedding, in the (sorted) order of `classes`.
mean_vectors = [np.mean(X_train[y_train == cl], axis=0) for cl in classes]
overall_mean = np.mean(X_train, axis=0).reshape(n_features, 1)

# Within-class (S_W) and between-class (S_B) scatter matrices.
# `y_train` holds the labels exactly as stored in the CSV, so rows must be
# selected by the class label itself rather than by its position in the list.
# The sum of per-row outer products is computed as one matrix product.
S_W = np.zeros((n_features, n_features))
S_B = np.zeros((n_features, n_features))
for cl, mv in zip(classes, mean_vectors):
    class_rows = X_train[y_train == cl]
    # Accumulate in float64 even if the embeddings are stored as float32.
    centered = (class_rows - mv).astype(np.float64, copy=False)
    S_W += centered.T.dot(centered)
    mean_diff = mv.reshape(n_features, 1) - overall_mean
    S_B += class_rows.shape[0] * mean_diff.dot(mean_diff.T)

# Discriminant directions: eigenvectors of pinv(S_W) @ S_B, ranked by the
# magnitude of their eigenvalue.
eig_vals, eig_vecs = np.linalg.eig(np.linalg.pinv(S_W).dot(S_B))
eig_pairs = sorted(
    ((np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))),
    key=lambda pair: pair[0],
    reverse=True,
)

# Keep (number of classes - 1) directions.  np.linalg.eig returns complex
# arrays whenever round-off yields complex pairs anywhere in the spectrum;
# only the real part is kept so the projections stay real-valued.
# NOTE(review): assumes the leading k eigenvectors are real up to round-off.
k = len(classes) - 1
W = np.real(np.hstack([eig_pairs[i][1].reshape(n_features, 1) for i in range(k)]))

X_train_lda = X_train.dot(W)
X_test_lda = X_test.dot(W)

# Class centroids in the projected space, same order as `classes`.
mean_vectors_lda = [np.mean(X_train_lda[y_train == cl], axis=0) for cl in classes]

# Assign each test sample the label of its nearest centroid; the argmin
# position is translated back into the actual class label so the predictions
# are comparable with `y_test` whatever values the labels take.
y_pred = []
for sample in X_test_lda:
    distances = [np.linalg.norm(sample - centroid) for centroid in mean_vectors_lda]
    y_pred.append(classes[np.argmin(distances)])


accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 53.57%


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
import nltk


# Download the NLTK 'punkt' resource before any tokenisation is attempted.
nltk.download('punkt')


# Tokens that toggle the negation state in handle_negation.
# NOTE(review): NLTK's word_tokenize is documented to split contractions
# ("don't" -> "do" + "n't"), so the contracted entries may never match a
# single token -- confirm against the tokenizer in use.
negation_words = ['not', "don't", 'no', 'never', "can't", "won't"]

def handle_negation(text):
    """Rewrite ``text`` so that negated words carry a ``not_`` prefix.

    The text is tokenised; a negation marker toggles a pending-negation flag
    and is dropped.  Every other purely alphabetic token is kept, prefixed
    with ``not_`` when the flag is set (which also clears the flag).
    Non-alphabetic tokens are discarded.

    Args:
        text: Sentence to process.

    Returns:
        str: The kept tokens joined by single spaces.
    """
    tokens = word_tokenize(text)
    processed_tokens = []
    negation = False
    for word in tokens:
        lowered = word.lower()
        # word_tokenize splits contractions ("don't" -> "do" + "n't"), so the
        # contracted forms in negation_words never arrive as one token; the
        # clitic "n't" is therefore treated as a negation marker as well.
        if lowered == "n't" or lowered in negation_words:
            negation = not negation
        elif word.isalpha():
            if negation:
                word = 'not_' + word
            processed_tokens.append(word)
            negation = False
    return ' '.join(processed_tokens)


# Load the pre-processed corpus, replace missing text with an empty string,
# then mark negated words with the `not_` prefix.
data = pd.read_csv("processed_emotions_dataset_2.csv")
cleaned_text = data['preprocessed_text'].fillna("")
data['preprocessed_text'] = cleaned_text.apply(handle_negation)

# Encode each distinct label as an integer, numbered in order of first appearance.
label_mapping = {label: idx for idx, label in enumerate(pd.unique(data['label']))}
data['label'] = data['label'].map(label_mapping)

X, y = data['preprocessed_text'], data['label']

# Fixed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF vocabulary (capped at 5000 terms) learned on the training split only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Multinomial Naive Bayes on the TF-IDF features (fit returns the estimator).
model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model Accuracy: 80.79%


In [None]:
import pandas as pd


# Encoded id -> original label, i.e. the inverse of label_mapping.
reverse_label_mapping = {idx: label for label, idx in label_mapping.items()}

# Side-by-side view of true and predicted classes, both as encoded ids and
# decoded back to the original labels.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).assign(
    Actual_Label=lambda frame: frame['Actual'].map(reverse_label_mapping),
    Predicted_Label=lambda frame: frame['Predicted'].map(reverse_label_mapping),
)

print(comparison_df.head(20))


        Actual  Predicted  Actual_Label  Predicted_Label
228500       0          0             4                4
408551       3          3             1                1
201709       3          3             1                1
405406       1          1             0                0
74096        2          3             2                1
358880       2          3             2                1
336349       1          1             0                0
340486       2          3             2                1
298125       3          3             1                1
87054        1          1             0                0
416083       2          3             2                1
153467       1          3             0                1
418437       2          3             2                1
18701        0          0             4                4
94059        3          3             1                1
332          0          0             4                4
324917       1          1      

In [None]:

# precision_recall_fscore_support is not imported anywhere else in the
# notebook; import it here so the cell runs on a fresh kernel.
from sklearn.metrics import precision_recall_fscore_support

# Per-class scores, one entry per encoded label id in ascending order.
encoded_labels = np.unique(y)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average=None, labels=encoded_labels
)

# Decode each id back to its original label explicitly so that every metrics
# row is guaranteed to sit next to the label it belongs to.
id_to_label = {idx: label for label, idx in label_mapping.items()}
metrics_df = pd.DataFrame({
    'Label': [id_to_label[idx] for idx in encoded_labels],
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
})

print(metrics_df)

   Label  Precision    Recall  F1 Score
0      4   0.880450  0.663430  0.756687
1      0   0.819868  0.928403  0.870767
2      2   0.918909  0.370555  0.528137
3      1   0.744127  0.956956  0.837227
4      5   0.951948  0.230141  0.370670
5      3   0.920896  0.734761  0.817365


In [None]:

# Suggested listening for each emotion name.
music_recommendations = {
    'sadness': ['Melancholic Piano', 'Sad Violin Music'],
    'joy': ['Happy Acoustic Guitar', 'Uplifting Piano'],
    'love': ['Romantic Piano', 'Love Songs Instrumental'],
    'anger': ['Intense Rock Instrumental', 'Heavy Metal Instrumental'],
    'fear': ['Dark Cinematic Music', 'Tense Ambient Soundscapes'],
    'surprise': ['Energetic Orchestral Music', 'Exciting Electronic Beats'],
}

user_input = input("Enter your text: ")

# get_sentence_vector reads the global `model`, which other cells in this
# notebook rebind to a scikit-learn classifier.  Fail with an actionable
# message instead of an obscure TypeError from `model[word]`.
# NOTE(review): W and mean_vectors_lda are likewise taken from whichever LDA
# cell ran last -- confirm it used the same preprocessing as this cell.
if not hasattr(model, "vector_size"):
    raise RuntimeError(
        "`model` is not the word2vec embedding model; re-run the cell that "
        "loads it (and computes W / mean_vectors_lda) before this one."
    )

tokens = preprocess_text(user_input)
vector = get_sentence_vector(tokens).reshape(1, -1)

if not vector.any():
    # Empty input, or no token known to the embedding model: the all-zero
    # vector carries no information, so do not report an arbitrary class.
    print("Could not detect an emotion: the text contains no words known to the embedding model.")
else:
    # Project into the LDA space and pick the nearest class centroid.
    vector_lda = vector.dot(W)
    distances = [np.linalg.norm(vector_lda - mean_vec) for mean_vec in mean_vectors_lda]
    predicted_class = int(np.argmin(distances))

    emotion = numeric_to_string_mapping[predicted_class]

    print(f"Detected Emotion: {emotion.capitalize()}")
    print(f"Recommended Music for {emotion}: {music_recommendations[emotion]}")


In [None]:
import numpy as np
import pandas as pd
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE


# Make sure the NLTK resources used in this cell are available locally.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Pre-trained embeddings fetched through gensim's downloader API.
model = api.load("word2vec-google-news-300")

# English stop words (as a set) and the tokens treated as negation markers
# by preprocess_text.
stop_words = {*stopwords.words('english')}
negation_words = [
    'not',
    "don't",
    'no',
    'never',
    "can't",
    "won't",
]

def preprocess_text(text):
    """Tokenise ``text`` into lower-cased content words with negation marking.

    A negation marker toggles a pending-negation flag and is itself dropped.
    The next kept word (alphabetic and not a stop word) gets a ``not_``
    prefix if the flag is set, after which the flag is cleared.  Two
    consecutive markers therefore cancel each other out.

    Args:
        text: Raw sentence.  Non-string values (e.g. ``None`` or a NaN coming
            from a missing CSV cell) are treated as an empty sentence.

    Returns:
        list[str]: The kept tokens, in order, some prefixed with ``not_``.
    """
    if not isinstance(text, str):
        return []

    tokens = word_tokenize(text.lower())
    filtered_tokens = []
    negation = False
    for word in tokens:
        # word_tokenize splits contractions ("don't" -> "do" + "n't"), so the
        # contracted forms listed in negation_words never appear as a single
        # token; the clitic "n't" has to be recognised explicitly.
        if word == "n't" or word in negation_words:
            negation = not negation
            continue
        if word.isalpha() and word not in stop_words:
            if negation:
                word = 'not_' + word
            filtered_tokens.append(word)
            negation = False
    return filtered_tokens

def get_sentence_vector(tokens):
    """Average the embeddings of ``tokens`` into one sentence vector.

    A token starting with ``not_`` is looked up without that prefix and its
    embedding is sign-flipped.  Tokens for which ``model`` raises ``KeyError``
    are skipped.

    Returns:
        The element-wise mean of the collected embeddings, or a zero vector
        of length ``model.vector_size`` when no token could be looked up.
    """
    collected = []
    for token in tokens:
        is_negated = token.startswith('not_')
        lookup = token[4:] if is_negated else token
        try:
            embedding = model[lookup]
        except KeyError:
            continue
        collected.append(-embedding if is_negated else embedding)

    if not collected:
        return np.zeros(model.vector_size)
    return np.mean(collected, axis=0)


data = pd.read_csv("emotions.csv")

# One averaged embedding per sentence; labels are taken straight from the
# `label` column.
print("Processing text data...")
X = np.array([get_sentence_vector(preprocess_text(sentence)) for sentence in data['text']])
y = data['label'].to_numpy()

# Split BEFORE oversampling.  Running SMOTE on the full dataset would let
# synthetic points interpolated from test rows leak into the training set and
# would fill the test set with synthetic samples, inflating the reported
# scores.  The hold-out set below contains only real rows.
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes of the training split only.
print("Applying SMOTE to balance the dataset...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("Training Logistic Regression classifier...")
classifier = LogisticRegression(
    max_iter=1000, multi_class='multinomial', solver='lbfgs', class_weight='balanced'
)
classifier.fit(X_resampled, y_resampled)

print("Predicting on test data...")
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Integer label id -> emotion name, used to label the report rows.
numeric_to_string_mapping = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}
print("\nClassification Report:")
# Passing `labels` pins each target name to its label id explicitly.
print(classification_report(
    y_test,
    y_pred,
    labels=list(numeric_to_string_mapping),
    target_names=[numeric_to_string_mapping[i] for i in range(6)],
))


In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import time

# Read the processed dataset (first CSV column is the index), add a column
# holding each text split on whitespace, then drop rows with any missing value.
preprocessed_data = (
    pd.read_csv('processed_emotions_dataset.csv', index_col=0)
    .assign(
        preprocessed_text_split=lambda frame: frame['preprocessed_text'].str.split()
    )
    .dropna()
)
print(preprocessed_data)