In [None]:
import numpy as np
import pandas as pd
import gensim
from gensim.models import KeyedVectors
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk


# Fetch the NLTK resources used below: the tokenizer models and the
# stop-word corpus.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Load the "word2vec-google-news-300" embeddings through gensim's downloader;
# the result is indexed by word in get_weighted_sentence_vector.
model = api.load("word2vec-google-news-300")

# English stop words, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

# Tokens that preprocess_text treats as negation cues.
negation_words = [
    'not',
    "don't",
    'no',
    'never',
    "can't",
    "won't",
]

def preprocess_text(text):
    """Tokenize *text* and fold negation cues into the word that follows.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Negation
    cues are removed from the output and flip a pending-negation flag. The
    next token that is purely alphabetic and not a stop word is emitted with
    a ``not_`` prefix when the flag is set, and the flag is then cleared.
    Stop words and non-alphabetic tokens are dropped without clearing the
    flag, so "not the best" still yields ``not_best``.

    Args:
        text: Raw document text. Non-string values (for example ``None`` or
            the NaN pandas uses for an empty cell) are treated as empty.

    Returns:
        list[str]: The kept tokens in order; negated ones carry a ``not_``
        prefix.
    """
    # Missing cells would otherwise fail on ``.lower()``.
    if not isinstance(text, str):
        return []

    filtered_tokens = []
    negation = False
    for word in word_tokenize(text.lower()):
        # word_tokenize splits contractions ("don't" -> "do" + "n't"), so the
        # clitic "n't" is the token that carries the negation; whole-word
        # contractions listed in negation_words are not produced as tokens.
        if word == "n't" or word in negation_words:
            negation = not negation  # two consecutive cues cancel out
            continue
        if word.isalpha() and word not in stop_words:
            filtered_tokens.append('not_' + word if negation else word)
            negation = False
    return filtered_tokens

def get_weighted_sentence_vector(tokens, tfidf_weights):
    """Build one sentence vector as the mean of weighted word embeddings.

    Each token's embedding is looked up in the module-level ``model`` and
    scaled by its weight from *tfidf_weights*. Tokens prefixed with ``not_``
    are looked up in ``model`` without the prefix and their weighted vector
    is sign-flipped. Tokens missing from ``model`` are skipped.

    Args:
        tokens: Iterable of token strings, optionally ``not_``-prefixed.
        tfidf_weights: Mapping from token to weight. The token is looked up
            exactly as given first (so a weight stored under ``not_word`` is
            honoured), then under the un-prefixed word, and defaults to 0.0.
            A token with weight 0.0 still contributes a zero vector to the
            mean.

    Returns:
        numpy.ndarray: Element-wise mean of the weighted vectors, or a zero
        vector of length ``model.vector_size`` when no token has an
        embedding.
    """
    vectors = []
    for token in tokens:
        negate = token.startswith('not_')
        base_word = token[4:] if negate else token

        # Only the embedding lookup is expected to raise; keep the try narrow.
        try:
            vec = model[base_word]
        except KeyError:
            continue

        # Weights fitted on the preprocessed text are keyed by the prefixed
        # token. Looking up only the base word would give a word that occurs
        # solely in negated form a weight of 0.0 and erase it.
        weight = tfidf_weights.get(token, tfidf_weights.get(base_word, 0.0))

        weighted = vec * weight
        vectors.append(-weighted if negate else weighted)

    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(model.vector_size)


# Load the dataset and tokenize every document in the 'text' column.
data = pd.read_csv("emotions.csv")
data['tokens'] = data['text'].apply(preprocess_text)

# Fit TF-IDF on the space-joined token lists and keep a term -> IDF mapping
# that serves as the per-word weight.
all_tokens = list(map(' '.join, data['tokens']))
tfidf_vectorizer = TfidfVectorizer().fit(all_tokens)
tfidf_weights = dict(
    zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_)
)

# One weighted sentence vector per document; labels come from 'label'.
X = np.array([
    get_weighted_sentence_vector(doc_tokens, tfidf_weights)
    for doc_tokens in data['tokens']
])
y = data['label'].values

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a 100-tree random forest and predict on the held-out rows.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
y_pred = clf.predict(X_test)

# Report accuracy and the per-class metrics.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
