# In [None]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
import torch
import nltk


# Fetch the NLTK data used below: tokenizer models for word_tokenize() and the
# stop-word lists for stopwords.words().
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) load the tokenizer models from the
# separate 'punkt_tab' package instead of 'punkt'; without it word_tokenize()
# raises LookupError. Fetching both keeps the script working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')


# Pretrained tokenizer/encoder pair used by get_sentence_embedding().
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
# The model is only used for inference; make sure dropout is switched off.
model.eval()
# Kept as a set so the membership test in preprocess_text() is a hash lookup.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase, tokenize and filter *text*, returning a space-joined string.

    A token is kept only when it is purely alphabetic and is not contained in
    the module-level ``stop_words`` set.

    Args:
        text: The raw text to clean. Non-string values (for example ``None``
            or the float ``NaN`` that pandas produces for empty CSV cells)
            are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces; ``""`` when no token
        survives or when *text* is not a string.
    """
    # Missing cells have no .lower(); treat them as empty text instead of
    # aborting a whole column-wide .apply() on the first bad row.
    if not isinstance(text, str):
        return ""

    tokens = word_tokenize(text.lower())
    filtered_tokens = [
        word for word in tokens if word.isalpha() and word not in stop_words
    ]
    return " ".join(filtered_tokens)

def get_sentence_embedding(text):
    """Embed *text* using the module-level ``tokenizer`` and ``model``.

    The text is tokenized into PyTorch tensors (truncation enabled, maximum
    length 128) and passed through the model with gradient tracking turned
    off. The last hidden state at sequence position 0 is used as the
    embedding.

    Args:
        text: The input handed to ``tokenizer``; this script passes one
            string per call.

    Returns:
        A NumPy array containing the position-0 hidden state with every
        size-1 dimension squeezed away (a 1-D vector for a single string).
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )

    # Inference only, so no autograd graph needs to be recorded.
    with torch.no_grad():
        model_output = model(**encoded)

    # Position 0 on the sequence axis; presumably the [CLS] slot for the
    # BERT-style tokenizer loaded above.
    first_token_state = model_output.last_hidden_state[:, 0, :]
    return first_token_state.squeeze().numpy()


# Load the dataset; the code below reads its "text" and "label" columns.
data = pd.read_csv("emotions.csv")
data["processed_text"] = data["text"].map(preprocess_text)


# Integer label id -> emotion name; the id is the position in this tuple.
emotion_names = ('sadness', 'joy', 'love', 'anger', 'fear', 'surprise')
numeric_to_string_mapping = dict(enumerate(emotion_names))
data["label_string"] = data["label"].map(numeric_to_string_mapping)


# One embedding per cleaned text, stacked row-wise into a single array.
sentence_vectors = [
    get_sentence_embedding(sentence) for sentence in data["processed_text"]
]
embeddings = np.array(sentence_vectors)


# Inverse lookup (emotion name -> integer id) used to rebuild numeric labels.
string_to_numeric_mapping = dict(
    zip(numeric_to_string_mapping.values(), numeric_to_string_mapping.keys())
)
data["label_numeric"] = data["label_string"].map(string_to_numeric_mapping)


# Hold out 20% of the rows, stratified on the numeric label, with a fixed seed.
labels = data["label_numeric"]
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels.values,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)


# PCA is fitted on the training part only; the test part is projected with
# the same fitted components.
pca = PCA(n_components=50)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)


clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Evaluate on the held-out part and print accuracy plus the per-class report.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
