In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from xgboost import XGBClassifier
from datasets import Dataset


# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the dataset; its "label" column is mapped to emotion names below.
data = pd.read_csv("emotions.csv")

# Lookup tables between integer class ids and emotion names (both directions).
numeric_to_string_mapping = dict(
    enumerate(['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'])
)
string_to_numeric_mapping = dict(
    zip(numeric_to_string_mapping.values(), numeric_to_string_mapping.keys())
)

# Human-readable copy of the label column; ids absent from the mapping
# become NaN.
data["label_string"] = data["label"].map(numeric_to_string_mapping)

# Pretrained tokenizer and encoder; the encoder is moved to the chosen device.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")
bert_model = bert_model.to(device)

def preprocess_text(text):
    """Embed ``text`` with the module-level BERT encoder.

    The input is tokenized (truncated to at most 128 tokens), moved to
    ``device`` and run through ``bert_model`` with gradient tracking
    disabled.  The hidden state at sequence position 0 (the ``[CLS]`` slot
    for BERT-style tokenizers) is used as the embedding.

    Args:
        text: Input accepted by ``tokenizer``; the script passes one string
            per call.

    Returns:
        A NumPy array on the CPU with singleton dimensions squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )

    # Every tensor the tokenizer produced must live on the model's device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = bert_model(**on_device).last_hidden_state
        first_token = hidden_states[:, 0, :]
        return first_token.squeeze().cpu().numpy()


# Embed every row (one encoder forward pass per text) and stack the vectors
# into a single 2-D feature matrix.
data["processed_text"] = data["text"].apply(preprocess_text)
embeddings = np.array(data["processed_text"].to_list())
labels = data["label"].values

# Stratified 80/20 split so each class keeps its proportion in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)

# Oversample the training split only; the test split keeps the original
# class distribution so the reported metrics are not inflated.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# "gpu_hist" requires a CUDA device, so fall back to the CPU "hist" method
# when the script is running on the CPU (see `device` above).
# NOTE: XGBoost >= 2.0 deprecates "gpu_hist" in favour of
# tree_method="hist" with device="cuda"; the older spelling is kept here
# because the script also passes the 1.x-era `use_label_encoder` flag.
clf = XGBClassifier(
    use_label_encoder=False,
    eval_metric="mlogloss",
    random_state=42,
    tree_method="gpu_hist" if device.type == "cuda" else "hist",
)
clf.fit(X_train, y_train)

# Evaluate on the untouched test split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


# Fine-tune BERT on the raw sentences.
#
# The previous version built the "text" column by joining the 768 embedding
# floats into a string, so BERT was trained on tokenized number literals
# (and, for the training split, on SMOTE-synthesised vectors) rather than on
# language.  Repeating the split with the same sample order, labels,
# test_size, random_state and stratify arguments reproduces the partition
# used for the embedding classifier, so both models are scored on the same
# held-out rows.  SMOTE cannot be applied to raw text, so the training split
# keeps its original class distribution.
text_train, text_test, text_y_train, text_y_test = train_test_split(
    data["text"].tolist(), labels, test_size=0.2, random_state=42, stratify=labels
)

train_dataset = Dataset.from_dict({
    "text": text_train,
    "label": text_y_train,
})

test_dataset = Dataset.from_dict({
    "text": text_test,
    "label": text_y_test,
})


def _tokenize_batch(batch):
    """Tokenize a batch of examples, padding/truncating to 128 tokens."""
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)


tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_test = test_dataset.map(_tokenize_batch, batched=True)


# Classification head sized to the number of emotion classes.
bert_finetune = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(numeric_to_string_mapping)).to(device)

# NOTE(review): recent transformers releases rename `evaluation_strategy`
# to `eval_strategy` — adjust if the installed version rejects this keyword.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=bert_finetune,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)


trainer.train()


# Score the fine-tuned model on the held-out sentences.
predictions = trainer.predict(tokenized_test)
final_preds = np.argmax(predictions.predictions, axis=1)
final_accuracy = accuracy_score(text_y_test, final_preds)
print(f"Fine-Tuned BERT Accuracy: {final_accuracy * 100:.2f}%")
