## Naive Bayes - Word Embeddings model

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Build the token -> vector lookup table from the pre-trained GloVe text file
glove_file = "glove.6B.200d.txt"
word_embeddings = {}
with open(glove_file, "r", encoding="utf-8") as glove_handle:
    for row in glove_handle:
        # First whitespace-separated field is the token; the remaining
        # fields are parsed as its float32 coefficients.
        fields = row.split()
        word_embeddings[fields[0]] = np.asarray(fields[1:], dtype=np.float32)

In [None]:
# Read the dataset CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="balanced.csv")

In [None]:
# Pick the input column (X) and the label column (y) from the DataFrame
X, y = data["clean_text"], data["condition_label"]

In [None]:
# Convert a sentence into a feature vector using GloVe embeddings
def sentence_to_vector(words, embeddings=None, dim=None):
    """Return the mean embedding vector of the tokens in *words*.

    Args:
        words: Either a string, which is split on whitespace into tokens,
            or an iterable of string tokens. ``None`` and float values
            (presumably how missing text arrives from pandas -- confirm
            against the dataset) are treated as an empty sentence.
        embeddings: Mapping from token to vector. Defaults to the
            module-level ``word_embeddings``.
        dim: Length of the zero vector returned when no token is known.
            Defaults to the length of a vector in *embeddings*, or 200
            when *embeddings* is empty.

    Returns:
        The element-wise mean of the vectors of every token found in
        *embeddings*, or a zero vector of length *dim* when none is found.
    """
    if embeddings is None:
        embeddings = word_embeddings

    if isinstance(words, str):
        # Iterating a str yields single characters, so without this split
        # every character would be looked up instead of every word.
        words = words.split()
    elif words is None or isinstance(words, float):
        # Missing text: nothing to look up.
        words = ()

    vectors = [embeddings[word] for word in words if word in embeddings]
    if vectors:
        return np.mean(vectors, axis=0)

    if dim is None:
        # Match the width of the loaded vectors so all rows stack cleanly.
        dim = len(next(iter(embeddings.values()))) if embeddings else 200
    return np.zeros(dim)

# Replace every entry of X by its sentence vector and stack them into one array
X = np.array(list(map(sentence_to_vector, X)))

In [None]:
# Split data into training and testing sets, then scale.
# The split happens first so the scaler is fitted on the training rows only;
# fitting it on the full dataset would let the test rows' min/max influence
# the preprocessing (data leakage). The split is done with the same
# arguments as before, and X itself is left unscaled.
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# Reuse the training min/max; scaled test values may fall outside [0, 1].
X_test = scaler.transform(X_test)

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for every row of the test split
y_pred = classifier.predict(X=X_test)

In [None]:
# Score the test predictions against the true labels and print to 5 decimals
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.5f}")

In [None]:
# Build the classification report for the test predictions and print it
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Define and display confusion matrix
def plot_confusion_matrix(y_test, y_pred, labels):
    """Draw the confusion matrix of *y_pred* against *y_test* on a new figure.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        labels: Labels passed to ``confusion_matrix`` and used as the
            display labels of the plot.

    Note:
        Updates the global ``plt.rcParams`` ('font.size' and
        'font.monospace'); the change is not reverted afterwards.
    """
    # Create the 6x6 canvas before touching rcParams, as the original did.
    _, axes = plt.subplots(figsize=(6, 6))
    plt.rcParams['font.size'] = 13
    plt.rcParams['font.monospace'] = 'Computer Modern Typewriter'

    matrix = confusion_matrix(y_test, y_pred, labels=labels)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels)
    display.plot(ax=axes, cmap="binary", colorbar=False)

plot_confusion_matrix(y_test=y_test, y_pred=y_pred, labels=classifier.classes_)