## Naive Bayes - TF-IDF model

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
# Load the dataset
# The path is relative, so the file is looked up in the current working directory.
# Later cells read the `clean_text` and `condition_label` columns from this frame.
data = pd.read_csv('balanced.csv')

In [None]:
# Turn label into discrete variable
# LabelEncoder maps each distinct label value to an integer code; the original
# values remain available in `label_encoder.classes_` (array index == code).
label_encoder = LabelEncoder()
# Overwrites the column in place, so `data` no longer holds the original labels.
data['condition_label'] = label_encoder.fit_transform(data['condition_label'])

In [None]:
# Split data into training and testing sets
# read_csv parses empty fields as NaN, which TfidfVectorizer rejects as an
# invalid document; map them to empty strings so a blank `clean_text` row
# cannot abort the run. This is a no-op when the column has no missing values.
X = data['clean_text'].fillna('')
y = data['condition_label']
# Hold out 20% for testing with a fixed seed for reproducibility. Stratifying
# on the label keeps the class proportions of the full dataset in both splits,
# so per-class metrics are computed on comparable support.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12, stratify=y
)

In [None]:
# Convert text data into numerical features using TfidfVectorizer
# NOTE: despite the `_counts` suffix, these matrices hold TF-IDF weights, not raw counts.
vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training split only ...
X_train_counts = vectorizer.fit_transform(X_train)
# ... and reuse them for the test split, so no test-set statistics leak into the features.
X_test_counts = vectorizer.transform(X_test)

In [None]:
# Train Naive Bayes classifier
# MultinomialNB is created with its default parameters.
classifier = MultinomialNB()
# Fit on the training features and the integer-encoded training labels.
classifier.fit(X_train_counts, y_train)

In [None]:
# Make predictions on the test data
# `y_pred` holds one predicted (integer-encoded) label per row of `X_test_counts`.
y_pred = classifier.predict(X_test_counts)

In [None]:
# Calculate accuracy
# Fraction of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Print classification report
# Label each row with the original condition name instead of the integer code
# produced by the LabelEncoder (code i corresponds to label_encoder.classes_[i]).
# Passing every encoded class explicitly keeps rows and names aligned even if
# a class happens to be absent from the test split.
class_names = [str(name) for name in label_encoder.classes_]
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

In [None]:
# Define and display confusion matrix
def plot_confusion_matrix(y_test, y_pred, labels):
    """Draw the confusion matrix of `y_pred` against `y_test` on a new figure.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        labels: Class labels; they fix the row/column order of the matrix
            and are reused as the tick labels of the plot.

    Note:
        Updates the global ``plt.rcParams`` (font size and monospace font
        list), so those settings persist for figures created afterwards.
    """
    matrix = confusion_matrix(y_test, y_pred, labels=labels)
    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=labels,
    )
    _, axes = plt.subplots(figsize=(6, 6))
    font_settings = {
        'font.size': 13,
        'font.monospace': 'Computer Modern Typewriter',
    }
    plt.rcParams.update(font_settings)
    # Black-and-white colour map, without the colour bar.
    matrix_display.plot(ax=axes, cmap="binary", colorbar=False)

# Plot the matrix using the class ordering learned by the fitted classifier.
plot_confusion_matrix(y_test, y_pred, labels=classifier.classes_)