In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report ,confusion_matrix,ConfusionMatrixDisplay

In [2]:
email_data = pd.read_csv("data/emails.csv")

# separating the text and category
X = email_data["text"]
y = email_data["spam"]

# splitting the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)

# Turning the data into numerical format using TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 1, stop_words="english", lowercase=True)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Changing the y variable to type int
y_train = y_train.astype("int")
y_test = y_test.astype("int")


In [3]:
from sklearn.ensemble import RandomForestClassifier

# Create an instance of the RandomForestClassifier
random_forest = RandomForestClassifier()

# Train the RandomForestClassifier model
random_forest.fit(X_train_vec, y_train)


In [16]:
# Predict probabilities for each class
predicted_probabilities = random_forest.predict_proba(X_test_vec)

# Define your custom threshold
new_threshold = 0.445

# Create adjusted predictions based on the threshold
y_pred_adjusted = np.array([1 if prob[1] > new_threshold else 0 for prob in predicted_probabilities])

# Calculate accuracy and print the classification report
accuracy = accuracy_score(y_test, y_pred_adjusted)
print(f"Accuracy: {accuracy:.4f}")

print(classification_report(y_test, y_pred_adjusted, target_names=['Spam', 'Not Spam']))

Accuracy: 0.9948
              precision    recall  f1-score   support

        Spam       0.99      1.00      1.00       449
    Not Spam       1.00      0.98      0.99       124

    accuracy                           0.99       573
   macro avg       1.00      0.99      0.99       573
weighted avg       0.99      0.99      0.99       573

