In [22]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report ,confusion_matrix,ConfusionMatrixDisplay

In [34]:
email_data = pd.read_csv("data/emails.csv")

# separating the text and category
X = email_data["text"]
y = email_data["spam"]

# splitting the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)

# Turning the data into numerical format using TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 1, stop_words="english", lowercase=True)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Changing the y variable to type int
y_train = y_train.astype("int")
y_test = y_test.astype("int")


4131    Subject: updated presentation  i added guadalu...
928     Subject: your in - home source of health infor...
4984    Subject: re : book order  julie ,  there are m...
498     Subject: all graphics software available , che...
1134    Subject: 25 mmg works wonders  how to save on ...
                              ...                        
3046    Subject: new resume  dear vince ,  i am so gra...
1725    Subject: research reporting  tani - are you ha...
4079    Subject: re : aiesec polska - eurolds 2000  cz...
2254    Subject: re : paper - request ( informs meetin...
2915    Subject: congratulations !  congratulations on...
Name: text, Length: 5155, dtype: object

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

# Create an instance of the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier()

# Train the classifier on the training data
gb_classifier.fit(X_train_vec, y_train)

In [33]:
# Predict probabilities for each class
predicted_probabilities = gb_classifier.predict_proba(X_test_vec)

# Define your custom threshold
new_threshold = 0.4

# Create adjusted predictions based on the threshold
y_pred_adjusted = np.array([1 if prob[1] > new_threshold else 0 for prob in predicted_probabilities])

# Calculate accuracy and print the classification report
accuracy = accuracy_score(y_test, y_pred_adjusted)

print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred_adjusted, target_names=['Spam', 'Not Spam']))

Accuracy: 0.9843
              precision    recall  f1-score   support

        Spam       0.99      0.99      0.99       449
    Not Spam       0.95      0.98      0.96       124

    accuracy                           0.98       573
   macro avg       0.97      0.98      0.98       573
weighted avg       0.98      0.98      0.98       573

