In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample


df = pd.read_csv('C://Program Files/spam.csv')
df = df.dropna(subset=['Message', 'Category'])
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

df_spam = df[df['Category'] == 1]
df_ham = df[df['Category'] == 0]


if len(df_spam) < len(df_ham):
    df_spam = resample(df_spam, replace=True, n_samples=len(df_ham), random_state=42)
df = pd.concat([df_spam, df_ham])

df = df.sample(frac=1, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Category'], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")

def predict_email(email_text):
    email_tfidf = vectorizer.transform([email_text])
    prob = model.predict_proba(email_tfidf)[0][1]  
    prediction = "Spam" if prob > 0.3 else "Not Spam" 
    return prediction

email = input("Enter an email to classify: ")
print("Prediction:", predict_email(email))

Accuracy: 0.9798
Confusion Matrix:
[[938  17]
 [ 22 953]]
Classification Report:


Enter an email to classify:  abdullahsheikh1502@icloud.com


Prediction: Spam
