In [3]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [4]:
import os

import nltk

# Fetch the NLTK resources used by the preprocessing step: 'punkt' backs
# word_tokenize and 'stopwords' backs stopwords.words('english').
# The target folder is derived from the environment rather than a
# user-specific absolute path: %APPDATA%\nltk_data when APPDATA is set
# (Windows), otherwise ~/nltk_data.
# NOTE(review): both are expected to be on NLTK's default search path - confirm
# for the NLTK version in use.
nltk_data_dir = os.path.join(os.environ.get('APPDATA', os.path.expanduser('~')), 'nltk_data')

nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)

[nltk_data] Downloading package punkt to
[nltk_data]     C:/Users/user/AppData/Roaming/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:/Users/user/AppData/Roaming/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
import nltk
import os

# Make sure NLTK searches the folder the resources were downloaded into.
# This extends nltk.data.path (NLTK's in-process search list); it does not
# set an environment variable.
nltk_data_dir = os.path.join(os.environ.get('APPDATA', os.path.expanduser('~')), 'nltk_data')
if nltk_data_dir not in nltk.data.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    nltk.data.path.append(nltk_data_dir)

# Download again without an explicit target; when the packages are already
# present NLTK just reports them as up-to-date.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:

def load_emails(data_directory):
    """Read every ``.txt`` file in ``data_directory`` into a labelled DataFrame.

    A file is labelled ``'spam'`` when its name contains the substring
    ``'spam'`` and ``'ham'`` otherwise. Files are visited in sorted name
    order: ``os.listdir`` returns entries in arbitrary order, and the row
    order determines what a seeded train/test split selects, so sorting keeps
    results reproducible across machines and file systems.

    Parameters
    ----------
    data_directory : str or path-like
        Folder to scan. Entries whose name does not end in ``.txt`` are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``'email'`` (the file's text) and
        ``'label'`` (``'spam'`` or ``'ham'``).
    """
    emails = []
    labels = []
    for filename in sorted(os.listdir(data_directory)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(data_directory, filename)
        # errors='ignore' silently drops bytes that are not valid UTF-8.
        with open(path, 'r', encoding='utf-8', errors='ignore') as file:
            emails.append(file.read())
        labels.append('spam' if 'spam' in filename else 'ham')
    return pd.DataFrame({'email': emails, 'label': labels})


In [5]:
# Folder holding the raw e-mail corpus; load_emails reads its .txt files.
data_directory = r"C:\Users\user\Downloads\training emails"

# Build the DataFrame ('email' and 'label' columns), then preview the first
# five rows as a quick sanity check.
emails_df = load_emails(data_directory)
emails_df.head(n=5)

Unnamed: 0,email,label
0,Subject: christmas tree farm pictures\n,ham
1,"Subject: vastar resources , inc .\ngary , prod...",ham
2,Subject: calpine daily gas nomination\n- calpi...,ham
3,Subject: re : issue\nfyi - see note below - al...,ham
4,Subject: meter 7268 nov allocation\nfyi .\n- -...,ham


In [6]:
def preprocess_text(text):
    """Normalise one e-mail body into a space-separated string of content words.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are not purely alphabetic (numbers, punctuation, mixed tokens) or that are
    English stop words are dropped.

    Parameters
    ----------
    text : str
        Raw e-mail text.

    Returns
    -------
    str
        The surviving tokens, in their original order, joined by single spaces.
    """
    # Load the stop-word list once per call and keep it in a set. The previous
    # version re-read the list for every token and scanned it linearly.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in english_stopwords
    )

In [None]:
# Clean every message body and store the result next to the raw text.
emails_df['processed_email'] = emails_df['email'].map(preprocess_text)

In [None]:
# Turn the cleaned text into TF-IDF features, capping the vocabulary at 3000 terms.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so IDF weights also reflect the held-out messages -
# consider fitting on the training portion only.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X = (
    tfidf_vectorizer
    .fit_transform(emails_df['processed_email'])
    .toarray()  # convert the sparse result to a dense array
)

# Target labels ('spam' / 'ham'), aligned row-for-row with X.
y = emails_df['label']

In [None]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Support-vector classifier with a linear kernel, trained on the TF-IDF features.
svm_model = SVC(kernel='linear')
svm_model.fit(X=X_train, y=y_train)

In [None]:
# Multinomial Naive Bayes baseline, trained on the same features as the SVM.
nb_model = MultinomialNB()
nb_model.fit(X=X_train, y=y_train)

In [None]:
# Score the SVM on the held-out messages: classification report plus overall accuracy.
svm_predictions = svm_model.predict(X_test)
print(
    "SVM Model",
    classification_report(y_true=y_test, y_pred=svm_predictions),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=svm_predictions))

In [None]:
# Score Naive Bayes on the held-out messages: classification report plus overall accuracy.
nb_predictions = nb_model.predict(X_test)
print(
    "Naive Bayes Model",
    classification_report(y_true=y_test, y_pred=nb_predictions),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=nb_predictions))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Row/column order of the matrix. The same list is handed to confusion_matrix
# and used for the tick labels, so the axes always describe the right cells
# and the matrix stays 2x2 even if a class is missing from the test split.
class_labels = ['ham', 'spam']

svm_predictions = svm_model.predict(X_test)
svm_cm = confusion_matrix(y_test, svm_predictions, labels=class_labels)

# Rows are the actual classes, columns the predicted ones.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    svm_cm,
    annot=True,
    fmt='d',  # cell counts are integers
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - SVM Model')
plt.show()