In [15]:
import pandas as pd
import os
import re

def read_spam():
    """Load the spam emails stored under ``./enron1/spam``.

    Returns:
        The result of ``read_category`` for the ``'spam'`` label.
    """
    return read_category('spam', './enron1/spam')

def read_ham():
    """Load the ham (non-spam) emails stored under ``./enron1/ham``.

    Returns:
        The result of ``read_category`` for the ``'ham'`` label.
    """
    return read_category('ham', './enron1/ham')

def read_category(category, directory):
    """Read every ``.txt`` file in *directory* and label it with *category*.

    Files are opened in text mode with the platform's default encoding.
    A file whose content cannot be decoded or read is reported on stdout
    as ``skipped <filename>`` and left out of the result.

    Args:
        category: Label stored under the ``'category'`` key of each record.
        directory: Path of the folder to scan (not recursive).

    Returns:
        A list of dicts with the keys ``'name'`` (file name), ``'content'``
        (full file text) and ``'category'``, ordered by file name.

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be opened.
    """
    emails = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # dataset (and any seeded split made from it) reproducible across runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), 'r') as fp:
            try:
                content = fp.read()
            except (UnicodeDecodeError, OSError):
                # Best-effort loading: skip unreadable files, but do not
                # swallow KeyboardInterrupt/SystemExit or programming errors
                # the way a bare ``except:`` would.
                print(f'skipped {filename}')
                continue
        emails.append({'name': filename, 'content': content, 'category': category})
    return emails
    
def preprocessor(e):
    """Normalise raw email text before tokenisation.

    Every character outside the ASCII letters ``a-z``/``A-Z`` is replaced
    by a single space (one space per character, so the length is kept),
    and the result is then lowercased.

    Args:
        e: The text to clean.

    Returns:
        The cleaned, lowercased text.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', e)
    return letters_only.lower()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# --- Load the dataset -------------------------------------------------------
# The data must be loaded *before* it is used below; previously `df` was
# only created after the train/test split referenced it, which raises a
# NameError on a fresh top-to-bottom run.
ham = read_ham()
spam = read_spam()

df_ham = pd.DataFrame.from_records(ham)
df_spam = pd.DataFrame.from_records(spam)

# Stack ham and spam into one frame with a fresh 0..n-1 index.
df = pd.concat([df_ham, df_spam], ignore_index=True)

print(df.head())

# --- Vectorize and split ----------------------------------------------------
# The CountVectorizer converts each text sample into a vector of token
# counts (one column per vocabulary word), after running our preprocessor.
vectorizer = CountVectorizer(preprocessor=preprocessor)

# Use train_test_split to split the dataset into a train dataset and a test dataset.
# random_state is fixed so the 80/20 split is the same on every run.
X = df['content']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the vocabulary from the training data only, then reuse that same
# vocabulary for the test data so no information leaks from the test set.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# --- Train and predict ------------------------------------------------------
# Use the LogisticRegression model to fit to the train dataset.
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Then generate the predictions.
y_pred = model.predict(X_test_vec)

# --- Inspect the learned coefficients ---------------------------------------
# Vocabulary words, in the same order as the columns of the vectorized data.
features = vectorizer.get_feature_names_out()

# You can access these numbers known as "coefficients" from the coef_ property of the model
# We will be looking at coef_[0] which represents the importance of each feature.
# For a two-class problem scikit-learn orders the classes alphabetically
# ('ham', 'spam'), so a positive coefficient pushes towards 'spam' and a
# negative one towards 'ham'.
importance = model.coef_[0]

# Get top 10 positive and negative features
top_positive_features = sorted(zip(importance, features), reverse=True)[:10]
top_negative_features = sorted(zip(importance, features))[:10]

# --- Report -----------------------------------------------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

print("Top 10 Positive Features (Spam):")
for coef, feature in top_positive_features:
    print(feature, coef)

print("\nTop 10 Negative Features (Ham):")
for coef, feature in top_negative_features:
    print(feature, coef)



skipped 2248.2004-09-23.GP.spam.txt
skipped 2526.2004-10-17.GP.spam.txt
skipped 2698.2004-10-31.GP.spam.txt
skipped 4566.2005-05-24.GP.spam.txt
                             name  \
0  0001.1999-12-10.farmer.ham.txt   
1  0002.1999-12-13.farmer.ham.txt   
2  0003.1999-12-14.farmer.ham.txt   
3  0004.1999-12-14.farmer.ham.txt   
4  0005.1999-12-14.farmer.ham.txt   

                                             content category  
0            Subject: christmas tree farm pictures\n      ham  
1  Subject: vastar resources , inc .\ngary , prod...      ham  
2  Subject: calpine daily gas nomination\n- calpi...      ham  
3  Subject: re : issue\nfyi - see note below - al...      ham  
4  Subject: meter 7268 nov allocation\nfyi .\n- -...      ham  
Accuracy: 0.9748549323017408
Confusion Matrix:
 [[717  12]
 [ 14 291]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.98      0.98       729
        spam       0.96      0.95      0.96  