# Email Classification with Logistic Regression

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

## Load the Dataset

In [2]:
# Load the dataset
data = pd.read_csv('email_dataset.csv')
data.head()

Unnamed: 0,text,label
0,Subject: enron methanol ; meter # : 988291\r\n...,ham
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",ham
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",ham
3,"Subject: photoshop , windows , office . cheap ...",spam
4,Subject: re : indian springs\r\nthis deal is t...,ham


## Text Cleaning Function

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

data['clean_text'] = data['text'].apply(clean_text)
data.head()

Unnamed: 0,text,label,clean_text
0,Subject: enron methanol ; meter # : 988291\r\n...,ham,subject enron methanol meter 988291 this is a ...
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",ham,subject hpl nom for january 9 2001 see attache...
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",ham,subject neon retreat ho ho ho we re around to ...
3,"Subject: photoshop , windows , office . cheap ...",spam,subject photoshop windows office cheap main tr...
4,Subject: re : indian springs\r\nthis deal is t...,ham,subject re indian springs this deal is to book...


## TF-IDF Feature Extraction

In [4]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(data['clean_text'])
y = data['label']

print("Feature Matrix Shape:", X.shape)

Feature Matrix Shape: (5171, 5000)


## Split the Data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (4136, 5000)
Test set size: (1035, 5000)


## Train Logistic Regression Model

In [6]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print("Model training completed.")

Model training completed.


## Model Evaluation

In [7]:
# Predictions
y_pred = model.predict(X_test)

# Evaluation Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9874396135265701
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       742
        spam       0.98      0.98      0.98       293

    accuracy                           0.99      1035
   macro avg       0.98      0.99      0.98      1035
weighted avg       0.99      0.99      0.99      1035



## Save Model and Vectorizer

In [8]:
joblib.dump(model, 'email_classifier_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.
