In [42]:
import pandas as pd

# Load the email feature table from disk.
df = pd.read_csv("data/emails.csv")

# Quick sanity checks: overall dimensions, a preview of the first rows,
# and the first ten column labels.
print("Shape:", df.shape)
print("\nFirst 5 rows:", df.head(), sep="\n")
print("\nColumn names (first 10):", df.columns[:10], sep="\n")


Shape: (5172, 3002)

First 5 rows:
  Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0         0         0   0    0           0  
3       0    0               0         0         0   0    0           0  
4       0    0               0         0         0   1    0           0  

[5 rows x 3002 columns]

Column names (first 10):
Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou'], dtype='object')

In [43]:
from sklearn.model_selection import train_test_split

# Features: every column except the row identifier ("Email No.") and the
# target ("Prediction").
X = df.drop(["Email No.", "Prediction"], axis=1)

# Target labels.
y = df["Prediction"]

print("Features shape:", X.shape)
print("Labels shape:", y.shape)

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Features shape: (5172, 3000)
Labels shape: (5172,)
Training set size: (4137, 3000)
Test set size: (1035, 3000)


In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression classifier on the training split
# (iteration cap raised to 1000). fit() returns the estimator itself.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Report overall accuracy, per-class metrics, and the confusion matrix.
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 0.9826086956521739

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       735
           1       0.96      0.98      0.97       300

    accuracy                           0.98      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035


Confusion Matrix:
[[722  13]
 [  5 295]]


In [45]:
import joblib

# Persist the fitted classifier to disk.
joblib.dump(model, "spam_classifier.pkl")
print("Model saved as spam_classifier.pkl")

# Round-trip check: reload the artifact that was just written.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
loaded_model = joblib.load("spam_classifier.pkl")

# Predict on the first test row. Selecting with a list (iloc[[0]]) yields a
# one-row DataFrame, so the column names the model was fitted with are kept.
# Converting the row to a bare NumPy array drops them, which makes
# scikit-learn warn that X does not have valid feature names.
sample = X_test.iloc[[0]]
pred = loaded_model.predict(sample)
print("Sample Prediction:", pred)


Model saved as spam_classifier.pkl
Sample Prediction: [0]




In [46]:
def predict_email(model, email_features):
    """Classify a single email and return a human-readable label.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``.
    email_features
        One row of features, passed straight through to ``model.predict``
        (e.g. a one-row DataFrame with the same columns used for training).

    Returns
    -------
    str
        ``"Spam"`` when the first predicted label equals 1, otherwise
        ``"Not Spam"``.
    """
    # Only the first prediction is inspected; the input is expected to be one row.
    first_label = model.predict(email_features)[0]
    if first_label == 1:
        return "Spam"
    return "Not Spam"


# Draw one test row (fixed seed, so the same row is picked on every run).
sample = X_test.sample(1, random_state=42)

# Look up the true label through the sampled row's index, then compare it
# with what the model predicts for that row.
print(
    "Actual Label:",
    "Spam" if y_test.loc[sample.index].iloc[0] == 1 else "Not Spam",
)
print("Predicted Label:", predict_email(model, sample))


Actual Label: Not Spam
Predicted Label: Not Spam


In [47]:
import pandas as pd

# Load the SMS dataset: tab-separated with no header row, so the two column
# names are supplied explicitly. This rebinds `df`, replacing whatever frame
# the name pointed to before this cell ran.
# NOTE(review): messages may contain unbalanced double quotes; with the
# parser's default quote handling such rows could be merged. Consider
# quoting=csv.QUOTE_NONE and confirm the row count against the raw file.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "text"],
)
print(df.head())


  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [48]:
from sklearn.model_selection import train_test_split

# Features: the raw message text column only.
X = df["text"]

# Target: encode the string labels as integers (ham -> 0, spam -> 1).
y = df["label"].map(
    {
        "ham": 0,
        "spam": 1,
    }
)

# Stratified 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)


Train size: (4457,) Test size: (1115,)


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Text -> TF-IDF vectors: English stop words removed, vocabulary capped at
# the 3000 highest-frequency terms.
vectorizer = TfidfVectorizer(stop_words="english", max_features=3000)

# Fit the vocabulary/IDF weights on the training text only, then apply the
# same fitted transform to the test text so both share one feature space.
# (Row counts follow the split; they are printed below rather than
# hard-coded in comments.)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("X_train_vec shape:", X_train_vec.shape)
print("y_train shape:", y_train.shape)

# Train Logistic Regression on the vectorized training text.
# Note: this rebinds `model`, replacing any estimator previously held by
# that name.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Evaluate on the held-out messages.
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


X_train_vec shape: (4457, 3000)
y_train shape: (4457,)
Accuracy: 0.9721973094170404
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [50]:
def predict_custom_email(text):
    """Classify a raw message string as ``"Spam"`` or ``"Not Spam"``.

    Relies on the notebook-level ``vectorizer`` and ``model`` objects: the
    text is wrapped in a one-element list, transformed with ``vectorizer``,
    and the first prediction from ``model`` decides the label.

    Parameters
    ----------
    text : str
        The message to classify.

    Returns
    -------
    str
        ``"Spam"`` when the predicted label equals 1, otherwise ``"Not Spam"``.
    """
    features = vectorizer.transform([text])
    label = model.predict(features)[0]
    if label == 1:
        return "Spam"
    return "Not Spam"

# Example: classify two hand-written messages, printing one label per line.
for example_text in (
    "Congratulations! You won a $1000 gift card. Click here to claim.",
    "Dear Hassan, please find attached the project report for tomorrow's meeting.",
):
    print(predict_custom_email(example_text))


Spam
Not Spam


In [51]:
import joblib

# Persist both artifacts needed for inference: the classifier and the fitted
# vectorizer (new text must go through vectorizer.transform before
# model.predict, so the two files belong together).
joblib.dump(model, "spam_model.pkl")
# Kept as the cell's final bare expression, so its return value is what the
# cell displays as output.
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']