In [16]:
import pandas as pd  # Import pandas for data manipulation and analysis
from sklearn.feature_extraction.text import TfidfVectorizer  # Import TfidfVectorizer to convert text documents into TF-IDF feature vectors
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier to build an ensemble classification model for spam detection
from sklearn.model_selection import train_test_split  # Import train_test_split to split the dataset into training and testing sets
import joblib  # Import joblib for saving and loading the trained model pipeline


In [17]:
# Load the CSV dataset
data = pd.read_csv('spam_ham_dataset.csv')

In [18]:
# Check dataset columns
print(data.head())

   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   

   label_num  
0          0  
1          0  
2          0  
3          1  
4          0  


In [19]:
# Define a text preprocessor
def preprocess_text(text):
    # Preprocessing: convert to lowercase
    return text.lower()

In [20]:
# Vectorize the text using the TF-IDF representation.
# TfidfVectorizer converts raw text documents into a matrix of TF-IDF features.
# Here, we remove common English stop words and apply a custom preprocessor (preprocess_text) for additional text cleaning.
vectorizer = TfidfVectorizer(stop_words='english', preprocessor=preprocess_text)

# Fit the vectorizer on the 'text' column of the dataset and transform the text into a TF-IDF feature matrix.
X = vectorizer.fit_transform(data['text'])

# Extract the target variable (labels) from the dataset.
# 'label_num' contains numeric labels where 0 represents Ham (legitimate messages) and 1 represents Spam.
y = data['label_num']  # 0 = Ham, 1 = Spam


In [21]:
# Split the data into training and test sets, and train the RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Instantiate a RandomForestClassifier with a fixed random state (42) to ensure reproducibility of results.
rf_model = RandomForestClassifier(random_state=42)
# Fit (train) the RandomForestClassifier on the training dataset (features: X_train, labels: y_train).
rf_model.fit(X_train, y_train)


In [23]:
import pickle  # Import the pickle module for serializing and deserializing Python objects

# Open a file named "vectorizer.pkl" in write-binary mode ('wb')
with open("vectorizer.pkl", "wb") as f:
    # Serialize (save) the trained vectorizer object to the file using pickle
    pickle.dump(vectorizer, f)


In [24]:
# Define the predict_spam function that uses the model and vectorizer to classify a message
def predict_spam(message, model, vectorizer):
    """
    Preprocesses the message, vectorizes it, and makes a prediction.
    Returns the label ("Ham" or "Spam") and the probability vector.
    """
    # Preprocess the message using the custom preprocess_text function (e.g., converting to lowercase)
    processed_msg = preprocess_text(message)
    
    # Transform the preprocessed message into a numerical feature vector using the vectorizer
    message_vec = vectorizer.transform([processed_msg])
    
    # Use the model to predict the label of the message; [0] selects the first (and only) prediction
    prediction = model.predict(message_vec)[0]
    # Retrieve the probability distribution for the prediction; [0] selects the first (and only) probability vector
    prob = model.predict_proba(message_vec)[0]
    
    # Define a mapping for numeric labels to human-readable labels:
    # 0 corresponds to "Ham" (legitimate message) and 1 corresponds to "Spam"
    labels = {0: "Ham", 1: "Spam"}
    
    # Print the original message for debugging purposes
    print(f"Message: {message}\n")
    # Print the predicted label using the labels mapping
    print(f"Prediction: {labels[prediction]}")
    # Print the probabilities for both Ham and Spam with four decimal places precision
    print(f"Ham Probability: {prob[0]:.4f}, Spam Probability: {prob[1]:.4f}")
    
    # Return the predicted label (as a string) and the probability vector
    return labels[prediction], prob


In [25]:
# Save the trained Random Forest model pipeline to a file named 'test_classifier_pipeline.pkl'
joblib.dump(rf_model, 'test_classifier_pipeline.pkl')


['test_classifier_pipeline.pkl']

In [26]:
# Example test message
test_message = (
    "Subject: CONGRATULATIONS! YOU’VE WON $1,000,000! Dear Winner, "
    "We are pleased to inform you that your email has been selected in our annual online lottery. "
    "Your email address has been awarded $1,000,000 in cash. To claim your winnings, send your full name, "
    "address, phone number, and bank details to our claims department at: **lotteryclaims@fakeprize.com**. "
    "Failure to respond within 48 hours will result in forfeiture of your winnings. "
    "Best regards, John Doe Lottery Claims Manager"
)


In [27]:
# Call the predict_spam function with a test message, the trained Random Forest model (rf_model), and the vectorizer.
# This function preprocesses the test message, converts it into a numerical feature vector, and then predicts its category (spam or ham)
# along with the corresponding probability scores.
predict_spam(test_message, rf_model, vectorizer)


Message: Subject: CONGRATULATIONS! YOU’VE WON $1,000,000! Dear Winner, We are pleased to inform you that your email has been selected in our annual online lottery. Your email address has been awarded $1,000,000 in cash. To claim your winnings, send your full name, address, phone number, and bank details to our claims department at: **lotteryclaims@fakeprize.com**. Failure to respond within 48 hours will result in forfeiture of your winnings. Best regards, John Doe Lottery Claims Manager

Prediction: Spam
Ham Probability: 0.3200, Spam Probability: 0.6800


('Spam', array([0.32, 0.68]))

In [28]:
from sklearn.metrics import classification_report

y_pred = rf_model.predict(X_test)

# Ensure that y_test and y_pred are available from your model predictions
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       742
           1       0.97      0.99      0.98       293

    accuracy                           0.99      1035
   macro avg       0.98      0.99      0.98      1035
weighted avg       0.99      0.99      0.99      1035

