In [34]:
# Import necessary libraries:
# - pandas: for data manipulation and analysis.
# - matplotlib.pyplot: for plotting and visualizing data.
# - seaborn: for enhanced data visualization.
# - RandomForestClassifier: for building a robust classification model.
# - train_test_split: for splitting data into training and testing sets.
# - Pipeline: for creating a streamlined workflow that combines preprocessing and model training.
# - TfidfVectorizer: for converting text data into numerical features using TF-IDF.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import  TfidfVectorizer

In [35]:
data = pd.read_csv('spam_ham_dataset.csv') # reading the dataset

In [36]:
data.head() # first 5 lines of the dataset

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [37]:
# Display a concise summary of the DataFrame 'data', including the number of entries, column data types, non-null counts, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [38]:
# Generate summary statistics for numerical columns in the DataFrame 'data', such as mean, standard deviation, min, max, and quartiles.
data.describe()

Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


In [39]:
data.head() # first 5 lines of the dataset

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [40]:
# Import modules for text preprocessing:
# - re: Regular expressions for text cleaning and pattern matching.
# - nltk: Natural Language Toolkit for natural language processing.
# - stopwords: Provides a list of common stopwords to filter out from text.
# - WordNetLemmatizer: Lemmatizes words to their base form for normalization.
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [41]:
# List of stopwords (manually defined)
stop_words = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
}


In [42]:
def preprocess_text_re(text): # function to preprocess text
    import re
    
    # Remove non-alphanumeric characters (retain spaces)
    text = re.sub(r"[^\w\s]", " ", text)
    
    # Normalize spaces
    text = re.sub(r"\s+", " ", text)
    
    # Convert to lowercase
    text = text.strip().lower()
    
    return text

In [43]:
# Apply the custom text preprocessing function to each entry in the 'text' column,
# cleaning and normalizing the text data before model training.
data['text'] = data['text'].apply(preprocess_text_re) 

# Creating The Model


In [44]:
data.head() # first 5 lines of the dataset

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,subject enron methanol meter 988291 this is a ...,0
1,2349,ham,subject hpl nom for january 9 2001 see attache...,0
2,3624,ham,subject neon retreat ho ho ho we re around to ...,0
3,4685,spam,subject photoshop windows office cheap main tr...,1
4,2030,ham,subject re indian springs this deal is to book...,0


In [45]:
data = data.drop(columns='Unnamed: 0') # removing column Unnamed: 0

In [46]:
# Extract the features and labels from the DataFrame.
# 'X' contains the email text data, and 'y' contains the numerical labels (e.g., 0 for legit, 1 for spam).
X = data['text']
y= data['label_num']

In [47]:
# Split the dataset into training and testing sets with 33% of the data reserved for testing,
# ensuring reproducibility with a random state of 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [48]:
# Create a pipeline that first converts raw text into TF-IDF features using TfidfVectorizer,
# then trains a RandomForestClassifier on these features for classification.
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])

In [49]:
# Train the pipeline on the training data, fitting both the TF-IDF vectorizer and the RandomForestClassifier.
pipe.fit(X_train,y_train)

In [50]:
# Use the trained pipeline to predict the labels for the test set.
prediction = pipe.predict(X_test)

In [51]:
# 'prediction' stores the predicted labels for the test set generated by the trained pipeline.
prediction

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [52]:
# Import functions to generate a detailed classification report and confusion matrix for evaluating model performance.
from sklearn.metrics import classification_report, confusion_matrix

In [53]:
# Print the classification report to evaluate the model's performance,
# comparing the predicted labels ('prediction') against the actual labels ('y_test').
print('classification report ', classification_report(prediction,y_test))

classification report                precision    recall  f1-score   support

           0       0.99      0.98      0.98      1257
           1       0.94      0.96      0.95       450

    accuracy                           0.97      1707
   macro avg       0.96      0.97      0.97      1707
weighted avg       0.97      0.97      0.97      1707



In [54]:
# Import the joblib module, which is used for saving and loading the trained model pipeline.
import joblib

In [55]:
# Save the trained pipeline (including the vectorizer and classifier) to a file named 'text_classifier_pipeline.pkl' for later use.
joblib.dump(pipe, 'text_classifier_pipeline.pkl')

['text_classifier_pipeline.pkl']

In [56]:
# shows first 5 lines of the dataset 
data.head()

Unnamed: 0,label,text,label_num
0,ham,subject enron methanol meter 988291 this is a ...,0
1,ham,subject hpl nom for january 9 2001 see attache...,0
2,ham,subject neon retreat ho ho ho we re around to ...,0
3,spam,subject photoshop windows office cheap main tr...,1
4,ham,subject re indian springs this deal is to book...,0


In [57]:
# Access the fourth element (index 3) of the 'text' column in the DataFrame 'data'
data['text'][3]

'subject photoshop windows office cheap main trending abasements darer prudently fortuitous undergone lighthearted charm orinoco taster railroad affluent pornographic cuvier irvin parkhouse blameworthy chlorophyll robed diagrammatic fogarty clears bayda inconveniencing managing represented smartness hashish academies shareholders unload badness danielson pure caffein spaniard chargeable levin'