# E-mail Spam Ham Detection

#### Using BOW, TFIDF Machine Learning Algorithms
(order very imp***)
1. Preprocessing And Cleaning
2. Train Test Split
3. BOW And TF-IDF (Sentences--->vectors) {Preventing Data Leakage}
4. Trained Our Models

In [38]:
import pandas as pd
messages = pd.read_csv('dataset/spam.csv', encoding='latin1', usecols=[0, 1] , names=["label", "message"])

In [39]:
messages

Unnamed: 0,label,message
0,v1,v2
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...
5569,ham,Will Ì_ b going to esplanade fr home?
5570,ham,"Pity, * was in mood for that. So...any other s..."
5571,ham,The guy did some bitching but I acted like i'd...


In [40]:
## Data Cleaning And Preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wnl=WordNetLemmatizer()

In [42]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[wnl.lemmatize(word, pos='v') for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [43]:
corpus[0:4]

['v',
 'go jurong point crazy available bugis n great world la e buffet cine get amore wat',
 'ok lar joke wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply']

In [44]:
## Output Features
y=pd.get_dummies(messages['label'])
print(y.astype(int))
y=y.iloc[:,1].values.astype(int)
y

      ham  spam  v1
0       0     0   1
1       1     0   0
2       1     0   0
3       0     1   0
4       1     0   0
...   ...   ...  ..
5568    0     1   0
5569    1     0   0
5570    1     0   0
5571    1     0   0
5572    1     0   0

[5573 rows x 3 columns]


array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0], shape=(5573,))

In [45]:
import joblib

joblib.dump(wnl, 'models/wnl.pkl')

['models/wnl.pkl']

## method 1:

In [46]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(corpus,y,test_size=0.20)
len(X_train),len(y_train)

(4458, 4458)

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier
)

from xgboost import XGBClassifier
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42),
    "SVM": LinearSVC(class_weight='balanced'),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Bagging": BaggingClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

scores = []
best_model = None
best_score = 0

for name, clf in classifiers.items():
    print(f"\n🧠 {name}")
    pipe = Pipeline([
        ('cv', CountVectorizer(max_features=3000, stop_words='english', ngram_range=(1, 2))),
        ('clf', clf)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    if acc > best_score:
        best_score = acc
        best_model = (name, pipe)

print("\n🏆 Best Model:")
print(f"{best_model[0]} with accuracy: {best_score:.4f}")


🧠 Naive Bayes
Accuracy: 0.9820627802690582
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       964
           1       0.97      0.89      0.93       151

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115


🧠 Logistic Regression
Accuracy: 0.9838565022421525
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.97      0.91      0.94       151

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


🧠 Random Forest
Accuracy: 0.9757847533632287
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       964
           1       0.96      0.85      0.91       151

    accuracy                           0

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [49]:
best_model

('Logistic Regression',
 Pipeline(steps=[('cv',
                  CountVectorizer(max_features=3000, ngram_range=(1, 2),
                                  stop_words='english')),
                 ('clf',
                  LogisticRegression(class_weight='balanced', max_iter=1000))]))

In [50]:
best_model[1].fit(X_train, y_train)
y_pred = best_model[1].predict(X_test)

In [51]:
test_text = "Congratulations! You've been selected to receive a free iPhone 🎉 Click here to claim your prize now!"
test_text=re.sub('[^a-zA-z]',' ',test_text).lower().split()
test_text=[wnl.lemmatize(word, pos='v') for word in test_text if not word in stopwords.words('english')]
test_text=[' '.join(test_text)]
result = best_model[1].predict(test_text)[0]
print(result)
print("Spam" if result == 1 else "Ham")

1
Spam


In [52]:
test_text = "Hey, are we still meeting at 5 PM today? Let me know if anything changes."
test_text=re.sub('[^a-zA-z]',' ',test_text).lower().split()
test_text=[wnl.lemmatize(word, pos='v') for word in test_text if not word in stopwords.words('english')]
test_text=[' '.join(test_text)]
result = best_model[1].predict(test_text)[0]
print(result)
print("Spam" if result == 1 else "Ham")

0
Ham


## method 2:
## Create Bag Of Words

In [53]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(corpus,y,test_size=0.20)
len(X_train),len(y_train)

(4458, 4458)

In [54]:
## Create the Bag OF Words model
from sklearn.feature_extraction.text import CountVectorizer
## for Binary BOW enable binary=True
cv=CountVectorizer(max_features=3000,ngram_range=(1,2), stop_words='english')
X_train=cv.fit_transform(X_train).toarray()
X_test=cv.transform(X_test).toarray()

In [55]:
cv.vocabulary_

{'excellent': np.int64(744),
 'plan': np.int64(1869),
 'want': np.int64(2825),
 'come': np.int64(428),
 'online': np.int64(1794),
 'want come': np.int64(2827),
 'come online': np.int64(435),
 'sleep': np.int64(2296),
 'surf': np.int64(2445),
 'sleep surf': np.int64(2300),
 'ur': np.int64(2697),
 'wall': np.int64(2815),
 'important': np.int64(1185),
 'customer': np.int64(554),
 'service': np.int64(2223),
 'announcement': np.int64(71),
 'freephone': np.int64(844),
 'customer service': np.int64(557),
 'service announcement': np.int64(2224),
 'romantic': np.int64(2118),
 'better': np.int64(193),
 'dun': np.int64(696),
 'disturb': np.int64(648),
 'liao': np.int64(1328),
 'wake': np.int64(2811),
 'msg': np.int64(1651),
 'lor': np.int64(1390),
 'disturb liao': np.int64(649),
 'claim': np.int64(385),
 'shop': np.int64(2251),
 'spree': np.int64(2370),
 'ppm': np.int64(1915),
 'shop spree': np.int64(2252),
 'issue': np.int64(1226),
 'right': np.int64(2103),
 'ill': np.int64(1175),
 'fix': np.int

In [56]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: "%.3g" % x))
X_train

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [57]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model=MultinomialNB().fit(X_train,y_train)
y_pred=spam_detect_model.predict(X_test)

In [58]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_test,y_pred)

0.989237668161435

In [59]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       983
           1       0.96      0.95      0.95       132

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [60]:
joblib.dump(spam_detect_model, 'models/bow_model.pkl')
joblib.dump(cv, 'models/cv.pkl')

['models/cv.pkl']

In [61]:
test_text = "Congratulations! You've been selected to receive a free iPhone 🎉 Click here to claim your prize now!"
test_text=re.sub('[^a-zA-z]',' ',test_text).lower().split()
test_text=[wnl.lemmatize(word, pos='v') for word in test_text if not word in stopwords.words('english')]
test_text=[' '.join(test_text)]
test_text = cv.transform(test_text).toarray()
result = spam_detect_model.predict(test_text)[0]
print(result)
print("Spam" if result == 1 else "Ham")

1
Spam


In [62]:
test_text = "Hey, are we still meeting at 5 PM today? Let me know if anything changes."
test_text=re.sub('[^a-zA-z]',' ',test_text).lower().split()
test_text=[wnl.lemmatize(word, pos='v') for word in test_text if not word in stopwords.words('english')]
test_text=[' '.join(test_text)]
test_text = cv.transform(test_text).toarray()
result = spam_detect_model.predict(test_text)[0]
print(result)
print("Spam" if result == 1 else "Ham")

0
Ham


## method 3:

### Creating The TF-IDF Model

In [63]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(corpus,y,test_size=0.20)

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X_train=tv.fit_transform(X_train).toarray()
X_test=tv.transform(X_test).toarray()

In [65]:
X_train

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [66]:
tv.vocabulary_

{'ya': np.int64(2456),
 'sorry': np.int64(1966),
 'hurt': np.int64(987),
 'ah': np.int64(32),
 'look': np.int64(1187),
 'like': np.int64(1145),
 'ill': np.int64(1002),
 'prob': np.int64(1657),
 'send': np.int64(1831),
 'laptop': np.int64(1096),
 'get': np.int64(775),
 'fix': np.int64(696),
 'cuz': np.int64(476),
 'problem': np.int64(1659),
 'look like': np.int64(1189),
 'tomorrow': np.int64(2164),
 'build': np.int64(225),
 'might': np.int64(1309),
 'earlier': np.int64(585),
 'call': np.int64(237),
 'reach': np.int64(1696),
 'know': np.int64(1080),
 'change': np.int64(310),
 'drive': np.int64(573),
 'need': np.int64(1412),
 'go': np.int64(810),
 'need go': np.int64(1415),
 'go get': np.int64(816),
 'tuesday': np.int64(2203),
 'night': np.int64(1436),
 'real': np.int64(1701),
 'personal': np.int64(1564),
 'think': np.int64(2118),
 'always': np.int64(54),
 'try': np.int64(2195),
 'keep': np.int64(1066),
 'everybody': np.int64(634),
 'happy': np.int64(901),
 'time': np.int64(2141),
 'nobod

In [67]:
from sklearn.naive_bayes import MultinomialNB
spam_tfidf_model = MultinomialNB().fit(X_train, y_train)

In [68]:
#prediction
y_pred=spam_tfidf_model.predict(X_test)

In [69]:
score=accuracy_score(y_test,y_pred)
print(score)

0.97847533632287


In [70]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       983
           1       0.85      1.00      0.92       132

    accuracy                           0.98      1115
   macro avg       0.92      0.99      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [72]:
joblib.dump(spam_tfidf_model, 'models/tfidf_model.pkl')
joblib.dump(tv, 'models/tv.pkl')

['models/tv.pkl']

In [35]:
test_text = "Congratulations! You've been selected to receive a free iPhone 🎉 Click here to claim your prize now!"
test_text=re.sub('[^a-zA-z]',' ',test_text).lower().split()
test_text=[wnl.lemmatize(word, pos='v') for word in test_text if not word in stopwords.words('english')]
test_text=[' '.join(test_text)]
test_text = tv.transform(test_text).toarray()
result = spam_tfidf_model.predict(test_text)[0]
print(result)
print("Spam" if result == 1 else "Ham")

1
Spam


In [36]:
test_text = "Hey, are we still meeting at 5 PM today? Let me know if anything changes."
test_text=re.sub('[^a-zA-z]',' ',test_text).lower().split()
test_text=[wnl.lemmatize(word, pos='v') for word in test_text if not word in stopwords.words('english')]
test_text=[' '.join(test_text)]
test_text = tv.transform(test_text).toarray()
result = spam_tfidf_model.predict(test_text)[0]
print(result)
print("Spam" if result == 1 else "Ham")

0
Ham
