# **Spam Mail Prediction**

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

In [2]:
# Read the mail dataset from its CSV file into a DataFrame.
mail_data = pd.read_csv('data/mail_data.csv')
# Preview the first 10 rows as the cell's displayed output.
mail_data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
# (rows, columns) of the loaded DataFrame.
mail_data.shape

(5572, 2)

In [4]:
# Summarise column names, non-null counts, dtypes and memory usage.
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# Count missing values in each column.
mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [6]:
# Integer code assigned to each text label in the Category column.
CATEGORY_TO_LABEL = {'spam': 0, 'ham': 1}

# NOTE(review): the encoder is not used for the encoding below. Its classes_
# are overwritten so that index 0 is 'spam' and index 1 is 'ham', which lines
# it up with CATEGORY_TO_LABEL.
label_encoder = LabelEncoder()
label_encoder.fit(mail_data['Category'])
label_encoder.classes_ = np.array(['spam', 'ham'])

# Skip the mapping when the column already holds only the integer codes, so
# re-running this cell does not turn every label into NaN.
already_encoded = mail_data['Category'].isin(list(CATEGORY_TO_LABEL.values())).all()
if not already_encoded:
    encoded_category = mail_data['Category'].map(CATEGORY_TO_LABEL)

    # Series.map yields NaN for any value missing from the mapping; fail loudly
    # instead of passing a corrupted target on to the models.
    unmapped_rows = encoded_category.isna()
    if unmapped_rows.any():
        unexpected_values = mail_data.loc[unmapped_rows, 'Category'].unique().tolist()
        raise ValueError(
            "Unexpected Category values (expected 'spam' or 'ham'): {}".format(unexpected_values)
        )

    mail_data['Category'] = encoded_category.astype('int64')

mail_data.head(10)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...
7,1,As per your request 'Melle Melle (Oru Minnamin...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...


In [7]:
# Features are the message text; the target is the Category column.
X, y = mail_data['Message'], mail_data['Category']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified - consider stratify=y if the classes
# are imbalanced (this would change the recorded scores below).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## **Logistic Regression**

In [8]:
# TF-IDF features feeding a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english', lowercase=True)),
    ('logreg', LogisticRegression(max_iter=1000)),
])

# Search space: 3 x 3 vectoriser settings x 2 regularisation strengths = 18 candidates.
param_grid = {
    'tfidf__min_df': [1, 2, 5],
    'tfidf__max_df': [0.7, 0.8, 1.0],
    'logreg__C': [10, 100],
    'logreg__penalty': ['l2'],
    'logreg__solver': ['lbfgs'],
}

# 3-fold cross-validated search over the grid, scored on accuracy, using all cores.
grid_search_lr = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
)
grid_search_lr.fit(X_train, y_train)

# predict() on the fitted search uses the best estimator found.
prediction_data_lr = grid_search_lr.predict(X_test)

print("Best parameters found: ", grid_search_lr.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_lr.best_score_))
print("Test accuracy: {:.3f}".format(
    accuracy_score(y_true=y_test, y_pred=prediction_data_lr)
))

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters found:  {'logreg__C': 100, 'logreg__penalty': 'l2', 'logreg__solver': 'lbfgs', 'tfidf__max_df': 0.7, 'tfidf__min_df': 1}
Best cross-validation accuracy: 0.981
Test accuracy: 0.989


In [9]:
# One hand-written message to sanity-check the tuned logistic-regression pipeline.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

prediction = grid_search_lr.predict(input_mail)
print(prediction)

# Label 1 is reported as ham; any other label is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[1]
Ham mail


## **Random Forest**

In [10]:
# TF-IDF features feeding a random-forest classifier (seeded for repeatability).
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english', lowercase=True)),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Search space: 2 x 2 vectoriser settings x 2 x 3 x 2 forest settings = 48 candidates.
param_grid = {
    'tfidf__min_df': [1, 5],
    'tfidf__max_df': [0.7, 1.0],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20, None],
    'rf__min_samples_split': [2, 5],
}

# 3-fold cross-validated search over the grid, scored on accuracy, using all cores.
grid_search_rf = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
)
grid_search_rf.fit(X_train, y_train)

# predict() on the fitted search uses the best estimator found.
predictions_rf = grid_search_rf.predict(X_test)

print("Best parameters found: ", grid_search_rf.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_rf.best_score_))
print("Test accuracy: {:.3f}".format(
    accuracy_score(y_true=y_test, y_pred=predictions_rf)
))

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters found:  {'rf__max_depth': None, 'rf__min_samples_split': 2, 'rf__n_estimators': 200, 'tfidf__max_df': 0.7, 'tfidf__min_df': 5}
Best cross-validation accuracy: 0.977
Test accuracy: 0.984


In [11]:
# One hand-written message to sanity-check the tuned random-forest pipeline.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise I won’t take your help for granted and will fulfill my promise. You have been wonderful and a blessing at all times"]

prediction = grid_search_rf.predict(input_mail)

# Label 1 is reported as ham; any other label is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

Ham mail


## **Gradient Boosting**

In [12]:
# TF-IDF features feeding a gradient-boosting classifier (seeded for repeatability).
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english', lowercase=True)),
    ('gb', GradientBoostingClassifier(random_state=42)),
])

# Search space: 2 x 2 vectoriser settings x 3 x 3 x 3 booster settings = 108 candidates.
param_grid = {
    'tfidf__min_df': [1, 5],
    'tfidf__max_df': [0.7, 1.0],
    'gb__n_estimators': [50, 100, 200],
    'gb__learning_rate': [0.01, 0.1, 0.2],
    'gb__max_depth': [3, 5, 7],
}

# 3-fold cross-validated search over the grid, scored on accuracy, using all cores.
grid_search_gb = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
)
grid_search_gb.fit(X_train, y_train)

# predict() on the fitted search uses the best estimator found.
predictions_gb = grid_search_gb.predict(X_test)

print("Best parameters found: ", grid_search_gb.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_gb.best_score_))
print("Test accuracy: {:.3f}".format(
    accuracy_score(y_true=y_test, y_pred=predictions_gb)
))


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters found:  {'gb__learning_rate': 0.2, 'gb__max_depth': 3, 'gb__n_estimators': 200, 'tfidf__max_df': 0.7, 'tfidf__min_df': 1}
Best cross-validation accuracy: 0.968
Test accuracy: 0.979


In [13]:
# One hand-written message to sanity-check the tuned gradient-boosting pipeline.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise I won’t take your help for granted and will fulfill my promise. You have been wonderful and a blessing at all times"]

prediction = grid_search_gb.predict(input_mail)

# Label 1 is reported as ham; any other label is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

Ham mail


## **XGBoost**

In [16]:
# Search space: 2 x 2 vectoriser settings x 3 x 3 x 3 x 3 booster settings = 324 candidates.
param_grid = {
    'tfidf__min_df': [1, 5],
    'tfidf__max_df': [0.7, 1.0],
    'xgb__n_estimators': [50, 100, 200],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__max_depth': [3, 5, 7],
    'xgb__subsample': [0.6, 0.8, 1.0],
}

# TF-IDF features feeding an XGBoost classifier (seeded for repeatability).
# eval_metric is 'logloss' because the target has two classes (0 = spam, 1 = ham);
# 'mlogloss' is the multiclass variant and does not match a binary objective.
# No eval_set or early stopping is used here, so the metric does not influence
# which trees are built.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', lowercase=True)),
    ('xgb', XGBClassifier(eval_metric='logloss', random_state=42))
])

# 3-fold cross-validated search over the grid, scored on accuracy, using all cores.
grid_search_xgb = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# predict() on the fitted search uses the best estimator found.
predictions_xgb = grid_search_xgb.predict(X_test)

print("Best parameters found: ", grid_search_xgb.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_xgb.best_score_))
print("Test accuracy: {:.3f}".format(accuracy_score(y_test, predictions_xgb)))

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best parameters found:  {'tfidf__max_df': 0.7, 'tfidf__min_df': 5, 'xgb__learning_rate': 0.2, 'xgb__max_depth': 3, 'xgb__n_estimators': 200, 'xgb__subsample': 0.8}
Best cross-validation accuracy: 0.970
Test accuracy: 0.978


In [15]:
# One hand-written message to sanity-check the tuned XGBoost pipeline.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise I won’t take your help for granted and will fulfill my promise. You have been wonderful and a blessing at all times"]

prediction = grid_search_xgb.predict(input_mail)

# Label 1 is reported as ham; any other label is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

Ham mail
