## **Spam Mail Detection**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read the mail CSV into the untouched "raw" frame that later cells derive from.
raw_df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Datasets/mail_data.csv",
)
# Cell output: (row count, column count) of the loaded frame.
raw_df.shape

(5572, 2)

In [3]:
# Build the working frame: keep every non-missing cell of raw_df and put an
# empty string wherever a value is missing. raw_df itself is left unmodified.
df = raw_df.where(
    raw_df.notna(),
    other='',
)
# Preview the first rows.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Label encoding of 'Category' (spam → 0, ham → 1)

In [4]:
# Encode the target labels as integers: spam -> 0, ham -> 1.
#
# The column is rebuilt with Series.map and assigned back as a whole, rather
# than writing integers into a text column through masked .loc assignments.
# Element-wise writes of ints into a text column leave a mixed str/int object
# column, and pandas versions that store text in a dedicated string dtype
# reject non-string values on such assignments with a TypeError.
#
# Any value that is not a known label (including 0/1 from a previous run of
# this cell) is passed through unchanged, so re-running the cell is a no-op
# and unexpected labels remain visible to the later integer cast.
label_codes = {'spam': 0, 'ham': 1}
df['Category'] = df['Category'].map(lambda label: label_codes.get(label, label))

Separating the input feature X (message text) from the output label Y (category)

In [5]:
# Inputs are the message texts; targets are the values of the Category column.
X, Y = df['Message'], df['Category']

Splitting the data into training and test sets

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=3,
    test_size=0.2,
)
# Sanity check: shapes of the full, training and test inputs.
print(*(messages.shape for messages in (X, X_train, X_test)))

(5572,) (4457,) (1115,)


Feature Extraction

In [13]:
# Fit the TF-IDF vectorizer on the training messages only, then apply the
# already-fitted vectorizer to the test messages.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
X_train_features = feature_extraction.fit_transform(raw_documents=X_train)
X_test_features = feature_extraction.transform(raw_documents=X_test)

In [14]:
# Cast both label series to an integer dtype before they are used for
# training and scoring.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

Model Training and evaluation

In [17]:
# Logistic-regression classifier, created with the library's default settings.
model = LogisticRegression()

In [19]:
# Train the classifier on the TF-IDF training features and their labels.
model.fit(X=X_train_features, y=Y_train)

In [20]:
# Score the fitted model on the same data it was trained on.
Y_pred = model.predict(X=X_train_features)
training_accuracy = accuracy_score(y_true=Y_train, y_pred=Y_pred)
print(f"Accuracy on training dataset: {training_accuracy:.2%}")

Accuracy on training dataset: 96.77%


In [21]:
# Score the fitted model on the held-out test split.
# Note: Y_pred is reused here and now holds the test-set predictions.
Y_pred = model.predict(X=X_test_features)
testing_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(f"Accuracy on testing dataset: {testing_accuracy:.2%}")

Accuracy on testing dataset: 96.68%


Making a Predictive System

In [27]:
# Classify one sample message with the already-fitted vectorizer and model.
input_data = [
    "Customer service annoncement. You have a New Years delivery waiting for you. Please call 07046744435 now to arrange delivery",
]
input_data_features = feature_extraction.transform(input_data)

prediction = model.predict(input_data_features)
# Label 1 is reported as ham; any other predicted label is reported as spam.
print("Ham Mail" if prediction[0] == 1 else "Spam Mail")

Spam Mail
