## importing libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

## loading the dataset

In [2]:
# Read the labelled messages from the CSV that sits next to the notebook.
df = pd.read_csv("spam.csv")

# Preview the first five rows.
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

(5572, 2)

In [5]:
# How many messages carry each label -- shows how unbalanced the classes are.
df["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

## Train/test split

In [7]:
from sklearn.model_selection import train_test_split

# Inputs are the raw message strings; targets are the Category labels.
X = df["Message"]
y = df["Category"]

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

## Converting message text to token-count features

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer with default settings.
v = CountVectorizer()
# Fit the vectorizer on the training messages only and transform them ...
X_train_count = v.fit_transform(X_train)
# ... then transform the test messages with that same fitted vectorizer
# (transform only -- nothing is re-fitted on the test data).
X_test_count = v.transform(X_test)

In [15]:
# Number of feature columns in the training count matrix.
# Read it straight from the matrix shape: calling .toarray() would allocate the
# full dense (n_messages x n_features) array just to measure one row, and
# indexing row 1 would fail if there were fewer than two training rows.
X_train_count.shape[1]

7212

## Multinomial Naive Bayes model

In [16]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default settings.
model = MultinomialNB()
# Train on the count features of the training messages and their labels.
model.fit(X_train_count,y_train)

## evaluation

In [17]:
from sklearn.metrics import classification_report

# Predict labels for the held-out messages, then print the per-class
# precision / recall / F1 summary.
y_pred = model.predict(X_test_count)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1432
        spam       0.97      0.89      0.93       240

    accuracy                           0.98      1672
   macro avg       0.97      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



## prediction example

In [18]:
# Four hand-written example messages to try the trained model on.
emails = [
    "Click here to claim your prize now! Limited time offer. Provide your bank details to receive your winnings.",
    "We noticed suspicious activity on your account. Please log in using the link below to secure your account.",
    "Hi team, the next meeting is scheduled for Monday at 10:00 AM. Please confirm your availability.",
    "Thank you for your purchase. Your order #12345 has been shipped and will arrive soon. Track your package here."
]
# Turn the examples into count features with the existing vectorizer `v`
# (transform only, no fit), then classify them with `model`.
emails_count = v.transform(emails)
model.predict(emails_count)

array(['spam', 'spam', 'ham', 'ham'], dtype='<U4')

## Running the same steps as a single scikit-learn Pipeline

In [22]:
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so that raw message text goes in and
# predicted labels come out of a single estimator.
# The step names are the keys used by `pipe.named_steps[...]` and by
# `pipe.set_params(<step>__<param>=...)`, so the vectorizer step name is
# spelled correctly ("vectorizer", previously misspelled "vectorzier").
pipe = Pipeline([
    ('Count_vectorizer', CountVectorizer()),
    ('Bayes_model', MultinomialNB()),
])

# Fitting the pipeline fits the vectorizer and then the classifier on the
# raw training messages.
pipe.fit(X_train, y_train)

In [23]:
# Evaluate the pipeline on the held-out raw messages (it vectorizes the text
# itself) and print the same per-class summary.
y_pred_pipe = pipe.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred_pipe)
print(report)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1432
        spam       0.97      0.89      0.93       240

    accuracy                           0.98      1672
   macro avg       0.97      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [25]:
# The pipeline is given the raw example strings directly; no separate
# vectorizer transform call is needed here.
pipe.predict(emails)

array(['spam', 'spam', 'ham', 'ham'], dtype='<U4')