In [1]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("spam.csv")
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [8]:
df['spam'] = df.Category.apply(lambda x: 1 if  x=='spam' else 0 )

In [10]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [15]:
from sklearn.model_selection import train_test_split

In [208]:
x_train, x_test , y_train, y_test = train_test_split(df.Message,df.spam, test_size=0.2)

In [211]:
x_train[15:20]

1225    You are a winner U have been specially selecte...
3407    HEY DAS COOL... IKNOW ALL 2 WELLDA PERIL OF ST...
3862    Free Msg: Ringtone!From: http://tms. widelive....
2667    * Was a nice day and, impressively, i was sens...
2792    … we r stayin here an extra week, back next we...
Name: Message, dtype: object

In [212]:
y_train[15:20]

1225    1
3407    0
3862    1
2667    0
2792    0
Name: spam, dtype: int64

In [213]:
type(x_train.values)

numpy.ndarray

In [214]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
x_train_cv = v.fit_transform(x_train.values)
x_train_cv

<4457x7765 sparse matrix of type '<class 'numpy.int64'>'
	with 59451 stored elements in Compressed Sparse Row format>

In [215]:
type(x_train_cv)

scipy.sparse.csr.csr_matrix

In [216]:
x_train_cv.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [217]:
x_train_cv.shape

(4457, 7765)

In [218]:
v.get_feature_names_out()[2000:2050]

array(['contract', 'contribute', 'control', 'convenience',
       'conversations', 'converted', 'converter', 'convey', 'conveying',
       'convinced', 'convincing', 'cook', 'cooked', 'cookies', 'cooking',
       'cool', 'cooped', 'cooperative', 'copied', 'coping', 'cops',
       'copy', 'cornwall', 'corporation', 'correct', 'correction',
       'correctly', 'corrupt', 'corvettes', 'cos', 'cost', 'costa',
       'costs', 'costume', 'costumes', 'couch', 'cougar', 'cough',
       'coughing', 'could', 'couldn', 'count', 'country', 'counts',
       'couple', 'courage', 'courageous', 'course', 'court', 'courtroom'],
      dtype=object)

In [219]:
from itertools import islice
dict(islice(v.vocabulary_.items(), 10))

{'please': 5277,
 'leave': 4077,
 'this': 6875,
 'topic': 7006,
 'sorry': 6353,
 'for': 2951,
 'telling': 6778,
 'that': 6833,
 'it': 3774,
 'didnt': 2314}

In [220]:
x_train[4839]

'All boys made fun of me today. Ok i have no problem. I just sent one message just for fun'

In [221]:
x_train_np = x_train_cv.toarray()
np.where(x_train_np[0]!=0)

(array([2951, 4077, 5277, 6353, 6778, 6833, 6875, 7006], dtype=int64),)

In [222]:
print(v.vocabulary_['fun'])
print("The word 'fun' repeats:",x_train_arr[0][3029], "times in mail 0")

3054
The word 'fun' repeats: 2 times in mail 0


In [223]:
from sklearn.naive_bayes import MultinomialNB

In [230]:
model = MultinomialNB()
model.fit(x_train_cv,y_train)

MultinomialNB()

In [231]:
x_test_cv = v.transform(x_test)

In [232]:
x_test_cv.shape

(1115, 7765)

In [268]:
from sklearn.metrics import classification_report
y_pred = model.predict(x_test_cv)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       958
           1       0.97      0.89      0.93       157

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



**Taking random Emails from my mail box**

In [277]:
random_emails = ["You are a winner!! Press this button to win over 100000$",
                 """Are you ready for the next adventure? Then immerse yourself as Human or Elf in the magical world of Elvenar and discover ancient magic, 
                 long-lost races, and powerful Relics!
                Start with an exclusive Starter Bundle and begin your journey!"""]
emails_cv = v.transform(random_emails)
model.predict(emails_cv)

array([1, 0], dtype=int64)

The first email is spam, the second isn't


**Lets have a shorter way to build the model**


In [278]:
from sklearn.pipeline import Pipeline

pipe_model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [279]:
pipe_model.fit(x_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [280]:
y_pred = pipe_model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       958
           1       0.97      0.89      0.93       157

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [284]:
pipe_model.predict(random_emails)

array([1, 0], dtype=int64)