# IMPORT LIBRARIES

In [144]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report



# IMPORT DATASET

In [145]:
# Load the labelled message dataset from the notebook's working directory.
# The file provides a 'Category' label column and a 'Message' text column.
df = pd.read_csv('spam.csv')

In [146]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [147]:
# Class balance check: count how many messages fall under each label.
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [148]:
# Count missing values per column before building features.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [149]:
# Add a binary target column: 1 where Category is 'spam', 0 otherwise.
df['spam'] = np.where(df['Category'] == 'spam', 1, 0)


In [150]:
# Preview again to verify the new 'spam' column lines up with 'Category'.
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [151]:
# (rows, columns) of the frame after adding the target column.
df.shape

(5572, 3)

# TRAIN/TEST SPLIT

In [152]:
# Hold out 20% of the messages for evaluation; random_state pins the shuffle
# so the split is reproducible across runs.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (4825 ham vs 747 spam) - consider passing stratify=df['spam'].
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],
    df['spam'],
    test_size=0.2,
    random_state=2,
)

In [153]:
# Number of messages in the training split.
X_train.shape

(4457,)

In [154]:
# Number of messages in the held-out test split.
X_test.shape

(1115,)

In [155]:
# Bag-of-words featurizer with default settings.
cv = CountVectorizer()

# Learn the vocabulary from the training messages only and encode them as a
# sparse document-term count matrix (one row per message, one column per token).
X_train_cv = cv.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59180 stored elements and shape (4457, 7719)>

In [156]:
# Show the first training message as a dense count vector.
# Slice the sparse matrix BEFORE calling .toarray() so only one row is
# densified, instead of materialising the whole (4457, 7719) matrix just to
# look at a single row. The [:1] slice keeps the result 2-D, so [0] then
# yields the same 1-D row the original expression displayed.
X_train_cv[:1].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [157]:
# Look up which token is stored at column index 1771 of the fitted vocabulary.
cv.get_feature_names_out()[1771]


'chick'

In [158]:
# Mapping from each learned token to its column index in the count matrix.
# NOTE(review): this displays the entire mapping (one entry per feature);
# consider showing only a small sample to keep the notebook output readable.
cv.vocabulary_

{'unlimited': 7155,
 'texts': 6788,
 'limited': 4085,
 'minutes': 4469,
 'hahaha': 3267,
 'use': 7202,
 'your': 7685,
 'brain': 1475,
 'dear': 2178,
 'ujhhhhhhh': 7104,
 'computer': 1930,
 'shipped': 6080,
 'out': 4985,
 'with': 7525,
 'address': 809,
 'to': 6922,
 'sandiago': 5874,
 'and': 948,
 'parantella': 5055,
 'lane': 3976,
 'wtf': 7609,
 'poop': 5290,
 'you': 7679,
 'have': 3327,
 'won': 7552,
 'as': 1070,
 'valued': 7231,
 'vodafone': 7298,
 'customer': 2116,
 'our': 4983,
 'has': 3313,
 'picked': 5179,
 'win': 7500,
 '150': 300,
 'prize': 5406,
 'collect': 1879,
 'is': 3722,
 'easy': 2500,
 'just': 3846,
 'call': 1604,
 '09061743386': 197,
 'love': 4197,
 'it': 3734,
 'the': 6804,
 'girls': 3124,
 'at': 1102,
 'office': 4875,
 'may': 4366,
 'wonder': 7554,
 'why': 7481,
 'are': 1036,
 'smiling': 6248,
 'but': 1575,
 'sore': 6322,
 've': 7243,
 'been': 1284,
 'searching': 5950,
 'for': 2925,
 'right': 5761,
 'words': 7567,
 'thank': 6794,
 'this': 6841,
 'breather': 1497,
 'pr

In [159]:
# Dense copy of the training count matrix, used by the inspection cells that
# follow. This allocates every cell of the matrix, zeros included, so it is
# only suitable for exploration on a dataset of this size.
X_train_np = X_train_cv.toarray()
# Count vector of the first training message.
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [160]:
# Column indices of the tokens that actually occur in the first training message.
np.nonzero(X_train_np[0])

(array([4085, 4469, 6788, 7155], dtype=int64),)

In [161]:
# Count of the token at column 1771 in the first training message.
X_train_np[0, 1771]


0

# MODEL

In [162]:
# Fit a multinomial Naive Bayes classifier on the sparse training counts.
# MultinomialNB is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [163]:
# Encode the test messages with the vocabulary learned from the training set
# (transform only, no refit), mirroring how the training messages were passed in.
X_test_cv = cv.transform(X_test.values)


In [164]:
# Predict labels for the held-out messages.
y_pred = model.predict(X_test_cv)

# Per-class precision / recall / F1 on the test split (class 1 = spam).
# classification_report returns a formatted string, so it is printed.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       957
           1       0.96      0.85      0.90       158

    accuracy                           0.97      1115
   macro avg       0.97      0.92      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [165]:
# Hand-written messages for a quick sanity check of the trained model.
emails = [
    'Get this for only 20 dollars, this is a special promo',
    'Hi Hans, how are you?'
]

# New text must go through the same fitted vectorizer before prediction.
emails_count = cv.transform(emails)
# Predicted labels, one per message (1 = spam, 0 = not spam).
model.predict(emails_count)

array([1, 0])

# SAVE MODEL AND VECTORIZER (PICKLE)

In [166]:
import pickle

In [167]:
# Persist the fitted classifier and the fitted vectorizer. Both are needed at
# inference time: new text must be encoded with the same vocabulary the model
# was trained on.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically, rather than left open until garbage collection.
# NOTE: pickle files can execute arbitrary code when loaded - only unpickle
# these artifacts from a trusted source.
filename = 'spam_email.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
with open('cv.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)
