Dataset: [Spam Emails on Kaggle](https://www.kaggle.com/datasets/abdallahwagih/spam-emails)

## Imports

In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

## Inspect data

In [11]:
# Load the dataset from spam.csv (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Count non-null values per Category to see how balanced the two classes are.
df.groupby('Category').count()

Unnamed: 0_level_0,Message
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [10]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [26]:
# Show the full text of the first five messages whose Category is 'spam'.

spam = df.loc[df['Category'].eq('spam')].head()
spam['Message'].values

array(["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
       'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
       'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030',
       'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info'],
      dtype=object)

In [28]:
# Encode the label numerically: 1 where Category is 'spam', 0 for every other value.
# The model is trained on this integer column rather than on the text label.

df['spam'] = df['Category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Train and Test data split

In [32]:
# Hold out the default 25% of rows for testing.
# random_state pins the shuffle so the split, and every metric computed from it,
# is reproducible across kernel restarts. stratify keeps the spam/ham ratio the
# same in both parts, which matters because spam is the minority class.
# NOTE: outputs captured from an earlier, unseeded run will differ after re-running.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, random_state=42, stratify=df.spam
)

In [33]:
# Peek at the training messages; the index shows which original rows were selected.
X_train

4842    Hmmm.... Mayb can try e shoppin area one, but ...
1114    No I'm good for the movie, is it ok if I leave...
1472    Oh. U must have taken your REAL Valentine out ...
2106                              I fetch yun or u fetch?
4379        Doing nothing, then u not having dinner w us?
                              ...                        
4985    goldviking (29/M) is inviting you to be his fr...
3290    Camera - You are awarded a SiPix Digital Camer...
1675                            Nah dub but je still buff
2457                            Onum ela pa. Normal than.
2046    Okay... I booked all already... Including the ...
Name: Message, Length: 4179, dtype: object

In [34]:
# Matching 0/1 labels for the training messages (same index as X_train).
y_train

4842    0
1114    0
1472    0
2106    0
4379    0
       ..
4985    1
3290    1
1675    0
2457    0
2046    0
Name: spam, Length: 4179, dtype: int64

## Converting text into matrix

In [42]:
# Bag-of-words vectorizer with scikit-learn's default settings.
cv = CountVectorizer()

In [43]:
# Learn the vocabulary from the training messages only, then encode them as token counts.
X_train_count = cv.fit_transform(X_train.values)

In [44]:
# Convert the sparse count matrix to a dense array purely for inspection;
# the model below is fitted on the sparse X_train_count, not on this array.
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Train model

In [45]:
# Multinomial Naive Bayes classifier with default hyperparameters.
model = MultinomialNB()

In [46]:
# Fit the classifier on the training count matrix and the 0/1 spam labels.
model.fit(X_train_count, y_train)

## Pre-test predictions

In [47]:
# Sanity check with a hand-written spam-like message.
# It is encoded with cv.transform (not fit_transform) so the already-fitted
# training vocabulary is reused; a prediction of 1 means "spam".

spam_email = ["Win $1,000,000 today in our Free Casino! No registration needed. Click this link."]
spam_email_count = cv.transform(spam_email)
model.predict(spam_email_count)

array([1])

In [48]:
# Sanity check with a hand-written "ham" (ordinary, non-spam) message.
# It is encoded with the fitted vectorizer; a prediction of 0 means "not spam".

ham_email = ["Hello, please let me know when we can schedule a meeting?"]
ham_email_count = cv.transform(ham_email)
model.predict(ham_email_count)

array([0])

## Test model

In [64]:
# X_test must be encoded with the vectorizer fitted on the training data
# (transform only, no refitting) before the model can predict on it.

X_test_count = cv.transform(X_test)
predictions = model.predict(X_test_count)

In [83]:
# Score the held-out predictions against the true labels.
# Precision, recall and F1 use scikit-learn's default positive class (label 1, i.e. spam).
# Each score is rounded to two decimals; the *_val names avoid shadowing the
# imported metric functions.
precision_score_val, recall_score_val, accuracy_score_val, f1_score_val = (
    round(metric(y_test, predictions), 2)
    for metric in (precision_score, recall_score, accuracy_score, f1_score)
)

print(f'Precision: {precision_score_val}')
print(f'Recall: {recall_score_val}')
print(f'Accuracy: {accuracy_score_val}')
print(f'F1: {f1_score_val}')

Precision: 0.96
Recall: 0.93
Accuracy: 0.98
F1: 0.95
