In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### **Read CSV and drop duplicate rows**

In [None]:
# 'text' holds the email's text and 'spam' is the label: spam (1) or not (0).
# The CSV also carries an 'Unnamed: 0' column (presumably a saved row index -- confirm).
# Comparing whole rows would let that column make every row look unique, so
# duplicates are detected on the content columns only.
df = pd.read_csv('spam.csv').drop_duplicates(subset=['text', 'spam'])
df.head()

Unnamed: 0,text,spam
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
# Class balance: label 1 marks spam, label 0 marks legitimate mail
spam_count = df['spam'].eq(1).sum()
non_spam_count = df['spam'].eq(0).sum()

print('Number of spam emails:', spam_count)
print('Number of non-spam emails:', non_spam_count)

Number of spam emails: 1462
Number of non-spam emails: 3531


### **Tokenizing and vectorizing emails**

In [None]:
# 80/20 Training/Test split
# test_size must be given explicitly: train_test_split defaults to a 75/25 split.
# random_state pins the shuffle so the metrics below are reproducible on re-run.

x_train, x_test, y_train, y_test = train_test_split(
    df['text'], df['spam'], test_size=0.2, random_state=42)

In [None]:
# Learn the vocabulary from the training emails only and build their
# bag-of-words count matrix in the same pass.
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train)

### **Multinomial Naive Bayes Classifier**

In [None]:
# Train a multinomial Naive Bayes classifier on the word-count features;
# fit() returns the estimator itself, so it can be bound directly.
model = MultinomialNB().fit(x_train_vectorized, y_train)
model

MultinomialNB()

In [None]:
# Score the held-out emails, reusing the vocabulary learned from the training set
x_test_vectorized = vectorizer.transform(x_test)
prediction = model.predict(x_test_vectorized)

test_confusion = confusion_matrix(y_test, prediction)
test_accuracy_pct = round(accuracy_score(y_test, prediction) * 100, 2)

print('Confusion Matrix: \n', test_confusion)

print('\nAccuracy: \n', test_accuracy_pct, '%')

Confusion Matrix: 
 [[887  10]
 [ 18 334]]

Accuracy: 
 97.76 %


### **Testing with custom emails**

In [None]:
def isSpam(prediction):
  """Print a human-readable verdict for each predicted label.

  Args:
    prediction: A single label, or a flat iterable of labels such as the
      array returned by `model.predict`. A label equal to 0 is reported
      as 'Not spam'; any other label is reported as 'Spam'.

  Prints one line per label.
  """
  # Compare the label values themselves rather than the array's printed form,
  # which only ever matched a one-element integer array rendered as '[0]'.
  try:
    labels = list(prediction)
  except TypeError:
    # Scalars (and 0-d arrays) are not iterable; treat them as a single label.
    labels = [prediction]

  for label in labels:
    print('Not spam' if label == 0 else 'Spam')

In [None]:
# Hand-written emails used as a sanity check; the comment above each entry is
# the verdict we expect the classifier to print for it.
custom_emails = [
  # Spam
  "Subject: CONGRATULATIONS! you have won an all-expenses-paid CRUISE TO THE BAHAMAS! Please message back with your bank account " +
  "number and social security number for more info.",

  # Spam
  "Subject: your social security number appears to be compromised!!! call 1-800-499-4999 to restore your privacy.",

  # Not spam
  "Subject: Hi John, thanks for the update. Can't wait to see you at the BBQ tomorrow!",

  # Not spam
  "Subject: Hi team, here are our numbers for last quarter. Great job! Let's keep it up!",
]

# Classify each email on its own so one verdict is printed per message
for email in custom_emails:
  isSpam(model.predict(vectorizer.transform([email])))


Spam
Spam
Not spam
Not spam
