## Spam Message Classification Using Naive Bayes

### Import Library

In [None]:
!pip install numpy pandas scikit-learn nltk

In [99]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
import string

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

In [16]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lhldanh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lhldanh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Read data

In [69]:
data_path = '2cls_spam_text_cls.csv'
df = pd.read_csv(data_path)
df[:3]

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [70]:
messages = df['Message'].values.tolist()
labels = df['Category'].values.tolist()

In [42]:
stopwords = nltk.corpus.stopwords.words('english')
stemmer = nltk.PorterStemmer()

### Data PreProcessing

#### Text Preprocess

In [71]:
def preprocess_text(text):
    text = text.lower()
    text =  text.translate(str.maketrans('', '', string.punctuation))
    text = word_tokenize(text)
    # text = text.split(' ')
    text = [token for token in text if token not in stopwords]
    text = [stemmer.stem(token) for token in text]
    return text

messages = [preprocess_text(m) for m in messages]

#### Create dictionary of data

In [77]:
def create_dictionary(data):
    dict = []

    for sen in messages:
        for token in sen:
            if token not in dict:
                dict.append(token)
    
    return dict

dictionary = create_dictionary(messages)

#### Create feature

In [79]:
def create_feature(sentence):
    feature = np.zeros(len(dictionary))

    for token in sentence:
        if token in dictionary:
            feature[dictionary.index(token)] += 1
    
    return feature

X = [create_feature(sentence) for sentence in messages]

#### Label Encode

In [80]:
le = LabelEncoder()

y  = le.fit_transform(labels)

print(f'Catogoies:      {le.classes_}')
print(f'Encoded labels: {y}')

Catogoies:      ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


### Classification

#### Split train, test

In [89]:
val_sz = 0.2
test_sz = 0.05
SEED = 0

X_train, X_val, Y_train, Y_val = train_test_split(X, y,
                                                  test_size=val_sz,
                                                  shuffle=True,
                                                  random_state=SEED)

X_train, X_test, Y_train, Y_test = train_test_split(X, y,
                                                  test_size=test_sz,
                                                  shuffle=True,
                                                  random_state=SEED)

#### Training

In [90]:
model = GaussianNB()

In [91]:
print('Training..')
model = model.fit(X_train, Y_train)
print('Complete!')

Training..
Complete!


#### Evaluation

In [100]:
Y_val_pred = model.predict(X_val)
Y_test_pred = model.predict(X_test)

val_acc = f1_score(Y_val, Y_val_pred, average='weighted')
test_acc = f1_score(Y_test, Y_test_pred, average='weighted')

print(f'Valuation accuracy: {val_acc}')
print(f'Test accuracy: {test_acc}')

Valuation accuracy: 0.9253793350300911
Test accuracy: 0.9227352692905434


### Save model

In [102]:
import joblib

joblib.dump(model, 'Spam_Message_Classify.joblib')

['Spam_Message_Classify.joblib']