In [1]:
! gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
100% 486k/486k [00:00<00:00, 100MB/s]


In [2]:
# ENVIRONMENT
import string
import nltk
# Resources needed by the preprocessing helpers: the English stop-word list
# and the Punkt tokenizer models used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the `punkt_tab` resource inside word_tokenize
# instead of the pickled `punkt` models; without it tokenisation raises
# LookupError there. nltk.download reports a failure rather than raising,
# so this extra request does not break the cell.
nltk.download('punkt_tab')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Location of the CSV written by the download cell.
DATASET_PATH = '/content/2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH)

# Extract the raw texts and their categories as plain Python lists.
messages, labels = (
    df[column].values.tolist() for column in ('Message', 'Category')
)

In [4]:
# Pre-processing data

# case-fold the text so tokens compare equal regardless of capitalisation
def lowercase(text):
    """Return `text` converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

# strip punctuation marks from the text
def punctuation_removal(text):
    """Return `text` with every character listed in ``string.punctuation`` deleted.

    Characters outside that set (letters, digits, whitespace, non-ASCII
    symbols) are left untouched.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = {ord(mark): None for mark in string.punctuation}
    return text.translate(deletions)

# break the text into a list of word tokens
def tokenize(text):
    """Split `text` into tokens by delegating to ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

# filter out common words that do not carry significant meaning
def remove_stopwords(tokens):
    """Return the tokens that are not English stop words, in their original order.

    Args:
        tokens: Iterable of (hashable) tokens, e.g. the output of `tokenize`.

    Returns:
        A new list holding every token that is absent from NLTK's English
        stop-word list. Duplicates among the kept tokens are preserved.
    """
    # A frozenset gives hashed membership tests; the original scanned the
    # whole stop-word list once per token.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))

    return [token for token in tokens if token not in stop_words]

# collapse inflected forms onto a shared stem so related words are grouped
def stemming(tokens):
    """Return a list with each token passed through ``nltk.PorterStemmer().stem``."""
    stem = nltk.PorterStemmer().stem
    return list(map(stem, tokens))

# the helpers above together constitute the preprocess_text function
def preprocess_text(text):
    """Turn a raw message into its list of normalised tokens.

    The steps run in this order: lowercase -> punctuation_removal ->
    tokenize -> remove_stopwords -> stemming. The first two operate on the
    string, the last three on the token list.
    """
    cleaned = punctuation_removal(lowercase(text))
    return stemming(remove_stopwords(tokenize(cleaned)))

# Tokenise every raw message. The input is read from `df` rather than from
# `messages` itself: rebinding `messages` from its own previous value made the
# cell fail on a second run, because preprocess_text would then receive
# token lists instead of strings.
messages = [preprocess_text(message) for message in df['Message'].values.tolist()]


In [5]:
# build the vocabulary: every distinct token found in the preprocessed
# 'messages', each listed once
def create_dictionary(messages):
    """Collect the distinct tokens of a tokenised corpus.

    Args:
        messages: Iterable of token sequences (one sequence per message).
            Tokens must be hashable.

    Returns:
        A list containing each distinct token exactly once, ordered by first
        appearance when scanning the messages in order.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # a single pass; the original `token not in list` check rescanned the
    # growing list for every token.
    return list(dict.fromkeys(token for tokens in messages for token in tokens))

# Vocabulary of the whole corpus; a token's position in this list is the
# index of its count in each feature vector.
dictionary = create_dictionary(messages=messages)

In [6]:
# represent a message by how often each dictionary token occurs in it;
# the resulting vector has one entry per dictionary token
def create_features(tokens, dictionary):
    """Build the token-count vector of one message.

    Args:
        tokens: Iterable of (hashable) tokens belonging to the message.
        dictionary: Sequence of vocabulary tokens; position i of the result
            counts occurrences of ``dictionary[i]``.

    Returns:
        A 1-D float NumPy array of length ``len(dictionary)``. Tokens that are
        not in the dictionary are ignored.
    """
    features = np.zeros(len(dictionary))

    # Resolve token -> position once, instead of scanning the dictionary twice
    # per token (`in` followed by `.index`). setdefault keeps the first
    # position of a repeated entry, which is what list.index returned.
    positions = {}
    for position, word in enumerate(dictionary):
        positions.setdefault(word, position)

    for token in tokens:
        position = positions.get(token)
        if position is not None:
            features[position] += 1

    return features

# Stack the per-message count vectors into one array: one row per message,
# one column per dictionary token.
X = np.array([
    create_features(message_tokens, dictionary)
    for message_tokens in messages
])

In [9]:
# Encode the string categories as integer class ids with sklearn's
# LabelEncoder. The fitted encoder `le` is kept so predictions can be decoded
# back into category names later.
le = LabelEncoder().fit(labels)
y = le.transform(labels)  # labels = df['Category'].values.tolist()
print(f'Classes : {le.classes_ }')
print(f'Encoded labels : {y}')

# Expected output:
# >> Classes : ['ham' 'spam']
# >> Encoded labels : [0 0 1 ... 0 0 0]

Classes : ['ham' 'spam']
Encoded labels : [0 0 1 ... 0 0 0]


In [10]:
SEED = 0           # fixed random_state so both shuffled splits are reproducible
VAL_SIZE = 0.2     # fraction of the full dataset held out for validation
TEST_SIZE = 0.125  # fraction of the remainder held out for testing (0.125 * 0.8 = 0.1 of the full dataset)

# Both splits use 'train_test_split' from the 'sklearn.model_selection' module.

# Split 1: full dataset -> training portion + validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=SEED,
)

# Split 2: the training portion from split 1 -> final training set + test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=TEST_SIZE,
    shuffle=True,
    random_state=SEED,
)

In [12]:
# Gaussian Naive Bayes classifier from the 'sklearn.naive_bayes' module
model = GaussianNB()
print('Start training ...')

# Fit on the training split; fit() returns the estimator itself, so `model`
# is still the (now fitted) classifier afterwards.
model.fit(X_train, y_train)
print('Training completed!')

# Expected output:
# >> Start training ...
# >> Training completed!
# >> CPU times: user 397 ms, sys: 162 ms, total: 559 ms
# >> Wall time: 633 ms

Start training ...
Training completed!


In [13]:
# Predict on the two held-out splits and score each against its true labels.
y_val_pred, y_test_pred = model.predict(X_val), model.predict(X_test)

val_accuracy, test_accuracy = (
    accuracy_score(y_val, y_val_pred),
    accuracy_score(y_test, y_test_pred),
)

print(f'Val accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

# Expected output:
# >> Val accuracy : 0.8816143497757848
# >> Test accuracy : 0.8602150537634409

Val accuracy: 0.8816143497757848
Test accuracy: 0.8602150537634409


In [14]:
def predict(text, model, dictionary):
    """Classify one raw message and return its decoded category.

    Args:
        text: Raw, unprocessed message string.
        model: Fitted classifier exposing ``predict``.
        dictionary: Vocabulary list that was used to build the training
            features; it fixes the layout of the feature vector.

    Returns:
        The category for `text`, decoded from the model's numeric prediction
        with the module-level label encoder `le`.
    """
    processed_text = preprocess_text(text)
    # Build the features from the preprocessed tokens. The original passed the
    # raw `text`, so create_features iterated over single characters and the
    # result of preprocess_text was never used.
    features = create_features(processed_text, dictionary)
    # The model expects a 2-D batch, so reshape to a single row.
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)
    prediction_cls = le.inverse_transform(prediction)[0]

    return prediction_cls

# Try the classifier on a single hand-written message.
test_input = 'I am actually thinking a way of doing something useful'
prediction_cls = predict(test_input, model=model, dictionary=dictionary)
print(f'Prediction: { prediction_cls }')

# Expected output:
# >> Prediction : ham

Prediction: ham
