In [1]:
#download dataset
!gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
100% 486k/486k [00:00<00:00, 73.9MB/s]


# Read dataset and import necessary packages

In [2]:
#import necessary packages

import string #basic function for string handling
import nltk #natural language processing
nltk.download('stopwords') #download stopwords
nltk.download('punkt') #download punkt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
#read dataset
# NOTE(review): absolute Colab path; it must match where the download cell saved the CSV
dataset_path = '/content/2cls_spam_text_cls.csv'
df = pd.read_csv(dataset_path)

# pull the raw text and its category out as two parallel Python lists
messages, labels = (
    df[column].values.tolist() for column in ('Message', 'Category')
)

In [5]:
#preview the first rows to check that the Category and Message columns loaded as expected
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Preprocessing Data



In [6]:
#convert all text to lowercase
def lowercase(text):
  """Return a copy of `text` with all cased characters converted to lowercase."""
  lowered = text.lower()
  return lowered

#eliminate all punctuation marks
def punctuation_removal(text):
  """Return `text` with every character found in `string.punctuation` deleted.

  Characters are dropped rather than replaced by a space, so "don't"
  becomes "dont".
  """
  # The third argument of str.maketrans lists the characters to delete.
  return text.translate(str.maketrans('', '', string.punctuation))

#Splits the text into individual words (tokens)
def tokenize(text):
  """Split `text` into individual tokens with NLTK's `word_tokenize`."""
  tokens = nltk.word_tokenize(text)
  return tokens

#Filters out common words that don't carry significant meaning
def remove_stopwords(tokens):
  """Return the tokens that are not in NLTK's English stopword list.

  Args:
    tokens: iterable of token strings. Matching is an exact string
      comparison, so tokens should already be normalised the same way as
      the stopword list.

  Returns:
    A new list with the surviving tokens in their original order.
  """
  # stopwords.words() returns a list; a set gives O(1) membership tests
  # instead of a linear scan of the whole list for every token.
  stop_words = set(nltk.corpus.stopwords.words('english'))
  return [token for token in tokens if token not in stop_words]

#Reduces words to their root form, grouping similar words together
def stemming(tokens):
  """Return a new list holding the Porter stem of each token, in order."""
  stem = nltk.PorterStemmer().stem
  return list(map(stem, tokens))

In [7]:
#preprocess data
def preprocess_text(text):
  """Run the full cleaning pipeline on one raw message.

  Steps, in order: lowercase -> strip punctuation -> tokenize ->
  drop stopwords -> stem.

  Args:
    text: the raw message string.

  Returns:
    The list of stemmed tokens produced by `stemming`.
  """
  # string-level cleaning happens before the text is split into tokens
  cleaned = punctuation_removal(lowercase(text))
  # token-level steps, applied innermost first
  return stemming(remove_stopwords(tokenize(cleaned)))

# Always start from the raw column so this cell can be re-run safely:
# rebuilding `messages` from itself would pass token lists back into
# preprocess_text on a second run, which fails on `.lower()`.
messages = [preprocess_text(message) for message in df['Message'].values.tolist()]

In [8]:
#create a dictionary that contains every word that appears in the messages
def create_dictionary(messages):
  """Collect every distinct token, in order of first appearance.

  Args:
    messages: iterable of token lists (one list per message). Tokens must
      be hashable, e.g. strings.

  Returns:
    A list of the unique tokens, ordered by where each one first occurs.
    An empty input yields an empty list.
  """
  # dict keys are unique and keep insertion order, so this de-duplicates in
  # a single pass instead of rescanning a growing list for every token.
  return list(dict.fromkeys(token for tokens in messages for token in tokens))

#vocabulary of every distinct token in the preprocessed messages; its order fixes the feature column order
dictionary = create_dictionary(messages)

In [10]:
#create feature for each message
def create_features(tokens, dictionary):
  """Build the bag-of-words count vector for one message.

  Args:
    tokens: iterable of tokens (hashable, e.g. strings) for one message.
    dictionary: sequence of vocabulary tokens; position i in the result
      counts occurrences of dictionary[i].

  Returns:
    A 1-D float numpy array of length len(dictionary). Tokens that are not
    in the dictionary are ignored.
  """
  # Build the token -> column lookup once, so each token costs one hash
  # lookup instead of two linear scans of the list (`in` and `.index`).
  # setdefault keeps the first position of a repeated entry, like list.index.
  token_to_index = {}
  for index, word in enumerate(dictionary):
    token_to_index.setdefault(word, index)

  features = np.zeros(len(dictionary))
  for token in tokens:
    index = token_to_index.get(token)
    if index is not None:
      features[index] += 1
  return features

In [11]:
#stack one count vector per message into a 2-D array (rows: messages, columns: dictionary entries)
X = np.array([create_features(tokens, dictionary) for tokens in messages])

In [13]:
#pre-process labels: encode the category strings (ham, spam) as integers (0, 1)
le = LabelEncoder()
le.fit(labels)             # learn the sorted set of distinct categories
y = le.transform(labels)   # map every label to its integer id
print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


# Train model

In [14]:
#split data into train/val/test
SEED = 0
VAL_SIZE = 0.2     # fraction of the full dataset held out for validation
TEST_SIZE = 0.125  # fraction of the remaining 80%, i.e. 10% of the full dataset

# first carve off the validation set, then take the test set out of what is left
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, shuffle=True, random_state=SEED)
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=TEST_SIZE, shuffle=True, random_state=SEED)

In [15]:
#train model
print('Start training...')
# fit() returns the fitted estimator, so construction and fitting can be chained
model = GaussianNB().fit(X_train, y_train)
print('Training completed!')

Start training...
Training completed!


In [16]:
#model evaluation
# score the validation split
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# score the test split
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Validation accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Validation accuracy: 0.8816143497757848
Test accuracy: 0.8602150537634409


In [17]:
#prediction
def predict(text, model, dictionary, label_encoder=None):
  """Classify a single raw message and return its category label.

  Args:
    text: the raw message string.
    model: fitted classifier exposing `predict` on a 2-D feature array.
    dictionary: vocabulary used to build the training features; it must be
      the same one, in the same order, that the model was trained with.
    label_encoder: fitted encoder used to map the predicted integer back to
      its label. Defaults to the notebook-level `le`, so existing
      three-argument calls keep working.

  Returns:
    The decoded label for the message, as produced by
    `label_encoder.inverse_transform`.
  """
  if label_encoder is None:
    # fall back to the encoder fitted earlier in the notebook
    label_encoder = le
  tokens = preprocess_text(text)
  # the model expects a 2-D array: one row per sample
  features = np.asarray(create_features(tokens, dictionary)).reshape(1, -1)
  prediction = model.predict(features)
  return label_encoder.inverse_transform(prediction)[0]

In [23]:
#try the classifier on a single hand-written message
test_input = "Bitcoin, Anyone?"
prediction_cls = predict(test_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: spam
