# Naive Bayes  and Random Forest Spam Filter
### by: Kyla S. Ronquillo


This notebook contains a from-the-scratch implementation of Spam/Ham filter using the Naive Bayes theory. Meanwhile, my implementation of the model Random Forest was done with the help of scikit-learn

In [1]:
#libraries

import numpy as np
import re
import pandas as pd

In [2]:
# libraries for random forest 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Preprocess the Training Data

1. Read TrainingData.csv (contains 3900 messages labeled as "ham" or "spam").
2. Remove special characters and tokenize words (keep only alphabetic characters).
3. Create a vocabulary (V) of unique words.
4. Count the occurrences of each word in ham and spam messages.
5. Compute the prior probabilities for ham and spam.


In [3]:
#read the training data

df_sms = pd.read_csv('/kaggle/input/messages-data/TrainingData.csv', encoding='ISO-8859-1')
df_sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#clean text and tokenize words

def preprocess_data(text):
    #keep alphabet and space only
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    #lowercase all of them
    text = text.lower()
    #split into words
    words = text.split()

    return words


df_sms['tokens'] = df_sms['message'].apply(preprocess_data)

In [5]:
#V = vocabulary of unique words

all_words = []
for tokens in df_sms['tokens']:
    for word in tokens:
        all_words.append(word)

V = set(all_words)

print(f"Vocabulary Size: {len(V)}")

Vocabulary Size: 7063


In [6]:
#count their statistics and report the prior probabilities for spam and ham

# Count the number of ham and spam messages and total messages
total_sms = len(df_sms)
num_spam = len(df_sms[df_sms['label'] == 'spam'])
num_ham = len(df_sms[df_sms['label'] == 'ham'])

probability_spam = num_spam/total_sms
probability_ham = num_ham/total_sms

print(f"P(ham) = {probability_ham:.4f}, P(spam) = {probability_spam:.4f}")

P(ham) = 0.8669, P(spam) = 0.1331


# Train the Naive Bayes Classifier

1. Use Laplace Smoothing (λ = 1) to compute the likelihoods.
2. Store word probabilities for each class.


In [7]:
#count word freqs. in Spam and Ham Messages
from collections import defaultdict

#separate messages by class
ham_messages = df_sms[df_sms['label'] == 'ham']['tokens']
spam_messages = df_sms[df_sms['label'] == 'spam']['tokens']

#initialize word count dictionaries
ham_word_counts = defaultdict(int)
spam_word_counts = defaultdict(int)

#count word occurrences in ham messages
for tokens in ham_messages:
    for word in tokens:
        ham_word_counts[word] += 1

#count word occurrences in spam messages
for tokens in spam_messages:
    for word in tokens:
        spam_word_counts[word] += 1

In [8]:
# Vocabulary size (number of unique words)
V_size = len(V)

# Total word count in each class
total_ham_words = sum(ham_word_counts.values())
total_spam_words = sum(spam_word_counts.values())

# Compute word probabilities with Laplace Smoothing
ham_probabilities = {word: (ham_word_counts[word] + 1) / (total_ham_words + V_size) for word in V}
spam_probabilities = {word: (spam_word_counts[word] + 1) / (total_spam_words + V_size) for word in V}


# Classify the Test Data

1. Read TestData.csv (1672 messages without labels).
2. For each message, compute the probability of being spam or ham using log probabilities.
3. Assign the label ham or spam based on the higher probability.


In [9]:
# Load test data
df_test = pd.read_csv("/kaggle/input/messages-data/TestData.csv", encoding="ISO-8859-1")

# Apply the same preprocessing function as before
df_test['tokens'] = df_test['message'].apply(preprocess_data)

# Display first few rows
print(df_test.head())

                                             message  \
0  That depends. How would you like to be treated...   
1                       Right on brah, see you later   
2  Waiting in e car 4 my mum lor. U leh? Reach ho...   
3  Your 2004 account for 07XXXXXXXXX shows 786 un...   
4  Do you want a new video handset? 750 anytime a...   

                                              tokens  
0  [that, depends, how, would, you, like, to, be,...  
1                 [right, on, brah, see, you, later]  
2  [waiting, in, e, car, my, mum, lor, u, leh, re...  
3  [your, account, for, xxxxxxxxx, shows, unredee...  
4  [do, you, want, a, new, video, handset, anytim...  


In [10]:
def classify_message(tokens, P_ham, P_spam):
    # Initialize log probabilities with priors
    log_prob_ham = np.log(P_ham)
    log_prob_spam = np.log(P_spam)

    # Sum log probabilities of each word in the message
    for word in tokens:
        if word in ham_probabilities:
            log_prob_ham += np.log(ham_probabilities[word])
        else:
            log_prob_ham += np.log(1 / (total_ham_words + V_size))  # Handle unseen words

        if word in spam_probabilities:
            log_prob_spam += np.log(spam_probabilities[word])
        else:
            log_prob_spam += np.log(1 / (total_spam_words + V_size))  # Handle unseen words

    # Predict class based on higher probability
    return "spam" if log_prob_spam > log_prob_ham else "ham"

In [11]:
P_ham = num_ham / total_sms
P_spam = num_spam / total_sms

# Apply classification to each message
df_test['label'] = df_test['tokens'].apply(lambda tokens: classify_message(tokens, P_ham, P_spam))

# Save results to ResultData.csv
df_test[['message', 'label']].to_csv("ResultData.csv", index=False)

# Save the Results for using Naive Bayes
1. Output ResultData.csv with classified messages.

In [12]:
# Output ResultData.csv with classified messages.
print(pd.read_csv("ResultData.csv").head())

                                             message label
0  That depends. How would you like to be treated...   ham
1                       Right on brah, see you later   ham
2  Waiting in e car 4 my mum lor. U leh? Reach ho...   ham
3  Your 2004 account for 07XXXXXXXXX shows 786 un...  spam
4  Do you want a new video handset? 750 anytime a...  spam


# Train the Random Forest Model

In [13]:
# Convert text data to numerical features
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_sms['message'])
y_train = df_sms['label']
X_test = vectorizer.transform(df_test['message'])
y_test = df_test['label']

# Train Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Make predictions
y_pred = rf_clf.predict(X_test)

# Save the results for using Random Forest

In [14]:
# Save results to CSV
results = pd.DataFrame({'message': df_test['message'], 'actual_label': y_test, 'predicted_label': y_pred})
results.to_csv('random_forest_results.csv', index=False)
print("Predictions saved to random_forest_results.csv")


# Output ResultData.csv with classified messages.
print(pd.read_csv("random_forest_results.csv").head())

Predictions saved to random_forest_results.csv
                                             message actual_label  \
0  That depends. How would you like to be treated...          ham   
1                       Right on brah, see you later          ham   
2  Waiting in e car 4 my mum lor. U leh? Reach ho...          ham   
3  Your 2004 account for 07XXXXXXXXX shows 786 un...         spam   
4  Do you want a new video handset? 750 anytime a...         spam   

  predicted_label  
0             ham  
1             ham  
2             ham  
3            spam  
4            spam  
