# 1. Import Libraries

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import math

from collections import defaultdict
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold, cross_val_score

from sklearn.naive_bayes import MultinomialNB

# 2. Load the Dataset

In [47]:
path = '/content/spam.csv'
dataset = pd.read_csv(path, encoding='ISO-8859-1')
print("Dataset Loaded:\n", dataset.head())

Dataset Loaded:
      v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [48]:
# Keep only the first two columns: 'v1' is label, 'v2' is message
dataset = dataset[['v1', 'v2']]
dataset.columns = ['label', 'message']
print(dataset.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [49]:
print(dataset.shape)

(5572, 2)


In [50]:
print(dataset.isnull().sum())

label      0
message    0
dtype: int64


In [51]:
print(dataset.dtypes)

label      object
message    object
dtype: object


# 3. Preprocess the Dataset

In [52]:
dataset['label'] = dataset['label'].map({'ham': 0, 'spam': 1})
print(dataset.head())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [53]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove punctuation
    return text

dataset['clean_message'] = dataset['message'].apply(clean_text)
print(dataset.head())

   label                                            message  \
0      0  Go until jurong point, crazy.. Available only ...   
1      0                      Ok lar... Joking wif u oni...   
2      1  Free entry in 2 a wkly comp to win FA Cup fina...   
3      0  U dun say so early hor... U c already then say...   
4      0  Nah I don't think he goes to usf, he lives aro...   

                                       clean_message  
0  go until jurong point crazy available only in ...  
1                            ok lar joking wif u oni  
2  free entry in 2 a wkly comp to win fa cup fina...  
3        u dun say so early hor u c already then say  
4  nah i dont think he goes to usf he lives aroun...  


In [54]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset['clean_message'])  # sparse matrix
y = dataset['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(dataset.dtypes)

label             int64
message          object
clean_message    object
dtype: object


# 4. Reference Naive Bayes Classifier

In [55]:
nbc_ref = MultinomialNB()

start_time = datetime.now()
nbc_ref.fit(X_train, y_train)
end_time = datetime.now()

y_pred = nbc_ref.predict(X_test)

In [56]:
# Metrics
accuracy = accuracy_score(y_test, y_pred)
execution_time = (end_time - start_time).microseconds
report = classification_report(y_test, y_pred)

print(f"\nReference Naive Bayes Classifier")
print(f"\nExecution Time: {execution_time} mcs")
print(f"\nR2 score: {accuracy:.4f}")
print(f"\nClassification Report:\n{report}")


Reference Naive Bayes Classifier

Execution Time: 3311 mcs

R2 score: 0.9785

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.92      0.92      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [57]:
# Cross-Validation
n_folds = 10

cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)
scores_ref = cross_val_score(nbc_ref, X, y, scoring='accuracy', cv=cv)

In [58]:
print(f"The mean Accuracy score for {n_folds} for the Reference Naive Bayes Classifier is: {scores_ref.mean():.4f}")

The mean Accuracy score for 10 for the Reference Naive Bayes Classifier is: 0.9772


# 5. Custom Bagging Regressor

In [59]:
class CustomMultinomialNB:
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Laplace smoothing
        self.class_log_prior_ = {}
        self.feature_log_prob_ = {}
        self.classes_ = []
        self.vocab_size = 0

    def get_params(self, deep=True):
        return {"alpha": self.alpha}

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

    def fit(self, X, y):
        n_docs, n_features = X.shape
        self.classes_ = np.unique(y)
        self.vocab_size = n_features

        # Count documents and total word counts per class
        class_counts = defaultdict(int)
        feature_counts = {cls: np.zeros(n_features) for cls in self.classes_}

        for i in range(n_docs):
            cls = y[i]
            class_counts[cls] += 1
            feature_counts[cls] += X[i].toarray()[0]

        # Compute priors
        total_docs = len(y)
        for cls in self.classes_:
            self.class_log_prior_[cls] = math.log(class_counts[cls] / total_docs)

        # Compute conditional probabilities with Laplace smoothing
        for cls in self.classes_:
            total_words = np.sum(feature_counts[cls])
            smoothed = feature_counts[cls] + self.alpha
            self.feature_log_prob_[cls] = np.log(smoothed / (total_words + self.alpha * n_features))

    def predict(self, X):
        predictions = []
        for i in range(X.shape[0]):
            log_probs = {}
            x_i = X[i].toarray()[0]

            for cls in self.classes_:
                log_prob = self.class_log_prior_[cls]
                log_prob += np.sum(x_i * self.feature_log_prob_[cls])
                log_probs[cls] = log_prob

            predicted_class = max(log_probs, key=log_probs.get)
            predictions.append(predicted_class)
        return np.array(predictions)

In [60]:
nbc_cus = CustomMultinomialNB()

start_time = datetime.now()
nbc_cus.fit(X_train, y_train)
end_time = datetime.now()

y_pred = nbc_cus.predict(X_test)

In [61]:
# Metrics
accuracy = accuracy_score(y_test, y_pred)
execution_time = (end_time - start_time).microseconds
report = classification_report(y_test, y_pred)

print(f"\nCustom Naive Bayes Classifier")
print(f"\nExecution Time: {execution_time} mcs")
print(f"\nR2 score: {accuracy:.4f}")
print(f"\nClassification Report:\n{report}")


Custom Naive Bayes Classifier

Execution Time: 373898 mcs

R2 score: 0.9785

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.92      0.92      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [62]:
# Cross-Validation
n_folds = 10

cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)
scores_cus = cross_val_score(nbc_cus, X, y, scoring='accuracy', cv=cv)

In [63]:
print(f"The mean Accuracy score for {n_folds} for the Custom Naive Bayes Classifier is: {scores_cus.mean():.4f}")

The mean Accuracy score for 10 for the Custom Naive Bayes Classifier is: 0.9772
