In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
import re

In [24]:
# Load the dataset from 'spamdb.csv' (a relative path, resolved against the current
# working directory), decoding the file as cp1251.
# NOTE(review): cp1251 is presumably needed because the file is not valid UTF-8 - confirm.
data = pd.read_csv('spamdb.csv', encoding='cp1251')

In [25]:
# Preview the first rows to see which columns were read from the CSV.
data.head()

Unnamed: 0,class,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [26]:
# Remove the three trailing 'Unnamed' columns (they are empty in the rows shown by head()).
# drop() returns a new frame instead of mutating in place, and errors='ignore' skips
# columns that are already gone, so this cell can be re-run without raising KeyError.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
data.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# Encode the label as an integer: 'spam' -> 1, anything else -> 0.
# The already-encoded value 1 is accepted as well, which makes the cell idempotent:
# re-running it on the numeric column no longer resets every spam label to 0.
data['class'] = data['class'].apply(lambda x: 1 if x in (1, 'spam') else 0)

In [28]:
# Check that 'class' now holds the numeric labels (1 = spam, 0 = not spam).
data.head()

Unnamed: 0,class,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
def clear(text):
    """Normalise a message to lowercase ASCII words separated by single spaces.

    The text is lowercased, every character that is not an ASCII letter
    (digits, punctuation, whitespace, non-ASCII characters) is replaced by a
    space, and the remaining words are re-joined with exactly one space.

    Args:
        text: the raw message as a ``str``.

    Returns:
        The cleaned string; empty when the input contains no ASCII letters.
    """
    lowered = text.lower()
    letters_only = re.sub('[^a-zA-Z]', ' ', lowered)
    # split() without arguments collapses runs of whitespace and trims both ends.
    tokens = letters_only.split()
    return ' '.join(tokens)

In [30]:
# Store a cleaned copy of every message (lowercase ASCII words only) in 'lemm'.
# Despite the column name, clear() only normalises the text; it does not lemmatise.
data['lemm'] = data['text'].map(clear)

In [31]:
# Compare the raw 'text' column with its cleaned counterpart 'lemm'.
data.head()

Unnamed: 0,class,text,lemm
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...


In [32]:
# Check the class balance (0 = not spam, 1 = spam). The recorded output shows
# 4825 vs 747 messages, i.e. roughly 87% / 13%, so the classes are imbalanced.
data['class'].value_counts()

class
0    4825
1     747
Name: count, dtype: int64

In [34]:
# Features are the cleaned messages; the target is the numeric spam flag.
X, y = data['lemm'], data['class']

# Hold out 20% of the rows for evaluation. stratify=y keeps the class ratio the
# same in both parts, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [37]:
class NaiveBayesClassifier:
    """Binary multinomial Naive Bayes text classifier with add-one smoothing.

    Documents are strings of whitespace-separated tokens and labels are the
    integers 0 and 1.

    Attributes:
        vocab: set of every token seen during ``fit`` (both classes).
        word_counts: ``{label: defaultdict(int)}`` mapping token -> occurrences.
        class_counts: ``{label: number of training documents}``.
        class_priors: ``{label: fraction of training documents}``; empty until
            ``fit`` has been called.
        total_words: ``{label: total number of token occurrences}``.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """Discard all learned statistics so that training starts from scratch."""
        self.vocab = set()
        self.word_counts = {
            0: defaultdict(int),
            1: defaultdict(int)
        }
        self.class_counts = {0: 0, 1: 0}
        self.class_priors = {}
        self.total_words = {0: 0, 1: 0}

    def fit(self, X, y):
        """Learn token counts and class priors from labelled documents.

        Every call retrains from scratch: statistics from a previous ``fit``
        are discarded rather than accumulated.

        Args:
            X: iterable of documents (strings of whitespace-separated tokens).
            y: iterable of labels, each 0 or 1, aligned with ``X``.

        Raises:
            KeyError: if a label is neither 0 nor 1.
            ZeroDivisionError: if no (document, label) pair is supplied.
        """
        self._reset()

        for text, label in zip(X, y):
            self.class_counts[label] += 1
            for word in text.split():
                self.word_counts[label][word] += 1
                self.vocab.add(word)
                self.total_words[label] += 1

        # Use the number of documents actually counted rather than len(y), so the
        # priors always sum to 1 even when zip() stopped early on a shorter X.
        total_docs = self.class_counts[0] + self.class_counts[1]
        self.class_priors[0] = self.class_counts[0] / total_docs
        self.class_priors[1] = self.class_counts[1] / total_docs

    def predict(self, X):
        """Predict a label for every document in ``X``.

        Each class is scored with ``log(prior) + sum(log((count + 1) / (total + V)))``
        where ``V`` is the vocabulary size, so tokens never seen in training
        still get a non-zero probability. Prediction does not modify the model.

        Args:
            X: iterable of documents (strings of whitespace-separated tokens).

        Returns:
            A list of ints; 1 when class 1 has the strictly higher score,
            otherwise 0 (ties go to class 0).

        Raises:
            KeyError: if the model has not been fitted.
        """
        # None of these depend on the document or the token, so compute them once.
        vocab_size = len(self.vocab)
        denom_0 = self.total_words[0] + vocab_size
        denom_1 = self.total_words[1] + vocab_size
        log_prior_0 = np.log(self.class_priors[0])
        log_prior_1 = np.log(self.class_priors[1])
        counts_0 = self.word_counts[0]
        counts_1 = self.word_counts[1]

        predictions = []
        for text in X:
            log_prob_0 = log_prior_0
            log_prob_1 = log_prior_1
            for word in text.split():
                # .get() instead of indexing: indexing a defaultdict would insert
                # every unseen token into the model as a side effect of predicting.
                log_prob_0 += np.log((counts_0.get(word, 0) + 1) / denom_0)
                log_prob_1 += np.log((counts_1.get(word, 0) + 1) / denom_1)
            predictions.append(1 if log_prob_1 > log_prob_0 else 0)
        return predictions

    def score(self, X, y_true):
        """Return the accuracy of ``predict(X)`` against ``y_true``.

        Args:
            X: iterable of documents.
            y_true: sized iterable of the true labels, aligned with ``X``.

        Returns:
            Fraction of correct predictions, between 0 and 1.

        Raises:
            ZeroDivisionError: if ``y_true`` is empty.
        """
        y_pred = self.predict(X)
        correct = sum(1 for yp, yt in zip(y_pred, y_true) if yp == yt)
        return correct / len(y_true)

In [38]:
# Train on the training split, then report accuracy on the held-out test split.
nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)

# NOTE(review): the classes are imbalanced (value_counts shows 4825 vs 747), so a model
# that always predicts 0 would already score about 0.87 accuracy; consider also
# reporting precision/recall for the spam class.
accuracy = nb.score(X_test, y_test)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9740
