# NAIVE BAYES

$$\hat{y} = \arg\max_{y} P(y) \prod_{i=1}^{n} P(x_i \mid y)$$

$$P(x_i \mid y) = \hat{\theta_{yi}} = \dfrac{N_{yi} + \alpha}{N_{y} + {\alpha}n}$$

In [23]:
import numpy as np
from collections import defaultdict
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

In [27]:
def sentences_process(sentences):
    sentences = sentences.lower()
    cleaned_string = re.sub(r'[^a-zA-Z0-9 ]+', '', sentences)
    cleaned_string = ' '.join( cleaned_string.split())
    return cleaned_string

In [5]:
print(ENGLISH_STOP_WORDS)

frozenset({'ever', 'none', 'perhaps', 'is', 'what', 'between', 'thru', 'with', 'only', 'anyway', 'please', 'yet', 'although', 'hasnt', 'them', 'give', 'always', 'etc', 'least', 'would', 'thereafter', 'interest', 'i', 'for', 'against', 'enough', 'done', 'behind', 'though', 'four', 'were', 'yourselves', 'few', 'else', 'call', 'from', 'which', 'whence', 'via', 'within', 'of', 'same', 'themselves', 're', 'sincere', 'us', 'twenty', 'could', 'cant', 'ours', 'why', 'their', 'who', 'mostly', 'itself', 'too', 'everything', 'up', 'across', 'part', 'will', 'before', 'therein', 'made', 'whenever', 'everyone', 'anything', 'her', 'nothing', 'my', 'without', 'hereupon', 'thereupon', 'whereby', 'nor', 'through', 'since', 'also', 'or', 'such', 'besides', 'detail', 'afterwards', 'again', 'whatever', 'both', 'are', 'out', 'at', 'another', 'because', 'they', 'whole', 'beforehand', 'this', 'un', 'anyhow', 'two', 'indeed', 'twelve', 'latter', 'de', 'seemed', 'take', 'onto', 'its', 'mine', 'that', 'describe'

In [49]:
class NaiveBayesClassifier:
    def __init__ (self, alpha = 1.0):
        self.classes = None
        self.class_prob = defaultdict(float)
        self.feature_prob = defaultdict(lambda : defaultdict(float))
        self.vocab = set()
        self.stop_words = ENGLISH_STOP_WORDS
        self.alpha = alpha
    def fit(self, X, y):
        class_counts = defaultdict(int)
        feature_counts = defaultdict(lambda : defaultdict(int))
        total_docs = len(y)
        for i in range(total_docs):
            label = y[i]
            class_counts[label] += 1
            split_sentences = (sentences_process(X[i])).split()
            for word in split_sentences:
                if word not in self.stop_words:
                    self.vocab.add(word)
                    feature_counts[label][word] += 1
        self.classes = class_counts.keys()
        for label in class_counts:
            self.class_prob[label] = class_counts[label] / total_docs
        for label in self.classes:
            total_words_in_class = sum(feature_counts[label].values())
            for word in self.vocab:
                self.feature_prob[label][word] = (feature_counts[label][word] + self.alpha) / (total_words_in_class + self.alpha*len(self.vocab))
    def predict(self, X):
        if not isinstance(X, list) : X = [X]
        predictions = []
        for doc in X:
            class_scores = {}
            split_senteces = (sentences_process(doc)).split()
            for label in self.classes:
                class_scores[label] = np.log(self.class_prob[label])
                for word in split_senteces:
                    if word in self.vocab and word not in self.stop_words:
                        class_scores[label] += np.log(self.feature_prob[label][word])
            predictions.append(max(class_scores, key = class_scores.get))
        return predictions
    def score(self, X, y):
        predictions = self.predict(X)
        accuracy = np.mean(np.array(predictions) == np.array(y))
        return accuracy
    def get_params(self, deep=True):
        return {"alpha": self.alpha}
    
    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self

In [50]:
file_path = r"Naive_Bayes.xlsx"
df = pd.read_excel(file_path)
df = df.dropna()
X = df["Sentences"]
y = df["Label"]
print(len(X), len(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
param_grid = {'alpha' : [0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(NaiveBayesClassifier(), param_grid, cv=5, verbose=3) #cv = 5 chia tap du lieu thanh bao nhieu phan
grid_search.fit(X.tolist(), y.tolist())
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
nb_classifier = NaiveBayesClassifier(alpha = grid_search.best_params_['alpha'])
nb_classifier.fit(X_train.tolist(), y_train.tolist())
print(len(X_test))
y_pred = nb_classifier.predict(X_test.tolist())
print(len(y_test))
accuracy = accuracy_score(y_test, y_pred)

print("Dự đoán:", accuracy)

211 211
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ........................alpha=0.01;, score=0.837 total time=   0.0s
[CV 2/5] END ........................alpha=0.01;, score=0.857 total time=   0.0s
[CV 3/5] END ........................alpha=0.01;, score=0.952 total time=   0.0s
[CV 4/5] END ........................alpha=0.01;, score=0.714 total time=   0.0s
[CV 5/5] END ........................alpha=0.01;, score=0.786 total time=   0.0s
[CV 1/5] END .........................alpha=0.1;, score=0.837 total time=   0.0s
[CV 2/5] END .........................alpha=0.1;, score=0.857 total time=   0.0s
[CV 3/5] END .........................alpha=0.1;, score=0.952 total time=   0.0s
[CV 4/5] END .........................alpha=0.1;, score=0.714 total time=   0.0s
[CV 5/5] END .........................alpha=0.1;, score=0.786 total time=   0.0s
[CV 1/5] END ...........................alpha=1;, score=0.791 total time=   0.0s
[CV 2/5] END ...........................a

In [51]:
print(nb_classifier.predict("I love you"))

['Positive']
