# Naive Bayes - Object Oriented (OOP)

In [1]:
import pandas as pd
from pandas.api.types import is_string_dtype

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

In [2]:
class NaiveBayes():
    """Two-class ('ham' / 'spam') Naive Bayes classifier for short texts.

    Attributes:
        df_p: DataFrame with the columns ``word``, ``prob_ham`` and
            ``prob_spam`` built by :meth:`fit`; ``None`` until then.
        p_spam: Share of training rows labelled ``'spam'``.
        p_ham: Share of training rows labelled ``'ham'``.
    """

    def __init__(self):
        self.df_p = None
        self.p_spam = None
        self.p_ham = None

    def fit(self, X, y):
        """Learn the per-word probabilities and the class priors.

        Args:
            X: DataFrame (or Series) whose string columns hold the texts.
            y: Labels, one per row of ``X`` and in the same order; a Series,
                a single-column DataFrame or any list-like. Rows are matched
                by position, not by index.

        Raises:
            ValueError: If ``y`` does not have one label per row of ``X`` or
                does not contain both ``'ham'`` and ``'spam'``.
        """
        X = self.__prepare_data(X)
        self.df_p = self.__calculate_df_p(X, y)
        self.p_spam, self.p_ham = self.__calculate_p_classes(y)

    def predict(self, X):
        """Classify the first row of ``X`` and print both posteriors.

        Only the first row is classified; further rows are ignored.

        Args:
            X: DataFrame (or Series) with the same layout as used in ``fit``.

        Returns:
            ``'ham'`` or ``'spam'``. Ties go to ``'spam'``.

        Raises:
            RuntimeError: If the model has not been fitted yet.
            ValueError: If ``X`` has no rows.
        """
        if self.df_p is None or self.p_spam is None or self.p_ham is None:
            raise RuntimeError("fit() must be called before predict()")

        X = self.__prepare_data(X)
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one row")

        query = X['_nbc'].iloc[0]
        classified, p_final_ham, p_final_spam = self.__classify(query)

        print(f"Ham: {p_final_ham:.5f}, Spam: {p_final_spam:.5f}")

        return classified

    def __classify(self, query):
        """Return ``(label, p_ham, p_spam)`` for one already-prepared text."""
        lookup = {
            word: (prob_ham, prob_spam)
            for word, prob_ham, prob_spam in zip(
                self.df_p['word'], self.df_p['prob_ham'], self.df_p['prob_spam']
            )
        }

        score_ham = float(self.p_ham)
        score_spam = float(self.p_spam)
        norm = score_ham + score_spam
        score_ham /= norm
        score_spam /= norm

        for word in str(query).split():
            probs = lookup.get(word)
            if probs is None:
                # A word never seen in training carries no evidence.
                continue
            score_ham *= float(probs[0])
            score_spam *= float(probs[1])
            # Renormalise after every word: the result is the same posterior
            # as dividing once at the end, but the running product can no
            # longer underflow to 0/0 on long messages.
            norm = score_ham + score_spam
            score_ham /= norm
            score_spam /= norm

        if score_ham > score_spam:
            classified = 'ham'
        else:
            classified = 'spam'

        return classified, score_ham, score_spam

    def __prepare_data(self, X):
        """Return a cleaned copy of ``X`` with the merged ``_nbc`` column."""
        if isinstance(X, pd.Series):
            X = X.to_frame()
        X = self.__lower_case(X)  # works on a copy; the caller's data is kept
        X = self.__remove_special(X)
        X = self.__apply_nlp(X)
        X = self.__merge_columns(X)
        return X

    def __merge_columns(self, X):
        """Join all string columns of each row into ``X['_nbc']``."""
        # '_nbc' as NaiveBayesClassifier
        text_columns = [
            column for column in X.columns
            if column != '_nbc' and is_string_dtype(X.dtypes[column])
        ]
        if text_columns:
            parts = [
                X[column].fillna('').astype(str).tolist()
                for column in text_columns
            ]
            merged = [' '.join(row).strip() for row in zip(*parts)]
        else:
            merged = [''] * len(X)
        X["_nbc"] = merged
        return X

    def __labels(self, y, expected_length=None):
        """Return ``y`` as a Series with a fresh 0..n-1 index."""
        if isinstance(y, pd.DataFrame):
            if y.shape[1] != 1:
                raise ValueError("y must contain exactly one label column")
            y = y.iloc[:, 0]
        labels = pd.Series(y).reset_index(drop=True)
        if expected_length is not None and len(labels) != expected_length:
            raise ValueError("X and y must have the same number of rows")
        return labels

    def __class_totals(self, labels):
        """Return ``(total_spam, total_ham)`` message counts."""
        total_spam = int((labels == 'spam').sum())
        total_ham = int((labels == 'ham').sum())
        if total_spam == 0 or total_ham == 0:
            raise ValueError("y must contain both 'ham' and 'spam' labels")
        return total_spam, total_ham

    def __words_count(self, X, y):
        """Count word occurrences per label.

        Returns:
            DataFrame with the columns ``word``, ``label`` and ``count``.
        """
        labels = self.__labels(y, len(X))
        pairs = []
        for text, label in zip(X['_nbc'], labels):
            if isinstance(text, str):
                pairs.extend((word, label) for word in text.split())
        df_pairs = pd.DataFrame(pairs, columns=['word', 'label'])
        return df_pairs.groupby(['word', 'label']).size().reset_index(name='count')

    def __calculate_p_classes(self, y):
        """Store and return the priors ``(p_spam, p_ham)``."""
        labels = self.__labels(y)
        total_spam, total_ham = self.__class_totals(labels)

        self.p_spam = total_spam / len(labels)
        self.p_ham = total_ham / len(labels)

        return self.p_spam, self.p_ham

    def __calculate_df_p(self, X, y):
        """Build, store and return the per-word probability table.

        NOTE: the estimate is the word's occurrence count within a class
        divided by the number of messages of that class; a word missing from
        a class is counted once so that its probability is never zero.
        """
        columns = ['word', 'prob_ham', 'prob_spam']

        labels = self.__labels(y, len(X))
        total_spam, total_ham = self.__class_totals(labels)
        df_count = self.__words_count(X, labels)

        counts = {'ham': {}, 'spam': {}}
        for word, label, count in zip(
            df_count['word'], df_count['label'], df_count['count']
        ):
            if label in counts:
                counts[label][word] = count
        counted_words = set(df_count['word'])

        rows = []
        for word in self.__get_words_list(X):
            if word not in counted_words:
                continue
            p_word_ham = counts['ham'].get(word, 1) / total_ham
            p_word_spam = counts['spam'].get(word, 1) / total_spam
            rows.append([word, p_word_ham, p_word_spam])

        self.df_p = pd.DataFrame(rows, columns=columns)
        return self.df_p

    def __lower_case(self, X):
        """Return a copy of ``X`` with every string value lower-cased."""
        df = X.copy()
        for column in df.columns:
            # Column-wise Series.map instead of the deprecated applymap.
            df[column] = df[column].map(
                lambda x: x.lower() if isinstance(x, str) else x
            )
        return df

    def __remove_special(self, X):
        """Strip everything but a-z, 0-9 and spaces from the string columns.

        Modifies ``X`` in place and returns it.
        """
        for column in X.columns:
            if is_string_dtype(X.dtypes[column]):
                X[column] = X[column].str.replace(r"[^a-z 0-9]+", "", regex=True)
        return X

    def __apply_nlp(self, X):
        """Tokenise, drop short/stop words, stem and lemmatise string columns.

        Modifies ``X`` in place and returns it.
        """
        lemmatizer = WordNetLemmatizer()
        stemmer = PorterStemmer()
        tokenizer = RegexpTokenizer(r'\w+')
        # Read the stop-word list once; a set gives O(1) membership tests.
        stop_words = set(stopwords.words('english'))

        def preprocess(sentence):
            if not isinstance(sentence, str):
                # Leave missing values untouched.
                return sentence
            tokens = tokenizer.tokenize(sentence)
            filtered_words = [
                w for w in tokens if len(w) > 2 and w not in stop_words
            ]
            stem_words = [stemmer.stem(w) for w in filtered_words]
            lemma_words = [lemmatizer.lemmatize(w) for w in stem_words]
            return " ".join(lemma_words)

        for column in X.columns:
            if is_string_dtype(X.dtypes[column]):
                X[column] = X[column].map(preprocess)

        return X

    def __get_words_list(self, X):
        """Return the sorted unique words of ``X``.

        Uses the merged ``_nbc`` column when present, otherwise every string
        column.
        """
        if '_nbc' in X.columns:
            source_columns = ['_nbc']
        else:
            source_columns = [
                column for column in X.columns
                if is_string_dtype(X.dtypes[column])
            ]

        words = set()
        for column in source_columns:
            for text in X[column]:
                if isinstance(text, str):
                    words.update(text.split())

        return sorted(words)


In [None]:
# Train the classifier, then classify a sample.
# NOTE(review): X and y are not defined in the visible cells; presumably they
# are loaded in an earlier cell - confirm before running.
model = NaiveBayes()
model.fit(X, y)
# predict() looks only at the first row of its input and returns a single
# label, 'ham' or 'spam'.
prediction = model.predict(X)
