# Naive-Bayes Model
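This notebook implements a simple Naive Bayes text classifier from scratch (word counts with add-one smoothing), trains it on a small set of labelled sentences, and evaluates its predictions on a few test sentences.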

In [1]:
import pandas as pd
import numpy as np

In [2]:
import nltk

In [3]:
class NBayesClass:
    """Minimal Naive Bayes classifier over word-count features.

    Typical use: call ``calc_probas`` on a document-term count matrix and its
    labels, then ``predict_class`` on tokenized documents, then ``metrics``
    with the true labels.

    NOTE: class priors P(class) are not part of the score; a prediction is
    driven only by the smoothed per-class word probabilities.
    """

    def __init__(self):
        # DataFrame of smoothed P(word | class): one row per class label,
        # one column per training word. Set by calc_probas().
        self.probas = None
        # Series of predicted labels from the latest predict_class() call.
        self.predict = None

    def calc_probas(self, features, targets):
        """Estimate add-one smoothed word probabilities for every class.

        Parameters
        ----------
        features : pandas.DataFrame
            Word counts, one row per document and one column per word.
        targets : pandas.Series or list-like
            Class label of each row in ``features``.

        Returns
        -------
        pandas.DataFrame
            Rows are class labels (index name ``class_labels``), columns are
            words, values are (count + 1) / (class word total + vocabulary size).
        """
        # work on a copy so the caller's frame does not gain a 'class_labels' column
        df = features.copy()
        df['class_labels'] = targets
        # feature counts by class
        freq = df.groupby('class_labels').sum()
        # vocabulary size: added to each class total to balance the +1 per word
        vocab_size = len(freq.columns)
        # smoothed total word count per class
        class_totals = freq.sum(axis=1) + vocab_size
        # probability of each word appearing in each class
        self.probas = (freq + 1).div(class_totals, axis=0)
        return self.probas

    def predict_class(self, features):
        """Predict a class label for each tokenized document.

        ``calc_probas`` must have been called first.

        Parameters
        ----------
        features : pandas.Series
            Each element is an iterable of word tokens for one document.

        Returns
        -------
        pandas.Series
            Predicted label per document (also stored in ``self.predict``).
            A document with no known words gets the first class in
            ``self.probas``, because every class scores the same.
        """
        known_words = self.probas.columns
        # sum log-probabilities rather than multiplying raw probabilities,
        # so long documents do not underflow to 0.0 for every class
        log_probas = np.log(self.probas)

        def process(feat_vals):
            # drop words not seen in training; keep repeats so a word that
            # occurs k times contributes k times to the score
            feat_trn = [word for word in feat_vals if word in known_words]
            class_scores = log_probas[feat_trn].sum(axis=1)
            # label of the highest-scoring class
            return class_scores.idxmax()

        self.predict = features.apply(process)
        return self.predict

    def metrics(self, actual):
        """Print accuracy plus macro-averaged precision and recall.

        Compares ``self.predict`` (set by ``predict_class``) with ``actual``.
        A per-class precision or recall with a zero denominator is recorded
        as NaN and left out of the average.

        Parameters
        ----------
        actual : pandas.Series
            True class labels, indexed like ``self.predict``.
        """
        nan = float('nan')

        # calculate overall accuracy
        n_obs = len(self.predict)
        n_correct = int((self.predict == actual).sum())
        accuracy = n_correct / n_obs if n_obs else nan

        # every label that was either predicted or actually present
        classes = pd.concat([self.predict, actual]).unique()

        df_prec_rec = pd.DataFrame(index=['TP', 'FP', 'FN', 'precision', 'recall'])

        # calculate true positives, false positives and false negatives for each class
        for cls in classes:
            pred_is_cls = self.predict == cls
            actual_is_cls = actual == cls
            TP = int((pred_is_cls & actual_is_cls).sum())
            FP = int((pred_is_cls & ~actual_is_cls).sum())
            FN = int((~pred_is_cls & actual_is_cls).sum())
            # guard the denominators: a class that is never predicted (or never
            # occurs) would otherwise raise ZeroDivisionError
            precision = TP / (TP + FP) if (TP + FP) else nan
            recall = TP / (TP + FN) if (TP + FN) else nan
            df_prec_rec[cls] = [TP, FP, FN, precision, recall]

        # macro average over classes (mean() skips NaN entries)
        precision = df_prec_rec.loc['precision', :].mean()
        recall = df_prec_rec.loc['recall', :].mean()

        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)

# Example

## Input

In [4]:
# toy training corpus: each sentence in 'text' is labelled by the entry at
# the same position in 'tag'
word_dict = dict(
    text=[
        "a great game",
        "the election was over",
        "very clean match",
        "a clean but forgettable game",
        "it was a close election",
    ],
    tag=[
        "sports",
        "not_sports",
        "sports",
        "sports",
        "not_sports",
    ],
)

In [5]:
# tabular view of the corpus: one row per sentence, columns 'text' and 'tag'
word_df = pd.DataFrame(data=word_dict)
word_df

Unnamed: 0,text,tag
0,a great game,sports
1,the election was over,not_sports
2,very clean match,sports
3,a clean but forgettable game,sports
4,it was a close election,not_sports


In [6]:
# training features: the first column of word_df (the raw sentences)
X = word_df[word_df.columns[0]]
X

0                    a great game
1           the election was over
2                very clean match
3    a clean but forgettable game
4         it was a close election
Name: text, dtype: object

In [7]:
# training targets: the second column of word_df (the class labels)
y = word_df[word_df.columns[1]]
y

0        sports
1    not_sports
2        sports
3        sports
4    not_sports
Name: tag, dtype: object

In [8]:
# held-out sentences used to exercise the trained model
X_test = pd.Series(
    [
        "a very close game",
        "the election was over",
        "game over match it",
    ]
)

In [9]:
# true labels for the held-out sentences, in the same order as X_test
y_test = pd.Series(
    [
        "sports",
        "not_sports",
        "not_sports",
    ]
)

## Word Dataframe
Create a document-term matrix of word counts for the training dataset (one row per document, one column per word)

In [10]:
# build a document-term matrix of per-document word counts
# input is an iterable of text strings (e.g. a pandas Series of documents)

def create_word_mat(df_docs_text, tokenizer=None):
    """Build a document-term matrix of word counts.

    Parameters
    ----------
    df_docs_text : iterable of str
        The documents to process, e.g. a pandas Series of sentences.
    tokenizer : callable, optional
        Function that splits one document into a sequence of tokens.
        Defaults to ``nltk.word_tokenize``.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per distinct alphanumeric token.
        Each cell holds how often the token occurs in that document, with 0
        where it does not occur.
    """
    if tokenizer is None:
        tokenizer = nltk.word_tokenize

    # one {word: count} dict per document
    word_dicts = []

    for doc in df_docs_text:
        word_counts = {}
        # split the document into tokens and count only alphanumeric ones
        # (punctuation tokens are skipped)
        for word in tokenizer(doc):
            if word.isalnum():
                # look the word up by key so repeated words accumulate
                word_counts[word] = word_counts.get(word, 0) + 1
        word_dicts.append(word_counts)

    # words missing from a document come back as NaN; store them as 0
    df_words = pd.DataFrame.from_records(word_dicts).fillna(0)
    return df_words

In [11]:
# word-count matrix for the training sentences
df_words = create_word_mat(df_docs_text=X)
df_words

Unnamed: 0,a,but,clean,close,election,forgettable,game,great,it,match,over,the,very,was
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


## Word Probabilities
Calculate probabilities of each word per class label

In [12]:
# create an untrained classifier; model.probas and model.predict are None
# until calc_probas() and predict_class() are called
model = NBayesClass()

In [13]:
# fit the model: smoothed probability of each word within each class
df_probas = model.calc_probas(features=df_words, targets=y)
df_probas

Unnamed: 0_level_0,a,but,clean,close,election,forgettable,game,great,it,match,over,the,very,was
class_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
not_sports,0.086957,0.043478,0.043478,0.086957,0.130435,0.043478,0.043478,0.043478,0.086957,0.043478,0.086957,0.086957,0.043478,0.130435
sports,0.12,0.08,0.12,0.04,0.04,0.08,0.12,0.08,0.04,0.08,0.04,0.04,0.08,0.04


## Predictions
Predict class labels for test dataset

In [14]:
# pre-process test features: split every test sentence into a list of tokens
words = X_test.apply(nltk.word_tokenize)
words

0        [a, very, close, game]
1    [the, election, was, over]
2       [game, over, match, it]
dtype: object

In [15]:
# predicted label for each tokenized test sentence
predicts = model.predict_class(features=words)
predicts

0        sports
1    not_sports
2        sports
dtype: object

## Evaluate
Retrieve model metrics

In [16]:
# compare the stored predictions with the true test labels; prints accuracy
# and the per-class mean of precision and recall
model.metrics(y_test)

Accuracy: 0.6666666666666666
Precision: 0.75
Recall: 0.75
