# AI Lab 8 - Naive Bayes from Scratch
__By Hasnain Naeem (212728), BSCS-7B, NUST__

__Description__: Naive bayes implementation from scratch using Numpy and Pandas. Although binary classification is demonstrated but it works for more than 2 classes. 

## Imports

In [40]:
from tabulate import tabulate
import numpy as np
import pandas as pd

## Defining Training Set Dataframe

In [41]:
# dataframe columns
column_names=["department", "status", "age", "salary"]

# define dataframe data
age_groups = {"A": "21-25", "B": "26-30", "C": "31-35", "D": "36-40", 
"E": "41-45", "F": "46-50"}

salary_groups = {"A": "26-30", "B": "31-35", "C": "36-40", "D": "41-45",
"E": "46-50", "F": "51-55", "G": "56-60", "H": "61-65", "I": "66-70"}

data = {"department": ["sales", "sales", "sales", "systems", "systems",
                        "systems", "systems", "marketing", "marketing",
                        "secretary", "secretary"],
        "status":     ["senior", "junior", "junior", "junior", "senior",
                       "junior", "senior", "senior", "junior", "senior",
                       "junior"],
        "age":        ["C", "B", "C", "A", "C", "B", "E", "D", "C", "F", "B"],
        "salary":     ["E", "A", "B", "E", "I", "E", "I", "E", "D", "C", "A"]
        }
                 
# create dataframe
df = pd.DataFrame(data)

In [42]:
print("Training Set:")
print(tabulate(df))

Training Set:
--  ---------  ------  -  -
 0  sales      senior  C  E
 1  sales      junior  B  A
 2  sales      junior  C  B
 3  systems    junior  A  E
 4  systems    senior  C  I
 5  systems    junior  B  E
 6  systems    senior  E  I
 7  marketing  senior  D  E
 8  marketing  junior  C  D
 9  secretary  senior  F  C
10  secretary  junior  B  A
--  ---------  ------  -  -


## Naive Bayes Implementation

In [43]:
def get_feature_table(feature_name, feature_column, label_column, pc=0):
    """
        Takes the feature column and label column to calculate the probability tables.
        Parameters:
        pc: pseudocount constant used for laplace smoothing
    
    """
    
    # get column names for the feature dataframe
    label_values = list(label_column.unique())
    feature_df_cols = label_values.copy()
    # get list of column names as: feature_name, label values...
    feature_df_cols.insert(0, feature_name)
    
    feature_values = feature_column.unique()
    feature_table = pd.DataFrame([[feature_val, 0, 0] for feature_val in feature_values], columns=feature_df_cols)
    feature_table = feature_table.set_index(feature_name)    
    
    # fill the feature table with the counts of each feature value
    label_count = {label_value: 0 for label_value in label_values}
    # count the values
    for i, feature_value in feature_column.iteritems():
        feature_table[label_column[i]][feature_value] = feature_table[label_column[i]][feature_value] + 1
        label_count[label_column[i]] += 1
    
    # divide to get probabilities
    for label_value in label_values:
        feature_table[label_value] = (feature_table[label_value] + pc) / (label_count[label_value] + (pc * len(label_values)))
        
    return feature_table

In [44]:
# select the feature to be predicted;
label_class = "status"

# column containing the values of the label class
label_column = df[label_class]

# get the prior probabilities of values of each label class
label_counts = label_column.value_counts()
label_prob = label_counts / label_column.size

label_classes = [label_class for label_class, _ in label_prob.iteritems()]

# get the prior probabilties of features
feature_prior_prob = {column_name:None for column_name in column_names}
for feature_name, feature_val_prior_prob in feature_prior_prob.items():
    val_counts = df[feature_name].value_counts()
    val_prob = val_counts / label_column.size
    feature_prior_prob[feature_name] = val_prob

### Calculate Feature Tables without Laplace Smoothing

In [45]:
# generate the probability tables for each feature
feature_tables = {}
for col_name, col_data in df.iteritems():
    feature_table = get_feature_table(col_name, col_data, label_column)
    feature_tables[col_name] = feature_table

In [46]:
print("Feature Tables")
print("_______________\n")
for feature_name, feature_table in feature_tables.items():
    print("Feature Name: "+feature_name)
    print("Feature Table:")
    print(tabulate(feature_table))
    print()

Feature Tables
_______________

Feature Name: department
Feature Table:
---------  ---  --------
sales      0.2  0.333333
systems    0.4  0.333333
marketing  0.2  0.166667
secretary  0.2  0.166667
---------  ---  --------

Feature Name: status
Feature Table:
------  -  -
senior  1  0
junior  0  1
------  -  -

Feature Name: age
Feature Table:
-  ---  --------
C  0.4  0.333333
B  0    0.5
A  0    0.166667
E  0.2  0
D  0.2  0
F  0.2  0
-  ---  --------

Feature Name: salary
Feature Table:
-  ---  --------
E  0.4  0.333333
A  0    0.333333
B  0    0.166667
I  0.4  0
D  0    0.166667
C  0.2  0
-  ---  --------



### Calculate Feature Tables with Laplace Smoothing

In [47]:
# generate the probability tables for each feature
feature_tables_with_ls = {}
for col_name, col_data in df.iteritems():
    feature_table = get_feature_table(col_name, col_data, label_column, pc=1) # using pseudocount = 1
    feature_tables_with_ls[col_name] = feature_table

In [48]:
print("Feature Tables")
print("_______________\n")
for feature_name, feature_table in feature_tables_with_ls.items():
    print("Feature Name: "+feature_name)
    print("Feature Table:")
    print(tabulate(feature_table))
    print()

Feature Tables
_______________

Feature Name: department
Feature Table:
---------  --------  -----
sales      0.285714  0.375
systems    0.428571  0.375
marketing  0.285714  0.25
secretary  0.285714  0.25
---------  --------  -----

Feature Name: status
Feature Table:
------  --------  -----
senior  0.857143  0.125
junior  0.142857  0.875
------  --------  -----

Feature Name: age
Feature Table:
-  --------  -----
C  0.428571  0.375
B  0.142857  0.5
A  0.142857  0.25
E  0.285714  0.125
D  0.285714  0.125
F  0.285714  0.125
-  --------  -----

Feature Name: salary
Feature Table:
-  --------  -----
E  0.428571  0.375
A  0.142857  0.375
B  0.142857  0.25
I  0.428571  0.125
D  0.142857  0.25
C  0.285714  0.125
-  --------  -----



## Defining Test Set Dataframe

In [49]:
test_rows = [
    ["marketing", "C", "E"],
    ["sales", "C", "I"]
]

test_columns = column_names.copy()
test_columns.remove(label_class)
test_df = pd.DataFrame(test_rows, columns=test_columns)
print("Test Set:")
print(tabulate(test_df))

Test Set:
-  ---------  -  -
0  marketing  C  E
1  sales      C  I
-  ---------  -  -


## Making Predictions

In [53]:
def make_prediction(i, row, feature_tables):
    """
        Makes predictions depending on the given feature table.
        Parameters:
            i: index of sample
            row: row of test set dataframe
            feature_table: probability tables for each feature 
    """
    
    likelihoods = {label_class: 1 for label_class in label_classes}
    predict_prior_probs = {label_class: 0 for label_class in label_classes}
    predicted_probs = likelihoods.copy()
    
    for feature_name, feature_val in row.iteritems():
        for sample_class, class_prior_prob in label_prob.iteritems():
            likelihoods[sample_class] *= feature_tables[feature_name].loc[feature_val][sample_class]
            predict_prior_probs[sample_class] += feature_prior_prob[feature_name][feature_val]
            
    # Calculate Feature Tables without Laplace Smoothing
    for sample_class, class_prior_prob in label_prob.iteritems():
        predicted_probs[sample_class] = (likelihoods[sample_class] * label_prob[sample_class]) / predict_prior_probs[sample_class]
        print("\tProbability of class '" + sample_class+"': " + str(predicted_probs[sample_class]))
    print("\tPredicted class: '" + max(predicted_probs, key=predicted_probs.get)+"'")

### Making Predictions without Laplace Smoothing

In [51]:
for i, row in test_df.iterrows():
    print("Making prediction on " + str(i) +"th sample:")
    make_prediction(i, row, feature_tables)
    print()

Making prediction on 0th sample:
	Probability of class 'junior': 0.01111111111111111
	Probability of class 'senior': 0.016000000000000004
	Predicted class: 'senior'

Making prediction on 1th sample:
	Probability of class 'junior': 0.0
	Probability of class 'senior': 0.017777777777777785
	Predicted class: 'senior'



### Making Predictions with Laplace Smoothing

In [52]:
for i, row in test_df.iterrows():
    print("Making prediction on " + str(i) +"th sample:")
    make_prediction(i, row, feature_tables_with_ls)
    print()

Making prediction on 0th sample:
	Probability of class 'junior': 0.021093749999999998
	Probability of class 'senior': 0.02623906705539358
	Predicted class: 'senior'

Making prediction on 1th sample:
	Probability of class 'junior': 0.011718749999999998
	Probability of class 'senior': 0.029154518950437313
	Predicted class: 'senior'

