In [1]:
from sklearn import datasets 
# Load the bundled Iris dataset; its `.data` and `.target` attributes are read in the next cell.
iris = datasets.load_iris()

In [2]:
# Feature matrix and class labels, taken straight from the loaded dataset.
# Note that these are references to the arrays stored on `iris`, not copies.
X, Y = iris.data, iris.target

In [3]:
# Confirm the feature matrix and the label vector agree on the number of samples.
X.shape, Y.shape

((150, 4), (150,))

In [4]:
def label(X):
    """Bin a numeric NumPy array into four ordinal codes relative to its mean.

    Each element is mapped by the first test it satisfies:

    * ``0`` if ``x < 0.5 * mean``
    * ``1`` if ``x < mean``
    * ``2`` if ``x < 1.5 * mean``
    * ``3`` otherwise

    Parameters
    ----------
    X : numpy.ndarray
        Values to discretise. The array is NOT modified; passing a view such
        as ``matrix[:, i]`` therefore leaves ``matrix`` untouched.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as ``X`` holding the codes.
    """
    second_limit = X.mean()
    first_limit = 0.5 * second_limit
    third_limit = 1.5 * second_limit

    # Work on a copy so the caller's data (often a view into a larger matrix)
    # is never overwritten.
    labelled = X.copy()
    labelled[:] = 3
    # Apply the tests in reverse priority order, always against the ORIGINAL
    # values: the highest-priority test (x < first_limit) is written last and
    # therefore wins, which reproduces first-match if/elif semantics.
    labelled[X < third_limit] = 2
    labelled[X < second_limit] = 1
    labelled[X < first_limit] = 0
    return labelled

In [5]:
# Start from a fresh copy of the raw measurements so that (a) the arrays stored
# on `iris` are never overwritten and (b) re-running this cell always bins the
# original values instead of re-binning codes produced by a previous run.
X = iris.data.copy()
for i in range(X.shape[-1]):
    # Each column is binned against its own mean by `label`.
    X[:, i] = label(X[:, i])

In [6]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the samples for evaluation; the fixed seed keeps the
# split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [7]:
# Check the sizes of the train and test partitions produced by the split.
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((112, 4), (38, 4), (112,), (38,))

# Self-implemented Naive Bayes Classifier

In [8]:
def fit(X, Y):
    """Build the count table used by the hand-written Naive Bayes classifier.

    Parameters
    ----------
    X : 2-D array of discrete feature values, one row per sample.
    Y : 1-D array of class labels aligned with the rows of ``X``.

    Returns
    -------
    dict
        ``{"total_data_points": len(Y),
           <class>: {"total_class_points": n_rows_in_class,
                     <feature_index>: {<value>: count_in_class, ...}, ...},
           ...}``
        For every feature, the inner mapping has one entry per distinct value
        found in that column of the WHOLE of ``X``; values that never occur
        within a given class are stored with a count of 0.
    """
    model = {"total_data_points": len(Y)}
    for cls in set(Y):
        in_class = Y == cls
        class_rows = X[in_class]
        class_table = {"total_class_points": in_class.sum()}
        for feature in range(X.shape[-1]):
            class_column = class_rows[:, feature]
            # Keys come from the full column so every class shares the same
            # set of values for a given feature.
            class_table[feature] = {
                value: (class_column == value).sum()
                for value in set(X[:, feature])
            }
        model[cls] = class_table
    return model

In [9]:
import numpy as np
def probability(X, current_class, model=None):
    """Return the log-score of ``current_class`` for a single sample.

    The score is ``log P(class) + sum_i log P(x_i | class)`` where each
    conditional uses add-one smoothing:
    ``(count(x_i, class) + 1) / (class_size + n_distinct_values_of_feature_i)``.

    Parameters
    ----------
    X : sequence of feature values for ONE sample, indexed by feature number.
    current_class : key of the class to score, as stored by ``fit``.
    model : dict, optional
        Count table in the layout returned by ``fit``. When omitted, the
        notebook-level ``feature_dict`` is used, so existing two-argument
        calls behave as before.

    Returns
    -------
    float
        Log of the unnormalised posterior; only meaningful for comparing
        classes against each other.
    """
    if model is None:
        # Fall back to the notebook-level fitted model.
        model = feature_dict
    class_stats = model[current_class]
    class_total = class_stats["total_class_points"]

    # Log prior: share of training samples that belong to this class.
    output = np.log(class_total) - np.log(model["total_data_points"])
    for i in range(len(X)):
        value_counts = class_stats[i]
        # A value that never appeared in the training column has no entry in
        # the table; treat it as a zero count so smoothing can handle it
        # instead of failing with KeyError.
        current_feature_count = np.log(value_counts.get(X[i], 0) + 1)
        current_class_count = np.log(class_total + len(value_counts))
        output = output + (current_feature_count - current_class_count)
    return output


In [10]:
def predictSinglePoint(X):
    """Return the class with the highest ``probability`` score for one sample.

    Reads the notebook-level ``feature_dict`` to enumerate the classes. When
    several classes share the top score, the one encountered first in
    ``feature_dict`` is returned. If ``feature_dict`` holds no class entries
    the result is ``-1``.
    """
    def is_class_key(key):
        # "total_data_points" is bookkeeping stored alongside the class entries.
        return not (key == "total_data_points")

    candidate_classes = (key for key in feature_dict.keys() if is_class_key(key))
    # max() keeps the earliest element among equals, i.e. a later class only
    # replaces the current best when its score is strictly greater.
    return max(candidate_classes, key=lambda cls: probability(X, cls), default=-1)

In [11]:
def predict(X):
    """Classify every sample in ``X`` and return the predicted classes as a list.

    ``X`` is iterated row by row; each row is passed to ``predictSinglePoint``
    and the results are collected in input order.
    """
    return [predictSinglePoint(sample) for sample in X]

In [12]:
# Fit on the training split. `probability` and `predictSinglePoint` look this
# notebook-level name up at call time, so it must exist before `predict` runs.
feature_dict = fit(X_train, Y_train)
# Show the nested count table for inspection.
feature_dict

{0: {0: {1.0: 37, 2.0: 0},
  1: {1.0: 6, 2.0: 31},
  2: {0.0: 37, 1.0: 0, 2.0: 0, 3.0: 0},
  3: {0.0: 36, 1.0: 1, 2.0: 0, 3.0: 0},
  'total_class_points': 37},
 1: {0: {1.0: 19, 2.0: 15},
  1: {1.0: 28, 2.0: 6},
  2: {0.0: 0, 1.0: 6, 2.0: 28, 3.0: 0},
  3: {0.0: 0, 1.0: 8, 2.0: 25, 3.0: 1},
  'total_class_points': 34},
 2: {0: {1.0: 4, 2.0: 37},
  1: {1.0: 26, 2.0: 15},
  2: {0.0: 0, 1.0: 0, 2.0: 24, 3.0: 17},
  3: {0.0: 0, 1.0: 0, 2.0: 4, 3.0: 37},
  'total_class_points': 41},
 'total_data_points': 112}

In [13]:
# Classify every held-out sample with the hand-written model (a plain Python list).
Y_pred = predict(X_test)

In [14]:
# True labels of the held-out split, for a visual comparison with the predictions below.
Y_test

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1])

In [15]:
# Predicted labels, in the same sample order as Y_test.
print(Y_pred)

[2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1]


In [16]:
# Y_pred is a list here; comparing it with the Y_test array is element-wise,
# and summing the resulting booleans counts the matches.
n_correct_points = (Y_test == Y_pred).sum()
total_points = len(Y_test)

In [17]:
# Fraction of held-out samples that were classified correctly.
accuracy = n_correct_points/total_points

# Results of our classifier

In [18]:
# The message ends with a space and print() inserts its own separator, hence
# the double space before the number in the output.
print("accuracy of our classifier is ", accuracy)

accuracy of our classifier is  0.9736842105263158


In [19]:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision/recall/F1 for the hand-written classifier.
print(classification_report(Y_test,Y_pred))
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(Y_test,Y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.94      1.00      0.97        16
          2       1.00      0.89      0.94         9

avg / total       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


# Inbuilt Gaussian Naive Bayes Classifier

In [20]:
from sklearn.naive_bayes import GaussianNB
# NOTE: X_train / X_test hold the binned codes written by the `label` loop, not
# the raw measurements, so the Gaussian model is fit on discretised features.
classifier = GaussianNB()
classifier.fit(X_train, Y_train)
# Rebinds Y_pred: the hand-written classifier's predictions are no longer
# reachable under this name after this line.
Y_pred = classifier.predict(X_test)


# Results of inbuilt Gaussian Naive Bayes classifier

In [21]:
# Fraction of test samples whose predicted label equals the true label.
# Rebinds `accuracy`, replacing the value computed for the hand-written model.
accuracy = (Y_pred == Y_test).sum()/len(Y_pred)
print("accuracy of inbuilt gaussian naive bayes classifier is ", accuracy)
print(classification_report(Y_test, Y_pred))
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(Y_test, Y_pred))

accuracy of inbuilt gaussian naive bayes classifier is  0.868421052631579
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.76      1.00      0.86        16
          2       1.00      0.67      0.80         9

avg / total       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]


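# Inbuilt Multinomial Naive Bayes Classifier

Note: `MultinomialNB` interprets each feature value as a count rather than as a category code, so it is not the same model as the hand-written classifier above, even though both are given the same binned inputs. A large gap between their scores is therefore expected.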

In [23]:
from sklearn.naive_bayes import MultinomialNB
# Rebinds `classifier`: the GaussianNB instance is replaced by a MultinomialNB one.
# As before, the inputs are the binned codes produced by the `label` loop.
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)
# Rebinds Y_pred with this model's predictions.
Y_pred = classifier.predict(X_test)


# Results of inbuilt Multinomial Naive Bayes Classifier

In [25]:
# Fraction of test samples whose predicted label equals the true label.
# Rebinds `accuracy`, replacing the value computed for the previous model.
accuracy = (Y_pred == Y_test).sum()/len(Y_pred)
print("accuracy of inbuilt multinomial naive bayes classifier is ", accuracy)
print(classification_report(Y_test, Y_pred))
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(Y_test, Y_pred))

accuracy of inbuilt multinomial naive bayes classifier is  0.5263157894736842
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.00      0.00      0.00        16
          2       0.36      1.00      0.53         9

avg / total       0.43      0.53      0.44        38

[[11  2  0]
 [ 0  0 16]
 [ 0  0  9]]
