In [110]:
import pandas as pd

In [111]:
from decimal import Decimal

class NaiveBayes:
    """Categorical Naive Bayes classifier using ``Decimal`` arithmetic.

    The last column of the training frame is the class label; every other
    column is treated as a categorical feature whose values are compared in
    their string form. No smoothing is applied, so a feature value that never
    occurs with a class gives that class a probability of zero.
    """

    def __init__(self, data_frame):
        """Collect class statistics from ``data_frame``.

        Parameters
        ----------
        data_frame : pandas.DataFrame
            Training data whose last column holds the class label. The
            caller's frame is left untouched; a string-typed copy is kept in
            ``self.data_frame``.
        """
        self.class_title = data_frame.columns[-1]
        labels = data_frame[self.class_title]

        # Class statistics come from the labels exactly as supplied, so the
        # values returned by predict() keep the caller's label type.
        self.classes = labels.dropna().unique()
        label_counts = labels.value_counts()
        self.class_count = {val: int(label_counts[val]) for val in self.classes}
        self.total_rows = Decimal(int(labels.count()))
        self.cond_count = {}

        # Private snapshot of the original labels, used to select the rows of
        # each class after the stored frame has been converted to strings.
        self._labels = labels.to_numpy(copy=True)

        # Convert all values to string form (astype returns a new frame).
        self.data_frame = data_frame.astype(str)

    def train(self):
        """Fill ``self.cond_count`` with per-class feature-value counts.

        Keys are ``(column, value, class_label)`` tuples and values are
        ``Decimal`` counts. Every value seen anywhere in a column gets an
        entry for every class, with zero when the pair never co-occurs.
        """
        feature_columns = self.data_frame.columns[:-1]

        for class_val in self.classes:
            class_rows = self.data_frame[self._labels == class_val]
            for col in feature_columns:
                # Start every value observed in this column at zero ...
                for value in self.data_frame[col].unique():
                    self.cond_count[(col, value, class_val)] = Decimal(0)
                # ... then overwrite with the counts found inside this class.
                for value, count in class_rows[col].value_counts().items():
                    self.cond_count[(col, value, class_val)] = Decimal(int(count))

    def predict(self, data_object):
        """Return the most probable class for one observation.

        Call ``train()`` first; without it every count is treated as zero.

        Parameters
        ----------
        data_object : sequence
            Feature values in training-column order. Values are converted
            with ``str`` before lookup. A value that was never seen in
            training counts as zero occurrences instead of raising.

        Returns
        -------
        tuple
            ``(predicted_class, probability)`` where ``probability`` is the
            unnormalised joint score as a ``Decimal``. On ties (including the
            case where every class scores zero) the class that appears last
            in ``self.classes`` is returned.

        Raises
        ------
        ValueError
            If ``data_object`` has more values than there are feature columns.
        """
        feature_columns = self.data_frame.columns[:-1]
        if len(data_object) > len(feature_columns):
            raise ValueError(
                f"expected at most {len(feature_columns)} feature values, "
                f"got {len(data_object)}"
            )

        prediction = ''
        prob = Decimal(0)
        times = Decimal(len(data_object))

        for class_val in self.classes:
            numerator = Decimal(1)
            for col, value in zip(feature_columns, data_object):
                numerator *= self.cond_count.get((col, str(value), class_val), Decimal(0))

            # P(c) * prod P(x_i | c) = prod count(x_i, c) / (count(c)^(n-1) * N)
            class_total = Decimal(int(self.class_count[class_val]))
            prob_temp = numerator / (class_total ** (times - Decimal(1)) * self.total_rows)
            if prob_temp >= prob:
                prob = prob_temp
                prediction = class_val

        # Returns predicted class value and probability
        return prediction, prob

In [112]:
# Load the mushroom-classification dataset from a local CSV file.
# Kaggle link: https://www.kaggle.com/datasets/uciml/mushroom-classification
MUSHROOMS_CSV = "D:/Workspace/Datasets/mushrooms.csv"
df = pd.read_csv(MUSHROOMS_CSV)

In [113]:
# Preview the first three rows of the raw frame
df.head(n=3)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


In [114]:
# Move class label to end.
# The columns are reordered by name instead of popping whatever column is
# first, so re-running this cell leaves the frame unchanged rather than
# moving a feature column to the end.
class_column = 'class'
class_label = df[class_column]
df = df[[column for column in df.columns if column != class_column] + [class_column]]
df.head(2)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e


In [115]:
# Separate the target from the features, then hold out 20% of the rows for
# testing (fixed random_state so the split is repeatable).
from sklearn.model_selection import train_test_split

y = df['class']
X = df.drop(columns='class')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [116]:
# Put the training features and their labels side by side in one frame,
# with the label as the last column.
training_parts = [X_train, y_train]
train_df = pd.concat(training_parts, axis="columns")
train_df.head(n=3)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
7873,k,s,e,f,s,f,c,n,b,t,...,p,w,p,w,o,e,w,v,d,p
6515,x,s,n,f,f,f,c,n,b,t,...,w,w,p,w,o,e,w,v,p,p
6141,f,y,e,f,y,f,c,n,b,t,...,p,w,p,w,o,e,w,v,l,p


In [117]:
# Build the hand-written classifier from the training frame and count the
# per-class feature values.
nb = NaiveBayes(train_df)
nb.train()

In [118]:
# Cast the held-out features and labels to 'string' type
X_test, y_test = X_test.astype(str), y_test.astype(str)

In [119]:
# Predict one test row at a time, keeping only the predicted label
# (the first element of the (label, probability) pair).
y_pred = [
    nb.predict(tuple(X_test.iloc[row_number]))[0]
    for row_number in range(len(X_test))
]

In [122]:
from sklearn.metrics import accuracy_score

# Share of test rows predicted correctly, rounded to four decimal places
# and reported as a percentage.
raw_accuracy = accuracy_score(y_test, y_pred)
accuracy = round(raw_accuracy, 4)
print(f"Accuracy: {accuracy*100} %")


Accuracy: 99.63 %


In [123]:
# Display confusion matrix
# Imported in this cell so it runs on a fresh kernel: no earlier cell in
# reading order brings `confusion_matrix` into scope.
from sklearn.metrics import confusion_matrix

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[837   6]
 [  0 782]]


# Implementing Naive Bayes with the `sklearn` library

In [106]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the dataset
mushrooms_df = pd.read_csv('D:/Workspace/Datasets/mushrooms.csv')

# Encode categorical variables: every column, including the target, is
# replaced by integer codes. The encoder is refitted on each column in turn.
labelencoder = LabelEncoder()
for col in mushrooms_df.columns:
    mushrooms_df[col] = labelencoder.fit_transform(mushrooms_df[col])

# Split data into features and target
X = mushrooms_df.drop('class', axis=1)
y = mushrooms_df['class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Naive Bayes classifier
# NOTE(review): GaussianNB models each feature as a continuous Gaussian, so
# the integer category codes are treated as numeric magnitudes. sklearn's
# CategoricalNB is intended for categorical codes - consider it if a
# like-for-like comparison with the hand-written classifier is wanted.
nb_classifier = GaussianNB()

# Train the classifier
nb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = nb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display confusion matrix (it appears in this cell's recorded output, and
# `confusion_matrix` is imported for it, but the print was missing)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9218461538461539

Confusion Matrix:
[[771  72]
 [ 55 727]]
