In [148]:
import pandas as pd

In [149]:
from decimal import Decimal

class NaiveBayes:
    """Categorical Naive Bayes classifier built from raw frequency counts.

    The last column of the training frame is the class label; every other
    column is a categorical feature. Feature values and labels are matched by
    their string form. No smoothing is applied, so a feature value that never
    co-occurs with a class contributes a zero factor for that class.

    Attributes:
        data_frame: string-typed private copy of the training data.
        class_title: name of the class (last) column.
        classes: distinct non-null class labels in their original type.
        class_count: label -> number of training rows carrying that label.
        cond_count: (column, value, label) -> Decimal count of rows of class
            ``label`` whose ``column`` equals ``value`` (filled by ``train``).
        total_rows: Decimal number of rows with a non-null class label.
    """

    def __init__(self, data_frame):
        """Record class statistics and keep a string-typed copy of the data.

        Args:
            data_frame: pandas DataFrame whose last column holds the class.
                It is not modified; the classifier works on its own copy.
        """
        self.class_title = data_frame.columns[-1]
        labels = data_frame[self.class_title]
        # Null labels are excluded here just as `count()` excludes them below.
        self.classes = labels.dropna().unique()
        self.cond_count = {}
        self.class_count = {}
        self.total_rows = Decimal(int(labels.count()))

        # Count once instead of recomputing value_counts() for every class.
        label_counts = labels.value_counts()
        for val in self.classes:
            self.class_count[val] = label_counts[val]

        # Convert all values to string form on a copy, so the caller's frame
        # keeps its original dtypes.
        self.data_frame = data_frame.astype(str)
        self._trained = False

    def train(self):
        """Fill ``cond_count`` with per-class counts of every feature value."""
        feature_columns = self.data_frame.columns[:-1]
        label_column = self.data_frame[self.class_title]

        # Start every (column, value, class) combination at zero so values
        # that never occur with a class are still present in the table.
        zero = Decimal(0)
        for class_val in self.classes:
            for col in feature_columns:
                for value in self.data_frame[col].unique():
                    self.cond_count[(col, value, class_val)] = zero

        for class_val in self.classes:
            # The stored frame is all strings, so compare against the string
            # form of the label; otherwise int/bool labels match no rows.
            selected_rows = self.data_frame[label_column == str(class_val)]
            for col in feature_columns:
                for value, count in selected_rows[col].value_counts().items():
                    self.cond_count[(col, value, class_val)] = Decimal(int(count))

        self._trained = True

    def predict(self, data_object):
        """Return the most likely class for one sample and its score.

        Args:
            data_object: sequence of feature values in training-column order.
                Values are stringified before lookup. Fewer values than
                feature columns are allowed; only the leading columns are
                then used.

        Returns:
            ``(prediction, prob)`` where ``prob`` is the unnormalised score
            P(class) * prod P(value | class) as a Decimal. On ties the class
            appearing later in ``self.classes`` wins; if every class scores
            zero, the last class is returned with a score of 0.

        Raises:
            ValueError: if more values are supplied than feature columns.
        """
        if not getattr(self, '_trained', False):
            # Train on demand so an untrained model does not silently score 0.
            self.train()

        feature_columns = self.data_frame.columns[:-1]
        if len(data_object) > len(feature_columns):
            raise ValueError(
                "expected at most %d feature values, got %d"
                % (len(feature_columns), len(data_object))
            )

        prediction = ''
        prob = Decimal(0)
        times = Decimal(len(data_object))
        for class_val in self.classes:
            numerator = Decimal(1)
            for col, raw_value in zip(feature_columns, data_object):
                # A value never seen in training has a count of zero.
                numerator *= self.cond_count.get(
                    (col, str(raw_value), class_val), Decimal(0)
                )

            class_total = Decimal(int(self.class_count[class_val]))
            # prod(count) / (class_total ** (n - 1) * total_rows) equals
            # (class_total / total_rows) * prod(count / class_total).
            # The power term grows with the number of feature columns.
            prob_temp = numerator / (class_total ** (times - Decimal(1)) * self.total_rows)
            if prob_temp >= prob:
                prob = prob_temp
                prediction = class_val
        # Returns predicted class value and probability
        return prediction, prob


In [150]:
# Smoke-test the classifier on the tennis dataset.
df = pd.read_csv("D:/Workspace/Datasets/tennis.csv")
b = NaiveBayes(df)
b.train()
tennis_samples = [
    ('overcast', 'mild', 'normal', 'False'),
    ('rain', 'hot', 'high', 'False'),
    ('rain', 'mild', 'high', 'False'),
]
for sample in tennis_samples:
    print(b.predict(sample))
print()

# Same check on the flu-symptoms dataset.
flu = pd.read_csv("D:/Workspace/Datasets/flu_symptoms.csv")
dec = NaiveBayes(flu)
dec.train()
print(dec.predict(('Y', 'N', 'MILD', 'Y')))

('P', Decimal('0.05643738977072310405643738977'))
('N', Decimal('0.01828571428571428571428571429'))
('P', Decimal('0.02116402116402116402116402116'))

('Y', Decimal('0.024'))


In [151]:
# Classify a sample held in a named tuple variable rather than an inline literal.
yi = ('Y', 'N', 'MILD', 'Y')
print(yi, type(yi))
print(dec.predict(yi))

('Y', 'N', 'MILD', 'Y') <class 'tuple'>
('Y', Decimal('0.024'))


In [152]:
# Load the mushroom dataset (rebinds `df`, which previously held the tennis data).
# NOTE(review): absolute local path; the notebook only runs where this file exists.
df=pd.read_csv("D:/Workspace/Datasets/mushrooms.csv")

In [153]:
# Preview the first rows; `class` is the first column at this point.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [154]:
# Move the label column to the end, because NaiveBayes treats the last column
# as the class. Popping by name (rather than by position 0) keeps this cell
# idempotent: re-running it no longer shifts a feature column to the end.
class_label = df.pop('class')
df[class_label.name] = class_label

In [155]:
# Confirm that `class` is now the last column.
df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e


In [156]:
from sklearn.model_selection import train_test_split
# Separate the target label from the feature columns.
y = df['class']
X = df.drop(columns='class')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [157]:
# Inspect the held-out labels.
print(y_test)

1971    e
6654    p
5606    p
3332    e
6988    p
       ..
7374    p
1149    e
4999    p
7497    p
3341    p
Name: class, Length: 1625, dtype: object


In [158]:
# Rejoin training features and labels column-wise: NaiveBayes takes a single
# frame and reads the class from its last column.
train_df=pd.concat([X_train,y_train],axis=1)

In [159]:
# Check that the combined training frame ends with the `class` column.
train_df.head(3)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
7873,k,s,e,f,s,f,c,n,b,t,...,p,w,p,w,o,e,w,v,d,p
6515,x,s,n,f,f,f,c,n,b,t,...,w,w,p,w,o,e,w,v,p,p
6141,f,y,e,f,y,f,c,n,b,t,...,p,w,p,w,o,e,w,v,l,p


In [160]:
# Fit the custom classifier on the training split (rebinds `b` from the tennis model).
b=NaiveBayes(train_df)
b.train()

In [161]:
# Cast the test features to str so they match the string-typed values the
# classifier stores for its training data.
X_test = X_test.astype(str)

In [162]:
# Preview the test features after the cast.
X_test.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
1971,f,f,n,f,n,f,w,b,h,t,...,f,w,w,p,w,o,e,n,s,g
6654,f,s,e,f,y,f,c,n,b,t,...,s,p,p,p,w,o,e,w,v,l
5606,x,y,n,f,f,f,c,n,b,t,...,s,w,p,p,w,o,e,w,v,l
3332,f,y,g,t,n,f,c,b,n,t,...,s,g,p,p,w,o,p,n,y,d
6988,f,s,e,f,s,f,c,n,b,t,...,s,p,p,p,w,o,e,w,v,l


In [163]:
# Cast the true labels to str as well — presumably to keep their type
# consistent with the string features; verify this is still needed.
y_test=y_test.astype(str)

In [164]:
# Check that a test row turned into a tuple carries str elements, the form
# passed to NaiveBayes.predict in the prediction loop.
t=tuple(X_test.iloc[0])
print(type(t[0]))

<class 'str'>


In [165]:
# Display the test labels after the cast.
y_test

1971    e
6654    p
5606    p
3332    e
6988    p
       ..
7374    p
1149    e
4999    p
7497    p
3341    p
Name: class, Length: 1625, dtype: object

In [166]:
# Classify every test row; predict() returns (label, score), so keep only the label.
y_pred = [
    b.predict(tuple(X_test.iloc[row_idx]))[0]
    for row_idx in range(len(X_test))
]

In [167]:
# Show the predicted labels.
# NOTE(review): this prints the entire list (one entry per test row); a slice
# such as y_pred[:20] would keep the output readable.
print(y_pred)

['e', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'p', 'p', 'e', 'e', 'e', 'p', 'e', 'p', 'p', 'p', 'p', 'e', 'p', 'e', 'p', 'e', 'p', 'e', 'p', 'e', 'p', 'p', 'p', 'p', 'e', 'p', 'e', 'e', 'p', 'p', 'p', 'e', 'e', 'e', 'p', 'p', 'e', 'p', 'e', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'p', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'p', 'e', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'e', 'e', 'p', 'p', 'p', 'p', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'e', 'p', 'p', 'p', 'e', 'e', 'e', 'p', 'p', 'e', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'e', 'e', 'p', 'e', 'p', 'e', 'p', 'p', 'p', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'e', 'p', 'p', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e',

In [168]:
from sklearn.metrics import accuracy_score


# Calculate accuracy of the custom NaiveBayes predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)


Accuracy: 0.9963076923076923


In [169]:
# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           e       1.00      0.99      1.00       843
           p       0.99      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625


Confusion Matrix:
[[837   6]
 [  0 782]]


In [147]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Baseline for comparison: scikit-learn's GaussianNB on the mushroom data.
# NOTE(review): this cell rebinds X, y, X_train, X_test, y_train, y_test,
# y_pred and accuracy, which the custom-classifier cells also use.

# Load the dataset
mushrooms_df = pd.read_csv('D:/Workspace/Datasets/mushrooms.csv')

# Encode categorical variables: each column's categories are replaced by integer codes
labelencoder = LabelEncoder()
for col in mushrooms_df.columns:
    mushrooms_df[col] = labelencoder.fit_transform(mushrooms_df[col])

# Split data into features and target
X = mushrooms_df.drop('class', axis=1)
y = mushrooms_df['class']

# Split data into train and test sets
# NOTE(review): test_size is 0.3 here but 0.2 for the custom classifier, so
# the two accuracy figures come from different test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize Naive Bayes classifier
# NOTE(review): GaussianNB treats the integer codes as continuous values;
# CategoricalNB looks like the closer match for categorical features — confirm.
nb_classifier = GaussianNB()

# Train the classifier
nb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = nb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9294503691550451

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1257
           1       0.93      0.93      0.93      1181

    accuracy                           0.93      2438
   macro avg       0.93      0.93      0.93      2438
weighted avg       0.93      0.93      0.93      2438


Confusion Matrix:
[[1172   85]
 [  87 1094]]
