In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the raw mushroom table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="mushrooms.csv")

In [3]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Separate the target label column from the predictor columns.
y = data["class"]
X = data.drop(columns=["class"])

In [5]:
# Replace every categorical letter with its integer category code, column by
# column, keeping the original column order and row index.
X = pd.DataFrame(
    {name: X[name].astype("category").cat.codes for name in X.columns}
)

# Encode the label the same way: 'e' -> 0, 'p' -> 1.
y_categorical = y.astype("category")
y = y_categorical.cat.codes

In [6]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions approximately the same in both splits; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [7]:
class NaiveBayesClassifier:
    """Naive Bayes for integer-coded categorical features with additive smoothing.

    Every feature column must contain non-negative integer category codes
    (0, 1, 2, ...). The number of categories of each feature is inferred from
    the largest code present in the training data, so a code that never
    appeared during ``fit`` cannot be scored at prediction time.

    Attributes set by ``fit``:
        alpha: smoothing constant that was used.
        classes: sorted array of the distinct labels found in ``y``.
        cardinality: list with the number of categories of each feature.
        class_counts: number of training rows per class.
        feature_counts: per feature, an (n_classes, cardinality) count table.
        class_log_prior_: smoothed log prior of each class.
        feature_log_prob_: per feature, an (n_classes, cardinality) table of
            smoothed log P(feature = value | class).
    """

    @staticmethod
    def _validate_codes(X):
        """Return ``X`` as an array, rejecting negative category codes.

        Integer (and boolean) input is converted to the platform index dtype so
        that it can be used directly for counting and table look-ups. Other
        dtypes are passed through untouched and fail later exactly as NumPy
        dictates.

        Raises:
            ValueError: if an integer array contains a negative code.
        """
        X = np.asarray(X)
        if X.dtype.kind in "iub":
            X = X.astype(np.intp, copy=False)
            # NumPy would silently interpret a negative code as "count from
            # the end" when indexing the probability tables, so reject it.
            if X.size and X.min() < 0:
                raise ValueError(
                    "X contains negative category codes (pandas emits -1 for "
                    "missing values); encode missing values as a category of "
                    "their own before calling the classifier"
                )
        return X

    def fit(self, X, y, alpha=1.0):
        """Estimate class priors and per-feature conditional probabilities.

        Args:
            X: 2-D array of shape (n_samples, n_features) holding non-negative
                integer category codes.
            y: 1-D array-like of class labels, one per row of ``X``.
            alpha: additive (Laplace) smoothing constant; must be >= 0.

        Raises:
            ValueError: if ``alpha`` is negative or ``X`` contains negative
                codes.
        """
        if alpha < 0:
            raise ValueError("alpha must be non-negative")

        X = self._validate_codes(X)
        y = np.asarray(y)

        self.alpha = alpha
        self.classes = np.unique(y)
        n_samples, n_features = X.shape
        n_classes = len(self.classes)

        # Cardinality (number of distinct values) for every feature column
        self.cardinality = [int(X[:, f].max()) + 1 for f in range(n_features)]

        # Count tables
        self.feature_counts = [
            np.zeros((n_classes, card), dtype=np.float64)
            for card in self.cardinality
        ]
        self.class_counts = np.zeros(n_classes, dtype=np.float64)

        for idx, cls in enumerate(self.classes):
            X_c = X[y == cls]
            self.class_counts[idx] = X_c.shape[0]
            for f in range(n_features):
                # minlength pads categories this class never exhibited with 0
                self.feature_counts[f][idx] = np.bincount(
                    X_c[:, f], minlength=self.cardinality[f]
                )

        # Priors  P(class)
        self.class_log_prior_ = np.log(
            (self.class_counts + alpha) / (n_samples + alpha * n_classes)
        )

        # Conditionals P(feature=value | class)
        self.feature_log_prob_ = []
        for f in range(n_features):
            smoothed = self.feature_counts[f] + alpha
            denom = smoothed.sum(axis=1, keepdims=True)  # per-class totals
            self.feature_log_prob_.append(np.log(smoothed / denom))

    def _joint_log_likelihood(self, X):
        """Return log P(class) + sum_f log P(x_f | class) for every row of ``X``.

        Args:
            X: 2-D array of shape (n_samples, n_features) with the same integer
                coding that was used in ``fit``.

        Returns:
            Array of shape (n_samples, n_classes).

        Raises:
            ValueError: if ``X`` contains negative codes or has a different
                number of columns than the training data.
            IndexError: if a value is not a category code seen during ``fit``.
        """
        X = self._validate_codes(X)
        n_samples, n_features = X.shape
        if n_features != len(self.feature_log_prob_):
            raise ValueError(
                f"X has {n_features} features, but the classifier was fitted "
                f"with {len(self.feature_log_prob_)}"
            )

        # start every row with the log priors
        jll = np.empty((n_samples, len(self.classes)), dtype=np.float64)
        jll[:] = self.class_log_prior_

        # add each feature's log-probability for all classes at once
        for f, table in enumerate(self.feature_log_prob_):
            try:
                jll += table[:, X[:, f]].T
            except IndexError as exc:
                raise IndexError(
                    f"feature column {f} contains a value that is not a "
                    f"category code seen during fit (expected integers in "
                    f"[0, {table.shape[1]}))"
                ) from exc
        return jll

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        return self.classes[np.argmax(self._joint_log_likelihood(X), axis=1)]

In [8]:
# Train the classifier on the training split (default smoothing alpha=1.0).
nb = NaiveBayesClassifier()
nb.fit(X=X_train, y=y_train)

In [9]:

# Predict a class code for every row of the held-out test split.
y_pred = nb.predict(X=X_test)

In [10]:
# Compute the three evaluation summaries first, then print them in order.
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(
    y_test, y_pred, target_names=["edible", "poisonous"]
)

print(f"Accuracy: {accuracy:.4f}\n")
print("Confusion matrix:")
print(matrix, "\n")
print("Classification report:")
print(report)

Accuracy: 0.9458

Confusion matrix:
[[835   7]
 [ 81 702]] 

Classification report:
              precision    recall  f1-score   support

      edible       0.91      0.99      0.95       842
   poisonous       0.99      0.90      0.94       783

    accuracy                           0.95      1625
   macro avg       0.95      0.94      0.95      1625
weighted avg       0.95      0.95      0.95      1625



In [None]:
# Map every feature column to {category letter -> integer code}. The codes are
# derived with the same `.astype("category")` conversion that was used to
# encode the features, so they match what the classifier was trained on.
category_mapping = {
    col: {cat: i for i, cat in enumerate(data[col].astype("category").cat.categories)}
    for col in X.columns
}
feature_cols = list(X.columns)          # keep the correct order

# Describe the mushroom to classify: one letter per feature.
cap_shape               = 'x'   # b=bell, c=conical, x=convex, f=flat, k=knobbed, s=sunken
cap_surface             = 's'   # f=fibrous, g=grooves, y=scaly, s=smooth
cap_color               = 'n'   # n=brown, b=buff, c=cinnamon, g=gray, r=green, p=pink, u=purple, e=red, w=white, y=yellow
bruises                 = 't'   # t=yes (bruises), f=no
odor                    = 'p'   # a=almond, l=anise, c=creosote, y=fishy, f=foul, m=musty, n=none, p=pungent, s=spicy
gill_attachment         = 'f'   # a=attached, d=descending, f=free, n=notched
gill_spacing            = 'c'   # c=close, w=crowded, d=distant
gill_size               = 'n'   # b=broad, n=narrow
gill_color              = 'k'   # k=black, n=brown, b=buff, h=chocolate, g=gray, r=green, o=orange, p=pink, u=purple, e=red, w=white, y=yellow
stalk_shape             = 'e'   # e=enlarging, t=tapering
stalk_root              = 'b'   # b=bulbous, c=club, u=cup, e=equal, z=rhizomorphs, r=rootless, ?=missing
stalk_surface_above_ring= 's'   # f=fibrous, y=scaly, k=silky, s=smooth
stalk_surface_below_ring= 's'   # f=fibrous, y=scaly, k=silky, s=smooth
stalk_color_above_ring  = 'w'   # n=brown, b=buff, c=cinnamon, g=gray, o=orange, p=pink, e=red, w=white, y=yellow
stalk_color_below_ring  = 'w'   # same codes as above
veil_type               = 'p'   # p=partial, u=universal   (dataset actually has only 'p')
veil_color              = 'w'   # n=brown, o=orange, w=white, y=yellow
ring_number             = 'o'   # n=none, o=one, t=two
ring_type               = 'p'   # c=cobwebby, e=evanescent, f=flaring, l=large, n=none, p=pendant, s=sheathing, z=zone
spore_print_color       = 'k'   # k=black, n=brown, b=buff, h=chocolate, r=green, o=orange, u=purple, w=white, y=yellow
population              = 's'   # a=abundant, c=clustered, n=numerous, s=scattered, v=several, y=solitary
habitat                 = 'u'   # g=grasses, l=leaves, m=meadows, p=paths, u=urban, w=waste, d=woods

# The letters are matched to `feature_cols` purely by position, so this list
# must follow the column order of the training features.
sample_letters = [
    cap_shape, cap_surface, cap_color, bruises, odor,
    gill_attachment, gill_spacing, gill_size, gill_color,
    stalk_shape, stalk_root, stalk_surface_above_ring,
    stalk_surface_below_ring, stalk_color_above_ring, stalk_color_below_ring,
    veil_type, veil_color, ring_number, ring_type,
    spore_print_color, population, habitat
]

# zip() would silently drop the surplus items if the lengths differed, which
# would feed a short or misaligned sample to the model; fail loudly instead.
if len(sample_letters) != len(feature_cols):
    raise ValueError(
        f"Expected {len(feature_cols)} feature values "
        f"(one per training column), got {len(sample_letters)}"
    )

# Report every letter that the dataset does not contain for its column,
# instead of stopping at the first one with a bare KeyError.
unknown_letters = {
    col: val
    for col, val in zip(feature_cols, sample_letters)
    if val not in category_mapping[col]
}
if unknown_letters:
    raise ValueError(
        f"Category letters not present in the dataset: {unknown_letters}"
    )

sample_codes = [
    category_mapping[col][val] for col, val in zip(feature_cols, sample_letters)
]

# The classifier expects a 2-D array: one row per sample.
prediction_code = nb.predict(np.array(sample_codes).reshape(1, -1))[0]
prediction_label = "poisonous" if prediction_code else "edible"   # 'e'→0, 'p'→1

print("Your mushroom is predicted to be:", prediction_label.upper())

Your mushroom is predicted to be: EDIBLE
