In [1]:
import os #for reading the optional dataset-path override from the environment

import pandas as pd #for loadin the dataset
import numpy as np #dependency of pandas
import matplotlib.pyplot as plt #for plotting graphs
import seaborn as sns #data viz
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder #for encoding input training categorical data
from sklearn.model_selection import train_test_split #for extracting X_train , Y_train etc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report


# Loading dataset
# The CSV location can be overridden with the MUSHROOM_DATA_PATH environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset, the original local path is used, so existing runs are unaffected.
DATA_PATH = os.environ.get(
    "MUSHROOM_DATA_PATH",
    r"C:\Users\jaken\Documents\mushroom_classification\data\mushrooms.csv",
)
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [2]:
# Missing-value handling: a '?' entry in 'stalk-root' is treated as missing,
# so it is turned into NaN before incomplete rows are removed.
if 'stalk-root' in df.columns:
    unknown_root = df['stalk-root'] == '?'
    df['stalk-root'] = df['stalk-root'].mask(unknown_root, np.nan)

# Keep only the rows that have a value in every column
df_cleaned = df.dropna(axis=0, how='any')


In [3]:
# Split the cleaned frame into the feature matrix (every column except
# 'class') and the target column ('class').
X = df_cleaned.drop(columns=['class'])
y = df_cleaned['class']

In [4]:
# Map every categorical feature to ordinal codes. Categories not seen during
# fitting are encoded as -1 at transform time instead of raising.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# fit_transform returns a bare array; rebuild a DataFrame so the original
# column names and row index are preserved.
X_encoded = pd.DataFrame(
    ordinal_encoder.fit_transform(X),
    index=X.index,
    columns=X.columns,
)


In [5]:

# Turn the class labels into integer codes; the fitted encoder is kept so the
# codes can be mapped back to labels later.
target_encoder = LabelEncoder().fit(y)
y_encoded = target_encoder.transform(y)

In [6]:
# Hold out 10% of the rows for testing. The split is stratified on the target
# so both parts keep the same class balance, and seeded for reproducibility.
split_options = dict(test_size=0.1, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, **split_options)

In [7]:
# Plain-text dump of the training features after the split, as a quick sanity
# check of the ordinal codes (pandas elides the middle rows/columns with "...").
print(X_train)

      cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
802         5.0          2.0        4.0      1.0   6.0              1.0   
3133        2.0          3.0        3.0      1.0   5.0              1.0   
4221        2.0          0.0        7.0      0.0   2.0              1.0   
816         5.0          3.0        4.0      1.0   0.0              1.0   
3478        2.0          3.0        4.0      1.0   5.0              1.0   
...         ...          ...        ...      ...   ...              ...   
1733        5.0          0.0        6.0      0.0   5.0              1.0   
7073        2.0          2.0        1.0      1.0   5.0              1.0   
1897        2.0          2.0        3.0      0.0   5.0              1.0   
3588        2.0          0.0        3.0      1.0   5.0              1.0   
7295        3.0          3.0        7.0      0.0   5.0              1.0   

      gill-spacing  gill-size  gill-color  stalk-shape  ...  \
802            0.0        1.0       

In [8]:
# Logistic-regression classifier with the solver's iteration cap set to 10000,
# then trained on the training split.
model = LogisticRegression(max_iter=10000)
model.fit(X=X_train, y=y_train)


In [9]:

# Score the held-out test split: overall accuracy first, then the per-class
# precision / recall / f1 table.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

accuracy: 0.9557522123893806
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       349
           1       0.96      0.93      0.94       216

    accuracy                           0.96       565
   macro avg       0.96      0.95      0.95       565
weighted avg       0.96      0.96      0.96       565



In [10]:
# Hand-built sample: a single row of ordinal-encoded feature values, laid out
# in the same column order the model was trained on. The only non-zero codes
# are cap-surface=2, gill-color=4 and habitat=3.
y_c = np.array([[0., 2., 0., 0., 0., 0., 0., 0., 4., 0., 0.,
                 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 3.]])

# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names; passing the bare array makes scikit-learn warn that the
# input has no valid feature names.
single_prediction = model.predict(pd.DataFrame(y_c, columns=X_train.columns))

# Tally the test-set predictions per class. LabelEncoder assigns codes in
# sorted label order, so 0 is 'e' (edible) and 1 is 'p' (poisonous) -- the
# count of predictions equal to 1 is therefore the poisonous count.
n_poisonous = int(np.count_nonzero(y_pred == 1))
n_edible = int(y_pred.shape[0] - n_poisonous)

# Printed in the same order as before: class-1 count first, then the rest.
print(n_poisonous)
print(n_edible)


209
356




In [None]:

import os

from joblib import dump

# Column names of the (pre-encoding) feature frame, stored alongside the model
# for reference.
feature_names = X.columns.tolist()

# Persist the trained model, both fitted encoders and the feature-name list,
# each to its own file in the current working directory.
files = ['mushroom_model.pkl', 'ordinal_encoder.pkl', 'target_encoder.pkl', 'feature_names.pkl']
artifacts = (model, ordinal_encoder, target_encoder, feature_names)
for artifact, target_file in zip(artifacts, files):
    dump(artifact, target_file)

print("Model and encoders saved successfully!")
print(f"Feature names: {feature_names}")
print(f"Total features: {len(feature_names)}")

# Confirm that every expected file now exists on disk
for file in files:
    exists = os.path.exists(file)
    print(f"✓ {file} created" if exists else f"✗ {file} NOT created")

Model and encoders saved successfully!
Feature names: ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
Total features: 22
✓ mushroom_model.pkl created
✓ ordinal_encoder.pkl created
✓ target_encoder.pkl created
✓ feature_names.pkl created
