In [1]:
# 1. Import packages
import pandas as pd
import numpy as np
from sklearn import model_selection,metrics,preprocessing
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# 2. Data loading
df = pd.read_csv("datasets/mushrooms.csv")

In [3]:
# 3. EDA
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [4]:
# Check for missing values and duplicates
print(df.isna().sum())
print(df.duplicated().sum())

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64
0


In [5]:
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [6]:
# Split the features and labels
label = df.pop('class')

In [7]:
# 4. Categorical encoding
# (A) For label
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(label)
label_transformed = label_encoder.transform(label)

In [8]:
# (B) For features
ordinal_encoder = preprocessing.OrdinalEncoder()
ordinal_encoder.fit(df)
features_transformed = ordinal_encoder.transform(df)

In [9]:
# Put the data back to DataFrame
feature_df = pd.DataFrame(features_transformed,columns=df.columns)

In [10]:
# 5. Data splitting
x_train,x_test,y_train,y_test = model_selection.train_test_split(feature_df,label_transformed,train_size=0.8,random_state=42)

In [11]:
# 6. Model development
model = LogisticRegression(solver='liblinear')
model.fit(x_train,y_train)

In [12]:
# 7. Model evaluation
training_prediction = model.predict(x_train)
testing_prediction = model.predict(x_test)

print("Training evaluation:")
print("Training loss: ",metrics.log_loss(y_train,training_prediction))
print(metrics.classification_report(y_train,training_prediction))
print("Testing evaluation:")
print("Test loss: ",metrics.log_loss(y_test,testing_prediction))
print(metrics.classification_report(y_test,testing_prediction))

Training evaluation:
Training loss:  1.7414536335101993
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      3365
           1       0.96      0.94      0.95      3134

    accuracy                           0.95      6499
   macro avg       0.95      0.95      0.95      6499
weighted avg       0.95      0.95      0.95      6499

Testing evaluation:
Test loss:  1.9297217506788875
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       843
           1       0.94      0.95      0.94       782

    accuracy                           0.95      1625
   macro avg       0.95      0.95      0.95      1625
weighted avg       0.95      0.95      0.95      1625

