In [242]:
import pandas as pd
import numpy as np

# Logistic regression, KNearest neighbor and Naive bayes classifier

## Load Data

In [243]:
data = pd.read_csv("mushrooms.csv")

In [244]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

No Null values - Good to go

## Pipeline

In [245]:
def preprocessing_pipeline(data):
    for col in data:
        data[col] = data[col].astype('category')
        data[col] = data[col].cat.codes
    y = data['class']
    X = data.loc[:, 'cap-shape':]
    return X,y

In [246]:
X, y = preprocessing_pipeline(data)
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1


In [292]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, train_size=0.8, random_state=42)

In [293]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

In [294]:
X_train.shape, X_test.shape

((6499, 22), (1625, 22))

In [295]:
y_train.shape, y_test.shape

((6499,), (1625,))

## Logistic Regression

**Possible Error** : *ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.*
- Due to inability of convergence (possibly becasue of larger feature size and no scaling) within the default iteration(by default 100)
    - Takes around `384 iterations` to converge without scaling --> accuracy 95.769 %
    - When scaled, it converges within `default 100 iterations` --> accuracy 95.2 %
- refer [StackOverfow](https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter)

In [296]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression( )
model = reg.fit(X_train, y_train)

### Evaluation

In [297]:
X_test = sc.transform(X_test)
pred = model.predict(X_test)

In [298]:
(pred == y_test).mean()

0.952

In [299]:
from sklearn.metrics import classification_report, confusion_matrix

In [300]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       833
           1       0.96      0.94      0.95       792

    accuracy                           0.95      1625
   macro avg       0.95      0.95      0.95      1625
weighted avg       0.95      0.95      0.95      1625



In [301]:
print(confusion_matrix(y_test, pred))

[[799  44]
 [ 34 748]]


## Nearest Neighbours
- With scaling accuracy - 100%
- Without scaling accuracy - 99.81%

In [287]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, train_size=0.8, random_state=42)

In [288]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

In [264]:
# from sklearn.neighbors import NearestNeighbors
# nbrs = NearestNeighbors(n_neighbors=3).fit(X_train)

### Evaluation

In [289]:
y_pred = classifier.predict(X_test)

In [266]:
(y_pred == y_test).mean()

0.9981538461538462

In [267]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       840
           1       1.00      1.00      1.00       785

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [290]:
print(confusion_matrix(y_test, y_pred))

[[840   3]
 [  0 782]]


## Naive Bayes
- With scaling: Accuracy - 92.18%
- Without scaling: Accuracy - 92.18%

In [282]:
from sklearn.naive_bayes import GaussianNB # Commonly used GaussianNB
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, train_size=0.8, random_state=42)
#X_train = sc.fit_transform(X_train)
gnb = GaussianNB()
model = gnb.fit(X_train, y_train)

### Evaluation

In [283]:
#X_test = sc.transform(X_test)
pred = model.predict(X_test)

In [284]:
(pred == y_test).mean()

0.9218461538461539

In [285]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       826
           1       0.93      0.91      0.92       799

    accuracy                           0.92      1625
   macro avg       0.92      0.92      0.92      1625
weighted avg       0.92      0.92      0.92      1625



In [286]:
print(confusion_matrix(y_test, pred))

[[771  72]
 [ 55 727]]
