In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import tree

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

### This is a second iteration.  We'll take a different approach and have a model for all mushrooms that have an odor and another model for mushrooms that are odorless.  In the end, there was a lot of success when the dataframe was split based on no odor vs. all others.

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('mushrooms.csv')

In [3]:
# Rename the target column to `e_or_p` and cast the index labels to strings,
# rebinding `df` to the returned frame instead of mutating it in place.
df = df.rename(index=str, columns={'class': 'e_or_p'})

### Let's make a copy of the dataframe and encode each of the features.  These encoded columns can be later used for modeling. 

In [4]:
# Boolean Series over the columns: True where the column has object dtype.
categorical_feature_mask = df.dtypes.eq(object)

In [5]:
# Names of the object-dtype columns, as a plain Python list.
categorical_cols = list(df.columns[categorical_feature_mask])

In [6]:
# One shared encoder instance. The encoding cell below calls fit_transform on it
# once per column, so afterwards it only holds the classes of the last column it
# was fitted on and cannot be used to look up or invert other columns' codes.
le = LabelEncoder()

In [7]:
# Encode into a copy so the original string-valued `df` stays available for
# cross-checking the integer codes against the raw category letters.
df_enc = df.copy()

In [8]:
# Integer-encode the categorical columns one column at a time by handing each
# column directly to the shared encoder's fit_transform, then preview the result.
df_enc[categorical_cols] = df_enc[categorical_cols].apply(le.fit_transform)
df_enc.loc[:, categorical_cols].head()

Unnamed: 0,e_or_p,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


### Let's verify below that the "no odor" category (`n`) was encoded as 5.  Success!

In [10]:
# First three raw rows whose odor is 'n', for comparison with the encoded frame.
df.loc[df['odor'] == 'n'].head(3)

Unnamed: 0,e_or_p,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
14,e,x,f,n,f,n,f,w,b,n,...,f,w,w,p,w,o,e,k,a,g
15,e,s,f,g,f,n,f,c,n,k,...,s,w,w,p,w,o,p,n,y,u


In [11]:
# First three encoded rows with odor code 5; their index labels should match
# the raw 'n' rows shown above.
df_enc.loc[df_enc['odor'] == 5].head(3)

Unnamed: 0,e_or_p,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
14,0,5,0,4,0,5,1,1,0,5,...,0,7,7,0,2,1,0,2,0,1
15,0,4,0,3,0,5,1,0,1,4,...,2,7,7,0,2,1,4,3,5,5


### Let's train/test split on the encoded df with just the odorless mushrooms with the same random state and test size for reproducibility.

In [50]:
# Select the odorless rows of the encoded frame using the raw 'n' category from
# `df` rather than the integer code 5. The integer depends on LabelEncoder's
# sorted class order and would silently shift if the set of odor values changed.
# `df_enc` was created as a copy of `df`, so both share the same index and the
# boolean mask aligns row for row.
df_no_odor_enc = df_enc[df.odor == 'n']

In [51]:
# 70/30 split of the odorless subset with a fixed seed, stratified on the
# target column so both parts keep the same class ratio.
train, test = train_test_split(
    df_no_odor_enc,
    test_size=.3,
    random_state=123,
    stratify=df_no_odor_enc[['e_or_p']],
)

In [52]:
# Targets are kept as single-column DataFrames (double brackets), not Series.
y_train, y_test = train[['e_or_p']], test[['e_or_p']]

In [53]:
# Same three encoded feature columns for both halves of the split.
X_train, X_test = (
    split[['gill-color', 'cap-shape', 'cap-color']] for split in (train, test)
)

### Let's now try a logistic regression model (the target is binary: 0 = edible, 1 = poisonous).

In [54]:
# Fit logistic regression on the odorless training features.
# `y_train` is a single-column DataFrame; flatten it to a 1-D array because
# scikit-learn expects y of shape (n_samples,) and otherwise emits a
# DataConversionWarning ("y = column_or_1d(y, warn=True)") during fit.
log_reg = LogisticRegression(random_state=123, solver='saga').fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


In [55]:
# Predicted labels for the training rows; the classification report and
# confusion-matrix cells below compare these against y_train.
y_pred = log_reg.predict(X_train)

In [56]:
# Per-row class probabilities for the training features; left as a bare
# expression so the notebook displays the array.
log_reg.predict_proba(X_train)

array([[0.99622894, 0.00377106],
       [0.98990773, 0.01009227],
       [0.94832291, 0.05167709],
       ...,
       [0.93074713, 0.06925287],
       [0.99528518, 0.00471482],
       [0.99741265, 0.00258735]])

In [57]:
# Report mean accuracy of the logistic regression on the data it was fitted on.
print(
    'Accuracy of Logistic Regression classifier on training set: {:.6f}'.format(
        log_reg.score(X_train, y_train)
    )
)

Accuracy of Logistic Regression classifier on training set: 0.963953


In [58]:
# Per-class precision, recall and F1 for the training-set predictions.
# Read the rows per class: overall accuracy hides how class 1 is handled.
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2385
           1       0.38      0.10      0.15        84

   micro avg       0.96      0.96      0.96      2469
   macro avg       0.67      0.54      0.57      2469
weighted avg       0.95      0.96      0.95      2469



In [59]:
# Training-set confusion matrix as a labelled frame: rows are the true classes,
# columns the predicted classes (class 0 shown as "-", class 1 as "+").
cm = pd.DataFrame(
    confusion_matrix(y_train, y_pred),
    index=['Actual -', 'Actual +'],
    columns=['Pred -', 'Pred +'],
)

cm

Unnamed: 0,Pred -,Pred +
Actual -,2372,13
Actual +,76,8


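### Our logistic regression model went from 74% accuracy with the entire dataframe on the first iteration to 96.4% on a train dataframe with just the odorless mushrooms.  Note, however, that 96.6% of the odorless training mushrooms are edible (2385 of 2469), so this accuracy is no better than always predicting "edible", and the model catches only 8 of the 84 poisonous mushrooms.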

### Let's now try a decision tree model.

In [60]:
# Depth-limited decision tree using the entropy split criterion, fitted on the
# odorless training split with a fixed seed.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=123,
).fit(X_train, y_train)

In [61]:
# Hard labels and class probabilities from the tree for the training rows.
y_pred, y_pred_proba = clf.predict(X_train), clf.predict_proba(X_train)

In [62]:
# Report mean accuracy of the decision tree on the data it was fitted on.
print(
    'Accuracy of Decision Tree classifier on training set: {:.6f}'.format(
        clf.score(X_train, y_train)
    )
)

Accuracy of Decision Tree classifier on training set: 0.981369


In [63]:
# Use the sorted distinct target values as both row (true) and column
# (predicted) labels of the training confusion matrix.
labels = sorted(y_train['e_or_p'].unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), columns=labels, index=labels)

Unnamed: 0,0,1
0,2379,6
1,40,44


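### The decision tree was our best model (98.1% training accuracy vs. 96.4% for logistic regression).  Let's check the results on the test set.  Accuracy holds up at 98.3%, but the confusion matrix below shows that only 20 of the 36 poisonous test mushrooms are caught, so accuracy alone overstates how reliable the model is.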

In [64]:
# Held-out evaluation: the score is computed on X_test / y_test, so the printed
# label must say "test set" to match what is being measured.
print('Accuracy of Decision Tree classifier on test set: {:.6f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 0.983003


In [65]:
# Test-set predictions and their confusion matrix: rows are the true classes,
# columns the predicted classes (class 0 shown as "-", class 1 as "+").
y_pred_test = clf.predict(X_test)
cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred_test),
    index=['Actual -', 'Actual +'],
    columns=['Pred -', 'Pred +'],
)

cm

Unnamed: 0,Pred -,Pred +
Actual -,1021,2
Actual +,16,20


### Let's see if we can reach at least 99% accuracy on our dataframe of mushrooms with odors.

In [36]:
# Select the rows that have an odor by testing the raw 'n' (no odor) category
# in `df` rather than the integer code 5. The integer depends on LabelEncoder's
# sorted class order and would silently shift if the set of odor values changed.
# `df_enc` was created as a copy of `df`, so both share the same index and the
# boolean mask aligns row for row.
df_w_odor_enc = df_enc[df.odor != 'n']

In [38]:
# 70/30 split of the with-odor subset with a fixed seed, stratified on the
# target column. This rebinds `train` / `test` from the odorless section.
train, test = train_test_split(
    df_w_odor_enc,
    test_size=.3,
    random_state=123,
    stratify=df_w_odor_enc[['e_or_p']],
)

In [39]:
# Targets are kept as single-column DataFrames (double brackets), not Series.
y_train, y_test = train[['e_or_p']], test[['e_or_p']]

In [40]:
# Same three encoded feature columns (now including odor) for both halves.
X_train, X_test = (
    split[['odor', 'gill-color', 'cap-shape']] for split in (train, test)
)

### Let's start with logistic regression again.

In [42]:
# Fit logistic regression on the with-odor training features.
# `y_train` is a single-column DataFrame; flatten it to a 1-D array because
# scikit-learn expects y of shape (n_samples,) and otherwise emits a
# DataConversionWarning ("y = column_or_1d(y, warn=True)") during fit.
log_reg = LogisticRegression(random_state=123, solver='saga').fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


In [43]:
# Training-set predictions from the with-odor logistic regression.
# NOTE(review): nothing reads this value before the decision-tree cell below
# reassigns y_pred; the accuracy cell calls log_reg.score directly.
y_pred = log_reg.predict(X_train)

In [44]:
# Report mean accuracy of the logistic regression on the data it was fitted on.
print(
    'Accuracy of Logistic Regression classifier on training set: {:.6f}'.format(
        log_reg.score(X_train, y_train)
    )
)

Accuracy of Logistic Regression classifier on training set: 0.881878


### This is another great improvement for logistic regression.  Let's check decision tree now.

In [46]:
# Fit the same depth-limited entropy tree on the with-odor training split,
# collect its training predictions, and report training accuracy.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=123,
).fit(X_train, y_train)
y_pred, y_pred_proba = clf.predict(X_train), clf.predict_proba(X_train)
print(
    'Accuracy of Decision Tree classifier on training set: {:.6f}'.format(
        clf.score(X_train, y_train)
    )
)

Accuracy of Decision Tree classifier on training set: 1.000000


### 100% accuracy on our train!  To the test cave...

In [47]:
# Held-out evaluation: the score is computed on X_test / y_test, so the printed
# label must say "test set" to match what is being measured.
print('Accuracy of Decision Tree classifier on test set: {:.6f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.000000


In [48]:
# Test-set predictions and their confusion matrix: rows are the true classes,
# columns the predicted classes (class 0 shown as "-", class 1 as "+").
y_pred_test = clf.predict(X_test)
cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred_test),
    index=['Actual -', 'Actual +'],
    columns=['Pred -', 'Pred +'],
)

cm

Unnamed: 0,Pred -,Pred +
Actual -,240,0
Actual +,0,1139
