# Case Study on Probability for Data Science

Mushroom dataset

In [8]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Read student data
df=pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [10]:
df.shape

(8124, 23)

In [11]:
#check for any missing values
df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [12]:
# Extract feature columns
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [13]:
# Separate the data into feature data and target data (X and y, respectively)
x=df.drop(['class'],axis=1)
y=pd.DataFrame(df['class'])

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
x = x.apply(LabelEncoder().fit_transform)

In [18]:
x.head() 

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1


In [19]:
y = y.apply(LabelEncoder().fit_transform)

In [20]:
# splitting the data into train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)

In [21]:
#standard scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.fit_transform(x_test)

In [22]:
#Model Application

In [35]:
# 1.logistic regression
from sklearn.linear_model import LogisticRegression
logit_model = LogisticRegression(solver='lbfgs', max_iter=1000)
logit_model.fit(x_train,y_train)
y_pred = logit_model.predict(x_test)

from sklearn.metrics import f1_score,confusion_matrix,accuracy_score,precision_score,recall_score

print('Accuracy is:',accuracy_score(y_test,y_pred))
print('Precision is:',precision_score(y_test,y_pred))
print('recall is:',recall_score(y_test,y_pred,))
print('f1 is:',f1_score(y_test,y_pred))


Accuracy is: 0.9527326440177253
Precision is: 0.9479479479479479
recall is: 0.9556004036326943
f1 is: 0.9517587939698493


In [36]:
confusion_matrix(y_test,y_pred)

array([[988,  52],
       [ 44, 947]], dtype=int64)

In [37]:
# 2.Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train,y_train)
y_pred = dt_model.predict(x_test)

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
print('Accuracy is:',accuracy_score(y_test,y_pred))
print('Precision is:',precision_score(y_test,y_pred))
print('recall is:',recall_score(y_test,y_pred))
print('f1 score is:',f1_score(y_test,y_pred))

Accuracy is: 1.0
Precision is: 1.0
recall is: 1.0
f1 score is: 1.0


In [38]:
confusion_matrix(y_test,y_pred)

array([[1040,    0],
       [   0,  991]], dtype=int64)

In [39]:
# 3.random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
print('Accuracy is:',accuracy_score(y_test,y_pred))
print('Precision is:',precision_score(y_test,y_pred))
print('recall is:',recall_score(y_test,y_pred))
print('f1 score is:',f1_score(y_test,y_pred))

Accuracy is: 1.0
Precision is: 1.0
recall is: 1.0
f1 score is: 1.0


In [40]:
confusion_matrix(y_test,y_pred)

array([[1040,    0],
       [   0,  991]], dtype=int64)

# Naïve Bayes’ Classifier

In [41]:
#fitting gaussian naive bayes to the training set
from sklearn.naive_bayes import GaussianNB
classifier=GaussianNB()
classifier.fit(x_train,y_train)
y_pred=classifier.predict(x_test)

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
print('Accuracy is:',accuracy_score(y_test,y_pred))
print('Precision is:',precision_score(y_test,y_pred))
print('recall is:',recall_score(y_test,y_pred))
print('f1 score is:',f1_score(y_test,y_pred))

Accuracy is: 0.9256523879862137
Precision is: 0.9183266932270916
recall is: 0.9303733602421796
f1 score is: 0.9243107769423559


In [42]:
#confusion metrix
confusion_matrix(y_test,y_pred)

array([[958,  82],
       [ 69, 922]], dtype=int64)