In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

  import pandas.util.testing as tm


In [3]:
columns = ["ediblility", "cap_shape", "cap_surface", "cap_color", "bruises", "odor", "gill_attachement", "gill_spacing", "gill_size", "gill_color", "stalk_shape", "stalk_root", "stalk_surface_above_ring", "stalk_surface_below_ring", "stalk_color_above_ring", "stalk_color_below_ring", "veil_type", "veil_color", "ring_number", "ring_type", "spore_print_color", "population", "habitat"]

dataset = pd.read_csv("agaricus_lepiota_data.csv", names = columns)

In [4]:
print(dataset.shape)

(8124, 23)


In [5]:
print(dataset)

     ediblility cap_shape cap_surface  ... spore_print_color population habitat
0             p         x           s  ...               k_2        s_4       u
1             e         x           s  ...                 n          n       g
2             e         b           s  ...                 n          n       m
3             p         x           y  ...                 k          s       u
4             e         x           s  ...                 n          a       g
...         ...       ...         ...  ...               ...        ...     ...
8119          e         k           s  ...                 b          c       l
8120          e         x           s  ...                 b          v       l
8121          e         f           s  ...                 b          c       l
8122          p         k           y  ...                 w          v       l
8123          e         x           s  ...                 o          c       l

[8124 rows x 23 columns]


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ediblility                8124 non-null   object
 1   cap_shape                 8124 non-null   object
 2   cap_surface               8124 non-null   object
 3   cap_color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill_attachement          8124 non-null   object
 7   gill_spacing              8124 non-null   object
 8   gill_size                 8124 non-null   object
 9   gill_color                8124 non-null   object
 10  stalk_shape               8124 non-null   object
 11  stalk_root                8124 non-null   object
 12  stalk_surface_above_ring  8124 non-null   object
 13  stalk_surface_below_ring  8124 non-null   object
 14  stalk_color_above_ring  

In [7]:
dataset.describe()

Unnamed: 0,ediblility,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachement,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,3,10,2,2,3,12,2,6,5,5,9,10,2,5,3,6,10,7,7
top,e,x,y,n,false,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5175,4935,4464,4383,8123,7923,7488,3967,2388,4040,3148


In [8]:
dataset.isnull().sum()

ediblility                  0
cap_shape                   0
cap_surface                 0
cap_color                   0
bruises                     0
odor                        0
gill_attachement            0
gill_spacing                0
gill_size                   0
gill_color                  0
stalk_shape                 0
stalk_root                  0
stalk_surface_above_ring    0
stalk_surface_below_ring    0
stalk_color_above_ring      0
stalk_color_below_ring      0
veil_type                   0
veil_color                  0
ring_number                 0
ring_type                   0
spore_print_color           0
population                  0
habitat                     0
dtype: int64

No null values in data

In [9]:
print(dataset.groupby("ediblility").size())

ediblility
e    4208
p    3916
dtype: int64


In [11]:
encoded_data = dataset.apply(LabelEncoder().fit_transform)

print(encoded_data)

      ediblility  cap_shape  ...  population  habitat
0              1          5  ...           4        5
1              0          5  ...           2        1
2              0          0  ...           2        3
3              1          5  ...           3        5
4              0          5  ...           0        1
...          ...        ...  ...         ...      ...
8119           0          3  ...           1        2
8120           0          5  ...           5        2
8121           0          2  ...           1        2
8122           1          3  ...           5        2
8123           0          5  ...           1        2

[8124 rows x 23 columns]


In [12]:
X = encoded_data.iloc[:, 1:].values
y = encoded_data.iloc[:,0].values

In [13]:
print(X)

[[5 2 4 ... 3 4 5]
 [5 2 9 ... 4 2 1]
 [0 2 8 ... 4 2 3]
 ...
 [2 2 4 ... 0 1 2]
 [3 3 4 ... 8 5 2]
 [5 2 4 ... 5 1 2]]


In [14]:
le = LabelEncoder()
y = le.fit_transform(y)

In [15]:
print(y)

[1 0 0 ... 0 1 0]


In [16]:
SEED = 123

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify = y, random_state = SEED)

In [17]:

lr = LogisticRegression(random_state = SEED)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state = SEED)
svc = SVC()


In [18]:
classifier = [("LogisticRegression", lr),
             ("KNeighborsClassifier", knn),
             ("Classification Tree", dt),
             ("SVM", svc)]

Checking my ensemble model classifiers accuracy first:-


In [19]:
for clf_name, clf in classifier:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("{:s} : {:.3f}".format(clf_name, accuracy_score(y_test, y_pred)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression : 0.951
KNeighborsClassifier : 1.000
Classification Tree : 1.000
SVM : 0.992


Now using ensemble model VotingClassifier:-

In [20]:
vc = VotingClassifier(estimators = classifier)

vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)

print("voting Classifier : {:.3f}".format(accuracy_score(y_test, y_pred)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


voting Classifier : 0.992


So , i get a test accuracy of 99.2%.