In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier

In [11]:
# загружаем датасет
df = pd.read_csv('mushrooms.csv', header=0)

In [12]:
# шапка датасета
print(df.head())

  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   
2     e         b           s         w       t    l               f   
3     p         x           y         w       t    p               f   
4     e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      w         p          w   
1       

In [13]:
# информация о датасете
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [14]:
# Статистический анализ
print(df.describe())

       class cap-shape cap-surface cap-color bruises  odor gill-attachment  \
count   8124      8124        8124      8124    8124  8124            8124   
unique     2         6           4        10       2     9               2   
top        e         x           y         n       f     n               f   
freq    4208      3656        3244      2284    4748  3528            7914   

       gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
count          8124      8124       8124  ...                     8124   
unique            2         2         12  ...                        4   
top               c         b          b  ...                        s   
freq           6812      5612       1728  ...                     4936   

       stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
count                    8124                   8124      8124       8124   
unique                      9                      9         1          4   
top    

In [15]:
# Исследование категориальных признаков
categorical_features = df.select_dtypes(include=['object']).columns
for feature in categorical_features:
    print(f"Уникальные значения для {feature}:")
    print(df[feature].value_counts())

Уникальные значения для class:
class
e    4208
p    3916
Name: count, dtype: int64
Уникальные значения для cap-shape:
cap-shape
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: count, dtype: int64
Уникальные значения для cap-surface:
cap-surface
y    3244
s    2556
f    2320
g       4
Name: count, dtype: int64
Уникальные значения для cap-color:
cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64
Уникальные значения для bruises:
bruises
f    4748
t    3376
Name: count, dtype: int64
Уникальные значения для odor:
odor
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64
Уникальные значения для gill-attachment:
gill-attachment
f    7914
a     210
Name: count, dtype: int64
Уникальные значения для gill-spacing:
gill-spacing
c    6812
w    1312
Name: count, dtype: int64
Уникальные значения для gill-size:
gill-size
b    5612
n    25

целевой признак - class 

для определения сьедобный или ядовитый гриб

In [16]:
# признаки преобразуются в числовые, т.к. там только буквы везде
le = LabelEncoder()
df_encoded = df.apply(le.fit_transform)

# x и y
X = df_encoded.drop('class', axis=1)
y = df_encoded['class']

# разделение данных на обучение и тесты
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
# 1. KNeighborsClassifier
knn_classifier = KNeighborsClassifier()
knn_params = {'n_neighbors': [3, 5, 7]}
grid_knn = GridSearchCV(knn_classifier, knn_params, cv=5)
grid_knn.fit(X_train, y_train)

In [18]:
# 2. SVC
svc_classifier = SVC()
svc_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_svc = GridSearchCV(svc_classifier, svc_params, cv=5)
grid_svc.fit(X_train, y_train)

In [19]:
# 3. DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()
dt_params = {'max_depth': [None, 5, 10, 15]}
grid_dt = GridSearchCV(dt_classifier, dt_params, cv=5)
grid_dt.fit(X_train, y_train)

In [20]:
# 4. RandomForestClassifier
rf_classifier = RandomForestClassifier()
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
grid_rf = GridSearchCV(rf_classifier, rf_params, cv=5)
grid_rf.fit(X_train, y_train)

In [22]:
# 5. Нейросетевой классификатор (Keras)
def create_nn_model():
    model = Sequential()
    model.add(Dense(units=64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

nn_classifier = KerasClassifier(model=create_nn_model, epochs=5, batch_size=32, verbose=0)
nn_params = {'epochs': [5, 10, 15], 'batch_size': [32, 64, 128]}
grid_nn = GridSearchCV(nn_classifier, nn_params, cv=5)
grid_nn.fit(X_train, y_train)

In [26]:
# Оценка моделей
models = {
    'KNeighborsClassifier': grid_knn,
    'SVC': grid_svc,
    'DecisionTreeClassifier': grid_dt,
    'RandomForestClassifier': grid_rf,
    'NeuralNetworkClassifier': grid_nn
}

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"модель: {model_name}")
    print(f"точность: {accuracy}")
    print("------------------------------")

модель: KNeighborsClassifier
точность: 0.9981538461538462
------------------------------
модель: SVC
точность: 1.0
------------------------------
модель: DecisionTreeClassifier
точность: 1.0
------------------------------
модель: RandomForestClassifier
точность: 1.0
------------------------------
модель: NeuralNetworkClassifier
точность: 1.0
------------------------------


точность 100% очень подозрительно выглядит на самом деле