In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("mushrooms.csv")

In [3]:
# df = pd.read_csv("mushrooms.csv")
df = df.drop(['habitat','population','spore-print-color','stalk-root','stalk-surface-above-ring',
                 'stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type',
                'gill-attachment','gill-spacing','gill-size','odor', 'gill-color', 'stalk-shape', 'ring-number', 'ring-type'],axis=1)
# df = df.drop(['veil-type', 'odor'], axis=1)

In [4]:
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'veil-color'],
      dtype='object')

In [5]:
df = pd.get_dummies(df)

In [6]:
df

Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,cap-color_r,cap-color_u,cap-color_w,cap-color_y,bruises_f,bruises_t,veil-color_n,veil-color_o,veil-color_w,veil-color_y
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
3,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
8120,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
8121,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
8122,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [8]:
df = df.drop(['class_e', 'bruises_f'], axis=1)

In [9]:
y = df['class_p']
x = df.drop(['class_p'], axis=1)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [11]:
params = {"min_samples_split": range(2, 11, 1),
          "min_samples_leaf": range(2, 11, 1),
          "max_leaf_nodes" : range(10,500,10),
          "criterion": ["gini", "entropy"],
          "min_weight_fraction_leaf": np.arange(0.0, 1.6, 0.2)}

In [12]:
dTree = DecisionTreeClassifier()

In [13]:
rf_classifier_search = GridSearchCV(dTree, params, cv=5, n_jobs=4)
rf_classifier_search.fit(x_train, y_train)
rf_classifier_search.best_params_

{'criterion': 'entropy',
 'max_leaf_nodes': 60,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0}

In [14]:
best_rf_classifier = \
    DecisionTreeClassifier(max_leaf_nodes=rf_classifier_search.best_params_["max_leaf_nodes"],
                           criterion=rf_classifier_search.best_params_["criterion"],
                           min_samples_leaf=rf_classifier_search.best_params_["min_samples_leaf"],
                           min_samples_split=rf_classifier_search.best_params_["min_samples_split"],
                           min_weight_fraction_leaf=rf_classifier_search.best_params_["min_weight_fraction_leaf"])
best_rf_classifier.fit(x_train, y_train)
best_rf_classifier.score(x_test, y_test)

0.9037672510257366

In [28]:
dtClf_best_clf_score = []
dtClf_best_acc_score = []

for i in range(0,6):
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=i)
  params = {"min_samples_split": range(4, 8, 1),
          "min_samples_leaf": range(4, 8, 2),
          "max_leaf_nodes" : range(10,100,10),
          "criterion": ["gini"]}
  dtClf = DecisionTreeClassifier()
  dtClf_classifier_search = GridSearchCV(dtClf, params, cv=5,n_jobs=4)
  dtClf_classifier_search.fit(x_train, y_train)
  dtClf_classifier_search.best_params_
  best_dtClf_classifier = \
      DecisionTreeClassifier(criterion=dtClf_classifier_search.best_params_["criterion"],
                           min_samples_leaf=dtClf_classifier_search.best_params_["min_samples_leaf"],
                           min_samples_split=dtClf_classifier_search.best_params_["min_samples_split"])
  best_dtClf_classifier.fit(x_train, y_train)
  dtClf_best_clf_score.append(best_dtClf_classifier.score(x_test, y_test))
  y_pred = best_dtClf_classifier.predict(x_test)
  dtClf_best_acc_score.append(accuracy_score(y_test,y_pred))

In [29]:
dtClf_best_clf_score

[0.9149571055576278,
 0.9127191346512495,
 0.9007832898172323,
 0.90787019768743,
 0.9048862364789257,
 0.917195076464006]

In [30]:
dtClf_best_acc_score

[0.9149571055576278,
 0.9127191346512495,
 0.9007832898172323,
 0.90787019768743,
 0.9048862364789257,
 0.917195076464006]

In [31]:
def list_avg(list):
  sum = 0
  count = 0
  for element in list:
    sum += element
    count += 1
  avg = sum / count
  return(avg)

In [32]:
mean_dtClf_best_clf_score = list_avg(dtClf_best_clf_score)
mean_dtClf_best_acc_score = list_avg(dtClf_best_acc_score)

In [33]:
print(mean_dtClf_best_clf_score)
print(mean_dtClf_best_acc_score)

0.9097351734427453
0.9097351734427453
