In [50]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import pickle
import graphviz
import re

from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

In [7]:
with open("data/splited_data.pickle", "rb") as file:
    data = pickle.load(file)
    X_train = data["X_train"]
    y_train = data["y_train"]
    X_test = data["X_test"]
    y_test = data["y_test"]

In [8]:
steps = [("imputation", SimpleImputer(missing_values="?", strategy="most_frequent")),
         ("encoder", OneHotEncoder(drop="first")),
         ("classifier", DecisionTreeClassifier(random_state=21))]

pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('imputation',
                 SimpleImputer(missing_values='?', strategy='most_frequent')),
                ('encoder', OneHotEncoder(drop='first')),
                ('classifier', DecisionTreeClassifier(random_state=21))])

In [26]:
feature_names = pipeline[1].get_feature_names_out(X_train.columns).tolist()
print(export_text(pipeline[-1], feature_names=feature_names))

|--- odor_n <= 0.50
|   |--- stalk-root_c <= 0.50
|   |   |--- stalk-surface-below-ring_y <= 0.50
|   |   |   |--- spore-print-color_u <= 0.50
|   |   |   |   |--- odor_l <= 0.50
|   |   |   |   |   |--- gill-spacing_w <= 0.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- gill-spacing_w >  0.50
|   |   |   |   |   |   |--- stalk-shape_t <= 0.50
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- stalk-shape_t >  0.50
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- odor_l >  0.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- spore-print-color_u >  0.50
|   |   |   |   |--- class: 1
|   |   |--- stalk-surface-below-ring_y >  0.50
|   |   |   |--- class: 1
|   |--- stalk-root_c >  0.50
|   |   |--- ring-number_o <= 0.50
|   |   |   |--- class: 0
|   |   |--- ring-number_o >  0.50
|   |   |   |--- class: 1
|--- odor_n >  0.50
|   |--- spore-print-color_r <= 0.50
|   |   |--- stalk-surface-below-ring_y <= 0.50
|   |   |   |--- cap-surf

In [30]:
tree_txt = export_text(pipeline[-1], feature_names=feature_names)

In [74]:
def clean_condition(l, i):
    prefix = "|   " * (i - 1) + "|--- "
    condition = l.replace(prefix, "") 

    if condition.startswith("class: "):
        return condition.replace("class: ", "Then ").replace("0", "poisonous").replace("1", "edible")
    else:
        is_equal = ">" in condition
        is_equal_str = "equal" if is_equal else "not equal"
        feature = condition.split(" ")[0]
        variable, category = feature.split("_")
        cleaned_variable = variable.replace("-", " ")
        
        return f"{cleaned_variable} is {is_equal_str} to {category}"

In [76]:
def parse_rules(text):
    
    node_list = text.strip().split("\n")
    depth = [len(line.split("   ")) for line in lines]
    
    rule_list = list()
    
    past_depth = 0
    present_rule = []
    for i, l in zip(sizes, lines):
        if i > past_size:
            cleaned_condition = clean_condition(l, i)
            present_rule.append(cleaned_condition)
        else:
            rule_list.append(present_rule)
            present_rule = present_rule[:i-1]
        
        past_size = i
    
    return rule_list
    
parse_rules(tree_txt)

1 |--- odor_n <= 0.50
2 |   |--- stalk-root_c <= 0.50
3 |   |   |--- stalk-surface-below-ring_y <= 0.50
4 |   |   |   |--- spore-print-color_u <= 0.50
5 |   |   |   |   |--- odor_l <= 0.50
6 |   |   |   |   |   |--- gill-spacing_w <= 0.50
7 |   |   |   |   |   |   |--- class: 0
6 |   |   |   |   |   |--- gill-spacing_w >  0.50
7 |   |   |   |   |   |   |--- stalk-shape_t <= 0.50
8 |   |   |   |   |   |   |   |--- class: 0
7 |   |   |   |   |   |   |--- stalk-shape_t >  0.50
8 |   |   |   |   |   |   |   |--- class: 1
5 |   |   |   |   |--- odor_l >  0.50
6 |   |   |   |   |   |--- class: 1
4 |   |   |   |--- spore-print-color_u >  0.50
5 |   |   |   |   |--- class: 1
3 |   |   |--- stalk-surface-below-ring_y >  0.50
4 |   |   |   |--- class: 1
2 |   |--- stalk-root_c >  0.50
3 |   |   |--- ring-number_o <= 0.50
4 |   |   |   |--- class: 0
3 |   |   |--- ring-number_o >  0.50
4 |   |   |   |--- class: 1
1 |--- odor_n >  0.50
2 |   |--- spore-print-color_r <= 0.50
3 |   |   |--- stalk-su

[['odor is not equal to n',
  'stalk root is not equal to c',
  'stalk surface below ring is not equal to y',
  'spore print color is not equal to u',
  'odor is not equal to l',
  'gill spacing is not equal to w',
  'Then poisonous'],
 ['odor is not equal to n',
  'stalk root is not equal to c',
  'stalk surface below ring is not equal to y',
  'spore print color is not equal to u',
  'odor is not equal to l',
  'stalk shape is not equal to t',
  'Then poisonous'],
 ['odor is not equal to n',
  'stalk root is not equal to c',
  'stalk surface below ring is not equal to y',
  'spore print color is not equal to u',
  'odor is not equal to l',
  'stalk shape is not equal to t',
  'Then edible'],
 ['odor is not equal to n',
  'stalk root is not equal to c',
  'stalk surface below ring is not equal to y',
  'spore print color is not equal to u',
  'Then edible'],
 ['odor is not equal to n',
  'stalk root is not equal to c',
  'stalk surface below ring is not equal to y',
  'Then edible'],


In [28]:
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1294
           1       1.00      1.00      1.00      1387

    accuracy                           1.00      2681
   macro avg       1.00      1.00      1.00      2681
weighted avg       1.00      1.00      1.00      2681

