In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import pickle
import tree_parser

from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

In [2]:
# Load the pre-computed train/test partitions stored as a pickled dictionary.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open("data/splited_data.pickle", "rb") as file:
    data = pickle.load(file)

# Unpack once the file handle has been released.
X_train, y_train = data["X_train"], data["y_train"]
X_test, y_test = data["X_test"], data["y_test"]

In [3]:
# Three-stage estimator:
#   1. replace "?" placeholders with each column's most frequent value,
#   2. one-hot encode every feature, dropping the first category of each,
#   3. fit a decision tree with a fixed seed so the result is repeatable.
steps = [
    ("imputation", SimpleImputer(missing_values="?", strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first")),
    ("classifier", DecisionTreeClassifier(random_state=21)),
]
pipeline = Pipeline(steps=steps)

# Kept as the final expression so the notebook displays the fitted pipeline.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('imputation',
                 SimpleImputer(missing_values='?', strategy='most_frequent')),
                ('encoder', OneHotEncoder(drop='first')),
                ('classifier', DecisionTreeClassifier(random_state=21))])

In [4]:
# Look the fitted steps up by name rather than by position in the pipeline.
encoder = pipeline.named_steps["encoder"]
classifier = pipeline.named_steps["classifier"]

# One-hot column names derived from the original column names; these label
# the split conditions in the textual dump of the tree.
feature_names = encoder.get_feature_names_out(X_train.columns).tolist()
rules_dump = export_text(classifier, feature_names=feature_names)
print(rules_dump)

|--- odor_n <= 0.50
|   |--- stalk-root_c <= 0.50
|   |   |--- stalk-surface-below-ring_y <= 0.50
|   |   |   |--- spore-print-color_u <= 0.50
|   |   |   |   |--- odor_l <= 0.50
|   |   |   |   |   |--- gill-spacing_w <= 0.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- gill-spacing_w >  0.50
|   |   |   |   |   |   |--- stalk-shape_t <= 0.50
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- stalk-shape_t >  0.50
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- odor_l >  0.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- spore-print-color_u >  0.50
|   |   |   |   |--- class: 1
|   |   |--- stalk-surface-below-ring_y >  0.50
|   |   |   |--- class: 1
|   |--- stalk-root_c >  0.50
|   |   |--- ring-number_o <= 0.50
|   |   |   |--- class: 0
|   |   |--- ring-number_o >  0.50
|   |   |   |--- class: 1
|--- odor_n >  0.50
|   |--- spore-print-color_r <= 0.50
|   |   |--- stalk-surface-below-ring_y <= 0.50
|   |   |   |--- cap-surf

In [6]:
# Turn the textual tree dump into one sentence per branch via the local
# tree_parser helpers, printing a 20-character "#" rule after each sentence.
# NOTE(review): in the captured output, branches that differ only in the side
# of the final split (e.g. stalk-shape_t <= 0.50 vs > 0.50) are rendered with
# the same "not equal to" wording but opposite predictions -- verify how
# tree_parser handles the "> 0.50" side.
tree_txt = export_text(pipeline[-1], feature_names=feature_names)
tree_nodes = tree_parser.parse_rules(tree_txt)
for branch in tree_nodes:
    conditions_dict, prediction = tree_parser.extract_dictionary_rule(branch)
    branch_txt = tree_parser.gen_text(conditions_dict, prediction)
    print(branch_txt, "#" * 20, sep="\n")

If odor not equal to n and l, and stalk root not equal to c, and stalk surface below ring not equal to y, and spore print color not equal to u, and gill spacing not equal to w then 0
####################
If odor not equal to n and l, and stalk root not equal to c, and stalk surface below ring not equal to y, and spore print color not equal to u, and stalk shape not equal to t then 0
####################
If odor not equal to n and l, and stalk root not equal to c, and stalk surface below ring not equal to y, and spore print color not equal to u, and stalk shape not equal to t then 1
####################
If odor not equal to n, and stalk root not equal to c, and stalk surface below ring not equal to y, and spore print color not equal to u then 1
####################
If odor not equal to n, and stalk root not equal to c, and stalk surface below ring not equal to y then 1
####################
If odor not equal to n, and stalk root not equal to c then 1
####################
If odor not equa

In [7]:
# Predict on the held-out split and print the per-class
# precision / recall / f1 summary.
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1294
           1       1.00      1.00      1.00      1387

    accuracy                           1.00      2681
   macro avg       1.00      1.00      1.00      2681
weighted avg       1.00      1.00      1.00      2681

