# Drzewo decyzyjne

In [59]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from IPython.display import display

In [6]:
import csv
import numpy as np

class IrisData:
    def __init__(self, data, target, labels):
        self.data = data
        self.target = target
        self.labels = labels

def load_iris_data():
    data = []
    labels = []
    target = []

    with open('./iris/iris.data', newline='') as csvfile:
        data_reader = csv.reader(csvfile, delimiter=',')
        for row in data_reader:
            if len(row) != 0:
                if row[-1] not in labels:
                    labels.append(row[-1])
                
                target.append(labels.index(row[-1]))
                data.append([float(x) for x in row[:-1]])

    data = np.array(data)
    target = np.array(target)
    labels = np.array(labels)
    
    return IrisData(data, target, labels)

In [8]:
iris_data = load_iris_data()
print(iris_data.labels)

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [9]:
clf = DecisionTreeClassifier(random_state=0)
print((iris_data.data[10]))
#print(iris.target)
cross_val_score(clf, iris_data.data, iris_data.target, cv=10)

[5.4 3.7 1.5 0.2]


array([1.        , 0.93333333, 1.        , 0.93333333, 0.93333333,
       0.86666667, 0.93333333, 1.        , 1.        , 1.        ])

In [71]:
def conduct_tests(folds, scores_dict, random_states, param_name = None, param_value = None):
    res = {score_name: [] for score_name in scores_dict.keys()}

    for random_state in random_states:
        if param_name is None:
            clf = DecisionTreeClassifier(random_state=random_state)
        else:
            clf = DecisionTreeClassifier(random_state=random_state, **{param_name: param_value})
        results = cross_validate(clf, iris_data.data, iris_data.target, scoring=scores_dict,
                                cv=folds)
        for score_name, score_values in results.items():
            if 'test_' in score_name:
                res[score_name[5:]].append(score_values)

    avg_scores = {}
    std_dev_scores = {}

    for score_name, score_values in res.items():
        scores_array = np.concatenate(score_values)
        avg_scores[score_name] = np.mean(scores_array)
        std_dev_scores[score_name] = np.std(scores_array)

    return avg_scores, std_dev_scores

In [53]:
RANDOM_STATES = [6, 5, 2024]
FOLDS_N = 5
SCORES_DICT = {'accuracy': 'accuracy',
                'precision': 'precision_macro', 
                'recall': 'recall_macro',
                'f1': 'f1_macro'}

Testing for default

In [None]:
test_res = conduct_tests(FOLDS_N, SCORES_DICT, RANDOM_STATES)

for score_name in SCORES_DICT.keys():
    print(f"Average {score_name.capitalize()}: {test_res[0][score_name]}")
    print(f"Standard Deviation of {score_name.capitalize()}: {test_res[1][score_name]}")

In [76]:
PARAMETERS = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5, None]
}

results_by_param = {}

for param_name, param_values in PARAMETERS.items():
    results_for_param = []
    for param_value in param_values:
        avg_scores, std_dev_scores = conduct_tests(FOLDS_N, SCORES_DICT, RANDOM_STATES, param_name, param_value)
        results_for_param.append(
            {
            'Param Value': param_value,
            **avg_scores,
            **std_dev_scores
        }
        )

    results_df = pd.DataFrame(results_for_param)
    results_by_param[param_name] = results_df

    display(results_df)

    print(results_df.to_latex())

        

Unnamed: 0,Param Value,accuracy,precision,recall,f1
0,gini,0.033259,0.032895,0.033259,0.033334
1,entropy,0.033993,0.033757,0.033993,0.034059


\begin{tabular}{llrrrr}
\toprule
 & Param Value & accuracy & precision & recall & f1 \\
\midrule
0 & gini & 0.033259 & 0.032895 & 0.033259 & 0.033334 \\
1 & entropy & 0.033993 & 0.033757 & 0.033993 & 0.034059 \\
\bottomrule
\end{tabular}



Unnamed: 0,Param Value,accuracy,precision,recall,f1
0,best,0.033259,0.032895,0.033259,0.033334
1,random,0.030307,0.024994,0.030307,0.030909


\begin{tabular}{llrrrr}
\toprule
 & Param Value & accuracy & precision & recall & f1 \\
\midrule
0 & best & 0.033259 & 0.032895 & 0.033259 & 0.033334 \\
1 & random & 0.030307 & 0.024994 & 0.030307 & 0.030909 \\
\bottomrule
\end{tabular}



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Unnamed: 0,Param Value,accuracy,precision,recall,f1
0,1.0,1.110223e-16,0.0,1.110223e-16,0.0
1,2.0,0.04714045,0.047254,0.04714045,0.047164
2,3.0,0.02572408,0.023354,0.02572408,0.025885
3,4.0,0.03624335,0.035604,0.03624335,0.036325
4,5.0,0.03325918,0.032895,0.03325918,0.033334
5,,0.03325918,0.032895,0.03325918,0.033334


\begin{tabular}{lrrrrr}
\toprule
 & Param Value & accuracy & precision & recall & f1 \\
\midrule
0 & 1.000000 & 0.000000 & 0.000000 & 0.000000 & 0.000000 \\
1 & 2.000000 & 0.047140 & 0.047254 & 0.047140 & 0.047164 \\
2 & 3.000000 & 0.025724 & 0.023354 & 0.025724 & 0.025885 \\
3 & 4.000000 & 0.036243 & 0.035604 & 0.036243 & 0.036325 \\
4 & 5.000000 & 0.033259 & 0.032895 & 0.033259 & 0.033334 \\
5 & NaN & 0.033259 & 0.032895 & 0.033259 & 0.033334 \\
\bottomrule
\end{tabular}

