# Drzewo decyzyjne

In [154]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pandas as pd
from IPython.display import display

In [80]:
import csv
import numpy as np

class IrisData:
    def __init__(self, data, target, labels):
        self.data = data
        self.target = target
        self.labels = labels

def load_iris_data():
    data = []
    labels = []
    target = []

    with open('./iris/iris.data', newline='') as csvfile:
        data_reader = csv.reader(csvfile, delimiter=',')
        for row in data_reader:
            if len(row) != 0:
                if row[-1] not in labels:
                    labels.append(row[-1])
                
                target.append(labels.index(row[-1]))
                data.append([float(x) for x in row[:-1]])

    data = np.array(data)
    target = np.array(target)
    labels = np.array(labels)
    
    return IrisData(data, target, labels)

In [81]:
iris_data = load_iris_data()
print(iris_data.labels)

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [82]:
clf = DecisionTreeClassifier(random_state=0)
print((iris_data.data[10]))
cross_val_score(clf, iris_data.data, iris_data.target, cv=10)

[5.4 3.7 1.5 0.2]


array([1.        , 0.93333333, 1.        , 0.93333333, 0.93333333,
       0.86666667, 0.93333333, 1.        , 1.        , 1.        ])

In [83]:
def conduct_tests_tree(folds, scores_dict, random_states, param_name = None, param_value = None):
    res = {score_name: [] for score_name in scores_dict.keys()}

    for random_state in random_states:
        if param_name is None:
            clf = DecisionTreeClassifier(random_state=random_state)
        else:
            clf = DecisionTreeClassifier(random_state=random_state, **{param_name: param_value})
        results = cross_validate(clf, iris_data.data, iris_data.target, scoring=scores_dict,
                                cv=folds)
        for score_name, score_values in results.items():
            if 'test_' in score_name:
                res[score_name[5:]].append(score_values)

    avg_scores = {}
    std_dev_scores = {}

    for score_name, score_values in res.items():
        scores_array = np.concatenate(score_values)
        avg_scores[score_name] = np.mean(scores_array)
        std_dev_scores[score_name] = np.std(scores_array)

    return avg_scores, std_dev_scores

In [84]:
RANDOM_STATES = [6, 5, 2024]
FOLDS_N = 5
SCORES_DICT = {'accuracy': 'accuracy',
                'precision': 'precision_macro', 
                'recall': 'recall_macro',
                'f1': 'f1_macro'}

Testing for default

In [85]:
test_res = conduct_tests_tree(FOLDS_N, SCORES_DICT, RANDOM_STATES)

for score_name in SCORES_DICT.keys():
    print(f"Average {score_name.capitalize()}: {test_res[0][score_name]}")
    print(f"Standard Deviation of {score_name.capitalize()}: {test_res[1][score_name]}")

Average Accuracy: 0.9577777777777778
Standard Deviation of Accuracy: 0.033259176771323916
Average Precision: 0.9598653198653199
Standard Deviation of Precision: 0.032895054256567316
Average Recall: 0.9577777777777778
Standard Deviation of Recall: 0.03325917677132392
Average F1: 0.9576830966304649
Standard Deviation of F1: 0.033334371371323786


In [148]:
PARAMETERS = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5, None]
}

results_by_param = {}

for param_name, param_values in PARAMETERS.items():
    results_for_param = []
    for param_value in param_values:
        avg_scores, std_dev_scores = conduct_tests_tree(FOLDS_N, SCORES_DICT, RANDOM_STATES, param_name, param_value)
        for metric in avg_scores.keys():
            results_for_param.append({
                #'Param Name': param_name,
                'Wartość parametru': param_value,
                'Miary jakości': metric,
                'Średnia': avg_scores[metric],
                'Odch. std.': std_dev_scores[metric]
            })

    results_df = pd.DataFrame(results_for_param)


    results_df = results_df.pivot_table(index=['Wartość parametru'], columns='Miary jakości')
    
    results_df.columns = results_df.columns.swaplevel(0, 1)
    results_df.sort_index(axis=1, level=0, inplace=True)

    results_by_param[param_name] = results_df

    results_df = results_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))

    display(results_df)

    latex_code = results_df.to_latex(multicolumn_format='c')

    print(latex_code)

        

Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
entropy,34,953,34,953,34,955,34,953
gini,33,958,33,958,33,960,33,958


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
entropy & 0,034 & 0,953 & 0,034 & 0,953 & 0,034 & 0,955 & 0,034 & 0,953 \\
gini & 0,033 & 0,958 & 0,033 & 0,958 & 0,033 & 0,960 & 0,033 & 0,958 \\
\bottomrule
\end{tabular}



Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
best,33,958,33,958,33,960,33,958
random,30,960,31,960,25,965,30,960


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
best & 0,033 & 0,958 & 0,033 & 0,958 & 0,033 & 0,960 & 0,033 & 0,958 \\
random & 0,030 & 0,960 & 0,031 & 0,960 & 0,025 & 0,965 & 0,030 & 0,960 \\
\bottomrule
\end{tabular}



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1.0,0,667,0,556,0,500,0,667
2.0,47,933,47,933,47,934,47,933
3.0,26,964,26,964,23,968,26,964
4.0,36,962,36,962,36,964,36,962
5.0,33,958,33,958,33,960,33,958


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
1.000000 & 0,000 & 0,667 & 0,000 & 0,556 & 0,000 & 0,500 & 0,000 & 0,667 \\
2.000000 & 0,047 & 0,933 & 0,047 & 0,933 & 0,047 & 0,934 & 0,047 & 0,933 \\
3.000000 & 0,026 & 0,964 & 0,026 & 0,964 & 0,023 & 0,968 & 0,026 & 0,964 \\
4.000000 & 0,036 & 0,962 & 0,036 & 0,962 & 0,036 & 0,964 & 0,036 & 0,962 \\
5.000000 & 0,033 & 0,958 & 0,033 & 0,958 & 0,033 & 0,960 & 0,033 & 0,958 \\
\bottomrule
\end{tabular}



In [157]:
def conduct_tests_svm(folds, scores_dict, random_states, param_name = None, param_value = None):
    res = {score_name: [] for score_name in scores_dict.keys()}

    for random_state in random_states:
        if param_name is None:
            clf = SVC(random_state=random_state)
        else:
            clf = SVC(random_state=random_state, **{param_name: param_value}, tol=10e-6)
        results = cross_validate(clf, iris_data.data, iris_data.target, scoring=scores_dict,
                                cv=folds)
        for score_name, score_values in results.items():
            if 'test_' in score_name:
                res[score_name[5:]].append(score_values)

    avg_scores = {}
    std_dev_scores = {}

    for score_name, score_values in res.items():
        scores_array = np.concatenate(score_values)
        avg_scores[score_name] = np.mean(scores_array)
        std_dev_scores[score_name] = np.std(scores_array)

    return avg_scores, std_dev_scores

In [158]:
test_res = conduct_tests_svm(FOLDS_N, SCORES_DICT, RANDOM_STATES)

for score_name in SCORES_DICT.keys():
    print(f"Average {score_name.capitalize()}: {test_res[0][score_name]}")
    print(f"Standard Deviation of {score_name.capitalize()}: {test_res[1][score_name]}")

Average Accuracy: 0.9666666666666667
Standard Deviation of Accuracy: 0.02108185106778919
Average Precision: 0.9684848484848485
Standard Deviation of Precision: 0.021134055483833604
Average Recall: 0.9666666666666667
Standard Deviation of Recall: 0.021081851067789228
Average F1: 0.9666165413533834
Standard Deviation of F1: 0.021081890794608538


In [172]:
PARAMETERS = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'max_iter': np.arange(1, 11, 1)
}

results_by_param = {}

for param_name, param_values in PARAMETERS.items():
    results_for_param = []
    for param_value in param_values:
        avg_scores, std_dev_scores = conduct_tests_svm(FOLDS_N, SCORES_DICT, RANDOM_STATES, param_name, param_value)
        for metric in avg_scores.keys():
            results_for_param.append({
                #'Param Name': param_name,
                'Wartość parametru': param_value,
                'Miary jakości': metric,
                'Średnia': avg_scores[metric],
                'Odch. std.': std_dev_scores[metric]
            })

    results_df = pd.DataFrame(results_for_param)


    results_df = results_df.pivot_table(index=['Wartość parametru'], columns='Miary jakości')
    
    results_df.columns = results_df.columns.swaplevel(0, 1)
    results_df.sort_index(axis=1, level=0, inplace=True)

    results_by_param[param_name] = results_df

    results_df = results_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))

    display(results_df)

    latex_code = results_df.to_latex(multicolumn_format='c')

    print(latex_code)

Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,27,920,27,919,26,928,27,920
1.0,21,967,21,967,21,968,21,967
10.0,16,980,16,980,15,982,16,980


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
0.100000 & 0,027 & 0,920 & 0,027 & 0,919 & 0,026 & 0,928 & 0,027 & 0,920 \\
1.000000 & 0,021 & 0,967 & 0,021 & 0,967 & 0,021 & 0,968 & 0,021 & 0,967 \\
10.000000 & 0,016 & 0,980 & 0,016 & 0,980 & 0,015 & 0,982 & 0,016 & 0,980 \\
\bottomrule
\end{tabular}



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
linear,16,980,16,980,15,982,16,980
poly,16,980,16,980,15,982,16,980
rbf,21,967,21,967,21,968,21,967
sigmoid,60,67,35,43,25,32,60,67




\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
linear & 0,016 & 0,980 & 0,016 & 0,980 & 0,015 & 0,982 & 0,016 & 0,980 \\
poly & 0,016 & 0,980 & 0,016 & 0,980 & 0,015 & 0,982 & 0,016 & 0,980 \\
rbf & 0,021 & 0,967 & 0,021 & 0,967 & 0,021 & 0,968 & 0,021 & 0,967 \\
sigmoid & 0,060 & 0,067 & 0,035 & 0,043 & 0,025 & 0,032 & 0,060 & 0,067 \\
\bottomrule
\end{tabular}





Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,44,940,46,939,32,950,44,940
2,33,960,33,960,29,964,33,960
3,56,933,57,932,46,944,56,933
4,45,953,46,953,40,959,45,953
5,39,960,40,959,30,967,39,960
6,39,960,40,960,32,965,39,960
7,37,967,37,966,28,972,37,967
8,37,967,37,966,28,972,37,967
9,37,967,37,966,28,972,37,967
10,16,980,16,980,15,982,16,980


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
1 & 0,044 & 0,940 & 0,046 & 0,939 & 0,032 & 0,950 & 0,044 & 0,940 \\
2 & 0,033 & 0,960 & 0,033 & 0,960 & 0,029 & 0,964 & 0,033 & 0,960 \\
3 & 0,056 & 0,933 & 0,057 & 0,932 & 0,046 & 0,944 & 0,056 & 0,933 \\
4 & 0,045 & 0,953 & 0,046 & 0,953 & 0,040 & 0,959 & 0,045 & 0,953 \\
5 & 0,039 & 0,960 & 0,040 & 0,959 & 0,030 & 0,967 & 0,039 & 0,960 \\
6 & 0,039 & 0,960 & 0,040 & 0,960 & 0,032 & 0,965 & 0,039 & 0,960 \\
7 & 0,037 & 0,967 & 0,037 & 0,966 & 0,028 & 0,972 & 0,037 & 0,967 \\
8 & 0,037 & 0,967 & 0,037 & 0,966 & 0,028 & 0,972 & 0,037 & 0,967 \\
9 & 0,037 & 0,967 & 0,037 & 0,966 & 0,028 & 0,972 & 0,037 & 0,967 \\
10 & 0,016 & 0,980 & 0,016 & 0,980 & 0,015 & 0,982 & 0,016 