# Drzewo decyzyjne

In [173]:
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pandas as pd
from IPython.display import display

In [80]:
import csv
import numpy as np

class IrisData:
    def __init__(self, data, target, labels):
        self.data = data
        self.target = target
        self.labels = labels

def load_iris_data():
    data = []
    labels = []
    target = []

    with open('./iris/iris.data', newline='') as csvfile:
        data_reader = csv.reader(csvfile, delimiter=',')
        for row in data_reader:
            if len(row) != 0:
                if row[-1] not in labels:
                    labels.append(row[-1])
                
                target.append(labels.index(row[-1]))
                data.append([float(x) for x in row[:-1]])

    data = np.array(data)
    target = np.array(target)
    labels = np.array(labels)
    
    return IrisData(data, target, labels)

In [81]:
iris_data = load_iris_data()
print(iris_data.labels)

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [82]:
clf = DecisionTreeClassifier(random_state=0)
print((iris_data.data[10]))
cross_val_score(clf, iris_data.data, iris_data.target, cv=10)

[5.4 3.7 1.5 0.2]


array([1.        , 0.93333333, 1.        , 0.93333333, 0.93333333,
       0.86666667, 0.93333333, 1.        , 1.        , 1.        ])

In [174]:
def conduct_tests_tree(folds, scores_dict, random_states, param_name = None, param_value = None):
    res = {score_name: [] for score_name in scores_dict.keys()}

    for random_state in random_states:
        if param_name is None:
            clf = DecisionTreeClassifier(random_state=random_state)
        else:
            clf = DecisionTreeClassifier(random_state=random_state, **{param_name: param_value})
        kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
        results = cross_validate(clf, iris_data.data, iris_data.target, scoring=scores_dict,
                                cv=kf)
        for score_name, score_values in results.items():
            if 'test_' in score_name:
                res[score_name[5:]].append(score_values)

    avg_scores = {}
    std_dev_scores = {}

    for score_name, score_values in res.items():
        scores_array = np.concatenate(score_values)
        avg_scores[score_name] = np.mean(scores_array)
        std_dev_scores[score_name] = np.std(scores_array)

    return avg_scores, std_dev_scores

In [84]:
RANDOM_STATES = [6, 5, 2024]
FOLDS_N = 5
SCORES_DICT = {'accuracy': 'accuracy',
                'precision': 'precision_macro', 
                'recall': 'recall_macro',
                'f1': 'f1_macro'}

Testing for default

In [175]:
test_res = conduct_tests_tree(FOLDS_N, SCORES_DICT, RANDOM_STATES)

for score_name in SCORES_DICT.keys():
    print(f"Average {score_name.capitalize()}: {test_res[0][score_name]}")
    print(f"Standard Deviation of {score_name.capitalize()}: {test_res[1][score_name]}")

Average Accuracy: 0.9422222222222223
Standard Deviation of Accuracy: 0.044666113871648386
Average Precision: 0.9432556332556333
Standard Deviation of Precision: 0.0443131404597508
Average Recall: 0.9436294569627902
Standard Deviation of Recall: 0.044999873947333
Average F1: 0.9403874222658266
Standard Deviation of F1: 0.04647450346863861


In [176]:
PARAMETERS = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5, None]
}

results_by_param = {}

for param_name, param_values in PARAMETERS.items():
    results_for_param = []
    for param_value in param_values:
        avg_scores, std_dev_scores = conduct_tests_tree(FOLDS_N, SCORES_DICT, RANDOM_STATES, param_name, param_value)
        for metric in avg_scores.keys():
            results_for_param.append({
                #'Param Name': param_name,
                'Wartość parametru': param_value,
                'Miary jakości': metric,
                'Średnia': avg_scores[metric],
                'Odch. std.': std_dev_scores[metric]
            })

    results_df = pd.DataFrame(results_for_param)


    results_df = results_df.pivot_table(index=['Wartość parametru'], columns='Miary jakości')
    
    results_df.columns = results_df.columns.swaplevel(0, 1)
    results_df.sort_index(axis=1, level=0, inplace=True)

    results_by_param[param_name] = results_df

    results_df = results_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))

    display(results_df)

    latex_code = results_df.to_latex(multicolumn_format='c')

    print(latex_code)

        

Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
entropy,47,933,53,931,51,933,52,934
gini,45,942,46,940,44,943,45,944


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
entropy & 0,047 & 0,933 & 0,053 & 0,931 & 0,051 & 0,933 & 0,052 & 0,934 \\
gini & 0,045 & 0,942 & 0,046 & 0,940 & 0,044 & 0,943 & 0,045 & 0,944 \\
\bottomrule
\end{tabular}



Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
best,45,942,46,940,44,943,45,944
random,31,956,35,955,35,955,33,959


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
best & 0,045 & 0,942 & 0,046 & 0,940 & 0,044 & 0,943 & 0,045 & 0,944 \\
random & 0,031 & 0,956 & 0,035 & 0,955 & 0,035 & 0,955 & 0,033 & 0,959 \\
\bottomrule
\end{tabular}



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1.0,34,633,16,539,16,482,0,667
2.0,42,938,45,935,43,939,45,938
3.0,40,938,44,935,41,938,44,937
4.0,41,936,42,934,41,937,41,937
5.0,41,936,42,934,41,937,41,937


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
1.000000 & 0,034 & 0,633 & 0,016 & 0,539 & 0,016 & 0,482 & 0,000 & 0,667 \\
2.000000 & 0,042 & 0,938 & 0,045 & 0,935 & 0,043 & 0,939 & 0,045 & 0,938 \\
3.000000 & 0,040 & 0,938 & 0,044 & 0,935 & 0,041 & 0,938 & 0,044 & 0,937 \\
4.000000 & 0,041 & 0,936 & 0,042 & 0,934 & 0,041 & 0,937 & 0,041 & 0,937 \\
5.000000 & 0,041 & 0,936 & 0,042 & 0,934 & 0,041 & 0,937 & 0,041 & 0,937 \\
\bottomrule
\end{tabular}



In [177]:
def conduct_tests_svm(folds, scores_dict, random_states, param_name = None, param_value = None):
    res = {score_name: [] for score_name in scores_dict.keys()}

    for random_state in random_states:
        if param_name is None:
            clf = SVC(random_state=random_state)
        else:
            clf = SVC(random_state=random_state, **{param_name: param_value}, tol=10e-6)
        kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
        results = cross_validate(clf, iris_data.data, iris_data.target, scoring=scores_dict,
                                cv=kf)
        for score_name, score_values in results.items():
            if 'test_' in score_name:
                res[score_name[5:]].append(score_values)

    avg_scores = {}
    std_dev_scores = {}

    for score_name, score_values in res.items():
        scores_array = np.concatenate(score_values)
        avg_scores[score_name] = np.mean(scores_array)
        std_dev_scores[score_name] = np.std(scores_array)

    return avg_scores, std_dev_scores

In [178]:
test_res = conduct_tests_svm(FOLDS_N, SCORES_DICT, RANDOM_STATES)

for score_name in SCORES_DICT.keys():
    print(f"Average {score_name.capitalize()}: {test_res[0][score_name]}")
    print(f"Standard Deviation of {score_name.capitalize()}: {test_res[1][score_name]}")

Average Accuracy: 0.9622222222222222
Standard Deviation of Accuracy: 0.0341384255460827
Average Precision: 0.9613583330249997
Standard Deviation of Precision: 0.03543533171641029
Average Recall: 0.9623702840369507
Standard Deviation of Recall: 0.03536243510349699
Average F1: 0.9601063605816151
Standard Deviation of F1: 0.0364433482627812


In [181]:
PARAMETERS = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'max_iter': np.arange(1, 16, 1)
}

results_by_param = {}

for param_name, param_values in PARAMETERS.items():
    results_for_param = []
    for param_value in param_values:
        avg_scores, std_dev_scores = conduct_tests_svm(FOLDS_N, SCORES_DICT, RANDOM_STATES, param_name, param_value)
        for metric in avg_scores.keys():
            results_for_param.append({
                #'Param Name': param_name,
                'Wartość parametru': param_value,
                'Miary jakości': metric,
                'Średnia': avg_scores[metric],
                'Odch. std.': std_dev_scores[metric]
            })

    results_df = pd.DataFrame(results_for_param)


    results_df = results_df.pivot_table(index=['Wartość parametru'], columns='Miary jakości')
    
    results_df.columns = results_df.columns.swaplevel(0, 1)
    results_df.sort_index(axis=1, level=0, inplace=True)

    results_by_param[param_name] = results_df

    results_df = results_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))

    display(results_df)

    latex_code = results_df.to_latex(multicolumn_format='c')

    print(latex_code)

Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.1,42,913,47,909,37,921,40,918
1.0,34,962,36,960,35,961,35,962
10.0,25,973,28,971,28,972,27,974


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
0.100000 & 0,042 & 0,913 & 0,047 & 0,909 & 0,037 & 0,921 & 0,040 & 0,918 \\
1.000000 & 0,034 & 0,962 & 0,036 & 0,960 & 0,035 & 0,961 & 0,035 & 0,962 \\
10.000000 & 0,025 & 0,973 & 0,028 & 0,971 & 0,028 & 0,972 & 0,027 & 0,974 \\
\bottomrule
\end{tabular}



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
linear,24,980,25,979,24,979,24,981
poly,24,967,27,965,27,966,25,968
rbf,34,962,36,960,35,961,35,962
sigmoid,66,242,32,128,21,81,74,314




\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
linear & 0,024 & 0,980 & 0,025 & 0,979 & 0,024 & 0,979 & 0,024 & 0,981 \\
poly & 0,024 & 0,967 & 0,027 & 0,965 & 0,027 & 0,966 & 0,025 & 0,968 \\
rbf & 0,034 & 0,962 & 0,036 & 0,960 & 0,035 & 0,961 & 0,035 & 0,962 \\
sigmoid & 0,066 & 0,242 & 0,032 & 0,128 & 0,021 & 0,081 & 0,074 & 0,314 \\
\bottomrule
\end{tabular}





Miary jakości,accuracy,accuracy,f1,f1,precision,precision,recall,recall
Unnamed: 0_level_1,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia,Odch. std.,Średnia
Wartość parametru,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,47,911,53,907,43,922,44,915
2,37,942,40,940,33,946,33,945
3,55,927,67,922,45,936,52,930
4,43,944,49,941,39,949,39,947
5,31,956,35,953,30,957,31,956
6,36,947,41,943,34,950,35,948
7,24,971,26,970,26,970,24,973
8,34,967,40,964,35,967,32,969
9,24,980,26,979,25,980,23,980
10,30,973,33,972,33,972,31,974


\begin{tabular}{lllllllll}
\toprule
Miary jakości & \multicolumn{2}{c}{accuracy} & \multicolumn{2}{c}{f1} & \multicolumn{2}{c}{precision} & \multicolumn{2}{c}{recall} \\
 & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia & Odch. std. & Średnia \\
Wartość parametru &  &  &  &  &  &  &  &  \\
\midrule
1 & 0,047 & 0,911 & 0,053 & 0,907 & 0,043 & 0,922 & 0,044 & 0,915 \\
2 & 0,037 & 0,942 & 0,040 & 0,940 & 0,033 & 0,946 & 0,033 & 0,945 \\
3 & 0,055 & 0,927 & 0,067 & 0,922 & 0,045 & 0,936 & 0,052 & 0,930 \\
4 & 0,043 & 0,944 & 0,049 & 0,941 & 0,039 & 0,949 & 0,039 & 0,947 \\
5 & 0,031 & 0,956 & 0,035 & 0,953 & 0,030 & 0,957 & 0,031 & 0,956 \\
6 & 0,036 & 0,947 & 0,041 & 0,943 & 0,034 & 0,950 & 0,035 & 0,948 \\
7 & 0,024 & 0,971 & 0,026 & 0,970 & 0,026 & 0,970 & 0,024 & 0,973 \\
8 & 0,034 & 0,967 & 0,040 & 0,964 & 0,035 & 0,967 & 0,032 & 0,969 \\
9 & 0,024 & 0,980 & 0,026 & 0,979 & 0,025 & 0,980 & 0,023 & 0,980 \\
10 & 0,030 & 0,973 & 0,033 & 0,972 & 0,033 & 0,972 & 0,031 