# Drzewo decyzyjne

In [173]:
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pandas as pd
from IPython.display import display

In [80]:
import csv
import numpy as np

class IrisData:
    def __init__(self, data, target, labels):
        self.data = data
        self.target = target
        self.labels = labels

def load_iris_data():
    data = []
    labels = []
    target = []

    with open('./iris/iris.data', newline='') as csvfile:
        data_reader = csv.reader(csvfile, delimiter=',')
        for row in data_reader:
            if len(row) != 0:
                if row[-1] not in labels:
                    labels.append(row[-1])
                
                target.append(labels.index(row[-1]))
                data.append([float(x) for x in row[:-1]])

    data = np.array(data)
    target = np.array(target)
    labels = np.array(labels)
    
    return IrisData(data, target, labels)

In [81]:
iris_data = load_iris_data()
print(iris_data.labels)

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [82]:
clf = DecisionTreeClassifier(random_state=0)
print((iris_data.data[10]))
cross_val_score(clf, iris_data.data, iris_data.target, cv=10)

[5.4 3.7 1.5 0.2]


array([1.        , 0.93333333, 1.        , 0.93333333, 0.93333333,
       0.86666667, 0.93333333, 1.        , 1.        , 1.        ])

In [185]:
def conduct_tests_tree(folds, scores_dict, random_states, params = None):
    res = {score_name: [] for score_name in scores_dict.keys()}

    for random_state in random_states:
        if params is None:
            clf = DecisionTreeClassifier(random_state=random_state)
        else:
            clf = DecisionTreeClassifier(random_state=random_state, **params)
        kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
        results = cross_validate(clf, iris_data.data, iris_data.target, scoring=scores_dict,
                                cv=kf)
        for score_name, score_values in results.items():
            if 'test_' in score_name:
                res[score_name[5:]].append(score_values)

    avg_scores = {}
    std_dev_scores = {}

    for score_name, score_values in res.items():
        scores_array = np.concatenate(score_values)
        avg_scores[score_name] = np.mean(scores_array)
        std_dev_scores[score_name] = np.std(scores_array)

    return avg_scores, std_dev_scores

In [84]:
RANDOM_STATES = [6, 5, 2024]
FOLDS_N = 5
SCORES_DICT = {'accuracy': 'accuracy',
                'precision': 'precision_macro', 
                'recall': 'recall_macro',
                'f1': 'f1_macro'}

Testing for default

In [175]:
test_res = conduct_tests_tree(FOLDS_N, SCORES_DICT, RANDOM_STATES)

for score_name in SCORES_DICT.keys():
    print(f"Average {score_name.capitalize()}: {test_res[0][score_name]}")
    print(f"Standard Deviation of {score_name.capitalize()}: {test_res[1][score_name]}")

Average Accuracy: 0.9422222222222223
Standard Deviation of Accuracy: 0.044666113871648386
Average Precision: 0.9432556332556333
Standard Deviation of Precision: 0.0443131404597508
Average Recall: 0.9436294569627902
Standard Deviation of Recall: 0.044999873947333
Average F1: 0.9403874222658266
Standard Deviation of F1: 0.04647450346863861


In [191]:
PARAMETERS = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5, None]
}

results_by_params = {}

for criterion_value in PARAMETERS['criterion']:
    for splitter_value in PARAMETERS['splitter']:
        for max_depth_value in PARAMETERS['max_depth']:
            results_for_params = []
            avg_scores, std_dev_scores = conduct_tests_tree(FOLDS_N, SCORES_DICT, RANDOM_STATES, {'criterion': criterion_value, 'splitter': splitter_value, 'max_depth': max_depth_value})
            for metric in avg_scores.keys():
                results_for_params.append({
                    'Criterion': criterion_value,
                    'Splitter': splitter_value,
                    'Max Depth': max_depth_value,
                    'Quality Metric': metric,
                    'Mean': avg_scores[metric],
                    'Std Dev': std_dev_scores[metric]
                })

            results_df = pd.DataFrame(results_for_params)
            #results_df = results_df.pivot_table(index=['Criterion', 'Splitter', 'Max Depth'], columns='Quality Metric')
            results_by_params[(criterion_value, splitter_value, max_depth_value)] = results_df

# Display the results for each combination of parameters
#for params, results_df in results_by_params.items():
    #display(results_df)

# Concatenate all dataframes into one
all_results_df = pd.concat(results_by_params.values())

# Group by each parameter and calculate mean and standard deviation for each metric
summary_df = all_results_df.groupby(['Criterion', 'Splitter', 'Max Depth']).agg({'Mean': 'mean', 'Std Dev': 'mean'})
#summary_df = summary_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))
# Display the summary table
display(summary_df)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean,Std Dev
Criterion,Splitter,Max Depth,Unnamed: 3_level_1,Unnamed: 4_level_1
entropy,best,1,0.580273,0.016515
entropy,best,2,0.937603,0.043614
entropy,best,3,0.937091,0.045612
entropy,best,4,0.930836,0.04597
entropy,best,5,0.935381,0.046626
entropy,random,1,0.500166,0.084329
entropy,random,2,0.859849,0.069215
entropy,random,3,0.932063,0.046481
entropy,random,4,0.941145,0.041242
entropy,random,5,0.945798,0.042728


In [199]:
# Pivot the DataFrame for Criterion
criterion_df = summary_df.reset_index().pivot_table(index='Criterion', values=['Mean', 'Std Dev'], aggfunc='mean')
criterion_df = criterion_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))
display(criterion_df)

# Pivot the DataFrame for Splitter
splitter_df = summary_df.reset_index().pivot_table(index='Splitter', values=['Mean', 'Std Dev'], aggfunc='mean')
splitter_df = splitter_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))
display(splitter_df)

# Pivot the DataFrame for Max Depth
max_depth_df = summary_df.reset_index().pivot_table(index='Max Depth', values=['Mean', 'Std Dev'], aggfunc='mean')
max_depth_df = max_depth_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))
display(max_depth_df)

Unnamed: 0_level_0,Mean,Std Dev
Criterion,Unnamed: 1_level_1,Unnamed: 2_level_1
entropy,850,48
gini,850,46


Unnamed: 0_level_0,Mean,Std Dev
Splitter,Unnamed: 1_level_1,Unnamed: 2_level_1
best,865,38
random,836,56


Unnamed: 0_level_0,Mean,Std Dev
Max Depth,Unnamed: 1_level_1,Unnamed: 2_level_1
1,537,51
2,898,56
3,937,44
4,938,42
5,940,43


In [209]:
def conduct_tests_svm(folds, scores_dict, random_states, params = None):
    res = {score_name: [] for score_name in scores_dict.keys()}

    for random_state in random_states:
        if params is None:
            clf = SVC(random_state=random_state)
        else:
            clf = SVC(random_state=random_state, **params, tol=10e-6)
        kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
        results = cross_validate(clf, iris_data.data, iris_data.target, scoring=scores_dict,
                                cv=kf)
        for score_name, score_values in results.items():
            if 'test_' in score_name:
                res[score_name[5:]].append(score_values)

    avg_scores = {}
    std_dev_scores = {}

    for score_name, score_values in res.items():
        scores_array = np.concatenate(score_values)
        avg_scores[score_name] = np.mean(scores_array)
        std_dev_scores[score_name] = np.std(scores_array)

    return avg_scores, std_dev_scores

In [178]:
test_res = conduct_tests_svm(FOLDS_N, SCORES_DICT, RANDOM_STATES)

for score_name in SCORES_DICT.keys():
    print(f"Average {score_name.capitalize()}: {test_res[0][score_name]}")
    print(f"Standard Deviation of {score_name.capitalize()}: {test_res[1][score_name]}")

Average Accuracy: 0.9622222222222222
Standard Deviation of Accuracy: 0.0341384255460827
Average Precision: 0.9613583330249997
Standard Deviation of Precision: 0.03543533171641029
Average Recall: 0.9623702840369507
Standard Deviation of Recall: 0.03536243510349699
Average F1: 0.9601063605816151
Standard Deviation of F1: 0.0364433482627812


In [234]:
PARAMETERS = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'max_iter': np.arange(1, 51, 5)
}

results_by_params = {}


for c_value in PARAMETERS['C']:
    for kernel_value in PARAMETERS['kernel']:
        for max_iter_value in PARAMETERS['max_iter']:
            results_for_params = []
            avg_scores, std_dev_scores = conduct_tests_svm(FOLDS_N, SCORES_DICT, RANDOM_STATES, {'C': c_value, 'kernel': kernel_value, 'max_iter': max_iter_value})
            for metric in avg_scores.keys():
                results_for_params.append({
                    'C': c_value,
                    'Kernel': kernel_value,
                    'Max iter': max_iter_value,
                    'Quality Metric': metric,
                    'Mean': avg_scores[metric],
                    'Std Dev': std_dev_scores[metric]
                })

            results_df = pd.DataFrame(results_for_params)
            #results_df = results_df.pivot_table(index=['Criterion', 'Splitter', 'Max Depth'], columns='Quality Metric')
            results_by_params[(c_value, kernel_value, max_iter_value)] = results_df

# Display the results for each combination of parameters
#for params, results_df in results_by_params.items():
#display(results_df)

# Concatenate all dataframes into one
all_results_df = pd.concat(results_by_params.values())

# Group by each parameter and calculate mean and standard deviation for each metric
summary_df = all_results_df.groupby(['C', 'Kernel', 'Max iter']).agg({'Mean': 'mean', 'Std Dev': 'mean'})
#summary_df = summary_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))
# Display the summary table
display(summary_df)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean,Std Dev
C,Kernel,Max iter,Unnamed: 3_level_1,Unnamed: 4_level_1
0.1,linear,1,0.920105,0.047956
0.1,linear,6,0.950992,0.034718
0.1,linear,11,0.973008,0.026003
0.1,linear,16,0.964139,0.041600
0.1,linear,21,0.968440,0.034952
...,...,...,...,...
10.0,sigmoid,26,0.026264,0.027500
10.0,sigmoid,31,0.026366,0.027506
10.0,sigmoid,36,0.029988,0.032677
10.0,sigmoid,41,0.104140,0.083438


In [235]:
# Pivot the DataFrame for Criterion
c_df = summary_df.reset_index().pivot_table(index='C', values=['Mean', 'Std Dev'], aggfunc='mean')
c_df = c_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))
display(c_df)

# Pivot the DataFrame for Splitter
kernel_df = summary_df.reset_index().pivot_table(index='Kernel', values=['Mean', 'Std Dev'], aggfunc='mean')
kernel_df = kernel_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))
display(kernel_df)

# Pivot the DataFrame for Max Depth
max_iter_df = summary_df[summary_df.index.get_level_values('Kernel') != 'sigmoid']
max_iter_df = max_iter_df.reset_index().pivot_table(index=['Max iter'], values=['Mean', 'Std Dev'], aggfunc='mean')
max_iter_df = max_iter_df.map(lambda x: '{:.3f}'.format(x).replace('.', ','))
display(max_iter_df)

Unnamed: 0_level_0,Mean,Std Dev
C,Unnamed: 1_level_1,Unnamed: 2_level_1
0.1,725,40
1.0,731,38
10.0,718,46


Unnamed: 0_level_0,Mean,Std Dev
Kernel,Unnamed: 1_level_1,Unnamed: 2_level_1
linear,960,34
poly,914,56
rbf,953,37
sigmoid,71,39


Unnamed: 0_level_0,Mean,Std Dev
Max iter,Unnamed: 1_level_1,Unnamed: 2_level_1
1,808,74
6,931,57
11,950,49
16,961,41
21,961,40
26,964,31
31,964,32
36,964,32
41,960,34
46,960,32
