In [None]:
from sklearn.datasets import load_wine
import numpy as np

# Load the wine dataset bundle and unpack features (X) and labels (Y).
dataset = load_wine()
X, Y = dataset["data"], dataset["target"]

In [None]:
# Feature names stored in the dataset bundle; reused later to label tree nodes.
# The bare expression on the last line displays them as the cell output.
feature_names = dataset["feature_names"]
feature_names

In [None]:
# Dimensions of the feature array X.
X.shape

In [None]:
# Dimensions of the target array Y.
Y.shape

In [None]:
# Number of feature names (expected to match the number of columns of X).
len(feature_names)

In [None]:
# Total number of NaN entries in X; 0 means there are no missing values.
np.isnan(X).sum()


In [None]:
# Report how many samples carry each distinct label found in Y.
classes = np.unique(Y)
for c in classes:
    # Summing the boolean mask counts the samples whose label equals c.
    count = int(np.sum(Y == c))
    print(f"In class {c} there are {count}")

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a default decision tree on the whole dataset; fit() returns the
# estimator itself, so `clf` is the fitted tree and is shown as cell output.
clf = DecisionTreeClassifier().fit(X, Y)
clf

In [None]:
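# pydot is needed by the first plotting method below; uncomment the next line
# to install it into the kernel's environment if the import fails.
#%pip install pydot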

In [None]:

import pydot
from IPython.display import Image
import graphviz
from sklearn.tree import plot_tree, export_graphviz

# First plotting method: DOT text -> pydot graph -> PNG bytes -> inline image.
# out_file=None makes export_graphviz return the DOT source as a string.
dot_code = export_graphviz(clf, out_file=None, feature_names=feature_names)
graph = pydot.graph_from_dot_data(dot_code)
first_graph = graph[0]  # the parser result is indexable; the tree is its first entry
Image(first_graph.create_png())

In [None]:
from graphviz import Source
from sklearn import tree

# Second plotting method: hand the DOT source straight to graphviz.Source,
# which renders itself as the cell output.
dot_source = tree.export_graphviz(clf, out_file=None, feature_names=feature_names)
Source(dot_source)

In [None]:
# Third plotting method: draw the tree with scikit-learn's matplotlib-based
# plot_tree. feature_names is passed so the nodes are labelled with the real
# feature names, consistent with the two Graphviz renderings of the same tree.
plot = plot_tree(clf, feature_names=feature_names)

In [None]:
from sklearn.metrics import accuracy_score

# POINT 4
# Score the tree on the same samples it was fitted on (X, Y). This is the
# training-set accuracy, not an estimate of performance on unseen data.
y_pred = clf.predict(X)
accuracy_score(Y, y_pred)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing.
# - random_state fixes the shuffle so the split (and every metric computed
#   from it in later cells) is identical on each Restart & Run All.
# - stratify=Y keeps the class proportions of Y in both partitions, which
#   matters for a small, multi-class dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.8, random_state=42, stratify=Y
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Train a default tree on the training partition only, then measure its
# accuracy on the held-out test partition (shown as the cell output).
clf_tts = DecisionTreeClassifier().fit(X_train, y_train)
y_pred_tts = clf_tts.predict(X_test)
accuracy_score(y_test, y_pred_tts)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# average=None yields one score per class instead of a single aggregate.
per_class_precision = precision_score(y_test, y_pred_tts, average=None)
per_class_recall = recall_score(y_test, y_pred_tts, average=None)
per_class_f1 = f1_score(y_test, y_pred_tts, average=None)

print(f"Precision score: {per_class_precision}")
print(f"Recall score: {per_class_recall}")
print(f"f1 score: {per_class_f1}")

# Same information (plus support and averages) as a formatted text table.
print(classification_report(y_test, y_pred_tts))


In [None]:
from sklearn.model_selection import ParameterGrid

# Hyperparameter values to combine exhaustively (every combination is tried).
params = {
    "max_depth": [None, 2, 4, 8],
    "splitter": ["best", "random"],
    "criterion": ["gini", "entropy", "log_loss"],
    "min_samples_split": [2, 3],
}

# classifiers[k] and accs[k] describe the k-th configuration of the grid.
classifiers, accs = [], []
for i, config in enumerate(ParameterGrid(params)):
    new_clf = DecisionTreeClassifier(**config).fit(X_train, y_train)
    y_pred = new_clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Classifier {i}: ", end="")
    print(f"Acc: {acc}, Config: {config}")
    classifiers.append(new_clf)
    accs.append(acc)

# NOTE(review): the winner is chosen by its accuracy on X_test, so the
# reported accuracy of the best configuration is measured on the same data
# that selected it.
index_best_clf = np.argmax(accs)
best_clf = classifiers[index_best_clf]
print(f"Best classifier is the classifier {index_best_clf} with hyperparams: {ParameterGrid(params)[index_best_clf]} and accuracy: {accs[index_best_clf]}")

    

In [None]:
# Render the tree picked by the grid search (best_clf) with Graphviz,
# labelling the nodes with the dataset's feature names.
Source(tree.export_graphviz(best_clf, out_file=None, feature_names=feature_names))  # type: ignore

In [None]:
from sklearn.model_selection import KFold

# Fixed seed so the split, the trees (including splitter="random") and the
# resulting scores are the same on every Restart & Run All.
SEED = 42

# Split the dataset into two parts:
# - X_train_valid: used for the k-fold cross-validation
# - X_test: used only for the final test (it is NOT seen by any classifier
#   during the training/validation phases)
# stratify=Y keeps the class proportions of Y in both parts.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X, Y, random_state=SEED, stratify=Y
)
kf = KFold(5)  # 5-fold cross-validation

params = {
    "max_depth": [None, 2, 4, 8],
    "splitter": ["best", "random"],
    "criterion": ["gini", "entropy", "log_loss"],
    "min_samples_split": [2, 3],
}

classifiers = []  # one estimator per configuration, index-aligned with accs
accs = []         # mean validation accuracy per configuration

for i, config in enumerate(ParameterGrid(params)):
    print(f"Classifier {i}: ", end="")

    # Per-fold accuracies of THIS configuration only. The list must start
    # empty for every configuration, otherwise the mean below would also
    # include the folds of all previously evaluated configurations.
    local_accuracy = []
    for train_indices, validation_indices in kf.split(X_train_valid):
        # Fold-local names are used on purpose so that clf, X_train and
        # y_train defined by earlier cells are not overwritten.
        X_fold_train = X_train_valid[train_indices]
        y_fold_train = y_train_valid[train_indices]
        X_fold_valid = X_train_valid[validation_indices]
        y_fold_valid = y_train_valid[validation_indices]

        fold_clf = DecisionTreeClassifier(random_state=SEED, **config)
        fold_clf.fit(X_fold_train, y_fold_train)
        y_pred = fold_clf.predict(X_fold_valid)
        local_accuracy.append(accuracy_score(y_fold_valid, y_pred))

    avg = np.array(local_accuracy).mean()
    print(f"Average Accuracy of {i}-classifier: {avg}. Config: {config}")

    # Store an estimator built from the CURRENT configuration so that
    # classifiers[k] really corresponds to accs[k]; it is fitted later only
    # if it turns out to be the best one.
    classifiers.append(DecisionTreeClassifier(random_state=SEED, **config))
    accs.append(avg)


# Pick the configuration with the highest mean validation accuracy and
# retrain it on the whole training+validation part before the final test.
i_best = np.array(accs).argmax()
best_k_clf = classifiers[i_best]
best_k_clf.fit(X_train_valid, y_train_valid)

pred = best_k_clf.predict(X_test)
print(f"Accuracy on new data of Best Classifier ({i_best}): {accuracy_score(y_test, pred)}")
Source(tree.export_graphviz(best_k_clf, out_file=None, feature_names=feature_names))  # type: ignore
