# Q-15 WAP to implement Decision Trees through  scratch and library. Perform hyper-parameter tuning

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
iris = load_iris()
X = iris.data
y = iris.target

In [3]:
class TreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # For leaf nodes

class DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples_per_class = [np.sum(y == i) for i in range(self.n_classes)]
        predicted_class = np.argmax(n_samples_per_class)
        node = TreeNode(value=predicted_class)

        if depth < self.max_depth:
            best_gini = np.inf
            best_criteria = None
            best_sets = None

            for feature_index in range(self.n_features):
                thresholds = np.unique(X[:, feature_index])
                for threshold in thresholds:
                    left_indices = np.where(X[:, feature_index] <= threshold)[0]
                    right_indices = np.where(X[:, feature_index] > threshold)[0]
                    if len(left_indices) == 0 or len(right_indices) == 0:
                        continue
                    gini = self._gini_index(y[left_indices], y[right_indices])
                    if gini < best_gini:
                        best_gini = gini
                        best_criteria = (feature_index, threshold)
                        best_sets = (left_indices, right_indices)

            if best_gini == np.inf:
                return node

            left = self._grow_tree(X[best_sets[0], :], y[best_sets[0]], depth + 1)
            right = self._grow_tree(X[best_sets[1], :], y[best_sets[1]], depth + 1)
            node = TreeNode(feature_index=best_criteria[0], threshold=best_criteria[1], left=left, right=right)

        return node

    def _gini_index(self, left_y, right_y):
        p_left = len(left_y) / (len(left_y) + len(right_y))
        p_right = len(right_y) / (len(left_y) + len(right_y))
        gini = 1.0 - (p_left ** 2 + p_right ** 2)
        return gini

    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        return [self._predict(x) for x in X]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
dt_clf_scratch = DecisionTreeClassifier(max_depth=3)

In [6]:
dt_clf_scratch.fit(X_train, y_train)

In [7]:
y_pred_scratch = dt_clf_scratch.predict(X_test)

In [8]:
accuracy_scratch = np.mean(y_pred_scratch == y_test)
print("Test set accuracy (from scratch):", accuracy_scratch)

Test set accuracy (from scratch): 0.3


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
dt_clf = DecisionTreeClassifier()

In [11]:
param_grid = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [12]:
grid_search = GridSearchCV(dt_clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator <__main__.DecisionTreeClassifier object at 0x000001333149B510> does not.

In [13]:
print("Best hyperparameters:", grid_search.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [14]:
best_dt_clf = grid_search.best_estimator_
accuracy = best_dt_clf.score(X_test, y_test)
print("Test set accuracy:", accuracy)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'