# Exercise 6



### Create a custom Decision Tree Classifier and compute the accuracy.

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class DecisionTreeClassifierCustom:
    """Decision tree classifier built from axis-aligned threshold splits.

    The fitted tree is stored in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``; anything that
    is not a tuple is a leaf holding the predicted class label. Samples with
    ``sample[feature_index] <= threshold`` go to the left subtree.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum number of splits along any root-to-leaf path. ``None`` means
        no limit; ``0`` yields a single leaf predicting the majority class.
    criterion : str, default='entropy'
        ``'gini'`` selects the Gini index; any other value selects entropy.
    """

    def __init__(self, max_depth=None, criterion='entropy'):
        self.max_depth = max_depth
        self.criterion = criterion
        self.tree = None  # populated by fit()

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        # np.unique only reports classes that occur, so no probability is 0
        # and log2 is always finite.
        return -np.sum(probabilities * np.log2(probabilities))

    def gini_index(self, y):
        """Return the Gini impurity of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        return 1 - np.sum(probabilities ** 2)

    def impurity(self, y):
        """Return the impurity of ``y`` under the configured criterion."""
        if self.criterion == 'gini':
            return self.gini_index(y)
        return self.entropy(y)

    def information_gain(self, X_column, y, threshold, parent_impurity=None):
        """Return the impurity reduction from splitting on ``X_column <= threshold``.

        Parameters
        ----------
        X_column : array-like
            Values of a single feature, one per sample.
        y : array-like
            Labels aligned with ``X_column``.
        threshold : scalar
            Samples with a value ``<= threshold`` form the left partition.
        parent_impurity : float, optional
            Pre-computed ``self.impurity(y)``. Lets a caller that evaluates
            many thresholds on the same ``y`` avoid recomputing it.

        Returns
        -------
        float
            Parent impurity minus the size-weighted impurity of both partitions.
        """
        X_column = np.asarray(X_column)
        y = np.asarray(y)
        left_mask = X_column <= threshold
        right_mask = ~left_mask
        n_samples = len(y)
        left_weight = np.sum(left_mask) / n_samples
        right_weight = np.sum(right_mask) / n_samples
        children_impurity = (
            left_weight * self.impurity(y[left_mask])
            + right_weight * self.impurity(y[right_mask])
        )
        if parent_impurity is None:
            parent_impurity = self.impurity(y)
        return parent_impurity - children_impurity

    def best_split(self, X, y):
        """Search every feature/threshold pair for the highest positive gain.

        Returns
        -------
        tuple
            ``(feature_index, threshold, gain)``. When no split has a gain
            above zero the result is ``(None, None, 0)``.
        """
        best_gain = 0
        best_feature = None
        best_threshold = None
        # The parent impurity is the same for every candidate split.
        parent_impurity = self.impurity(y)
        for feature in range(X.shape[1]):
            column = X[:, feature]
            # np.unique returns sorted values. The largest one is skipped:
            # `column <= max` sends every sample left, which always has zero
            # gain and would leave the right child empty.
            for threshold in np.unique(column)[:-1]:
                gain = self.information_gain(column, y, threshold, parent_impurity)
                if gain > best_gain:
                    best_gain, best_feature, best_threshold = gain, feature, threshold
        return best_feature, best_threshold, best_gain

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties).

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        labels, counts = np.unique(y, return_counts=True)
        if labels.size == 0:
            raise ValueError("Cannot create a leaf from an empty set of labels")
        return labels[np.argmax(counts)]

    def build_tree(self, X, y, depth=0):
        """Recursively grow the tree for samples ``X`` with labels ``y``.

        Returns a leaf label when the node is pure, the depth limit has been
        reached, or no split improves impurity; otherwise a
        ``(feature, threshold, left_subtree, right_subtree)`` tuple.
        """
        # Compare against None explicitly so that max_depth=0 is honoured
        # instead of being treated as "unlimited".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(np.unique(y)) == 1:
            return self._majority_label(y)

        feature, threshold, _ = self.best_split(X, y)
        if feature is None:
            # No candidate split had a positive gain.
            return self._majority_label(y)

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        left_subtree = self.build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self.build_tree(X[right_mask], y[right_mask], depth + 1)
        return (feature, threshold, left_subtree, right_subtree)

    def fit(self, X, y):
        """Build the tree from training data and store it in ``self.tree``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of length n_samples

        Returns
        -------
        DecisionTreeClassifierCustom
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` disagree on the number of
            samples, or if there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        self.tree = self.build_tree(X, y)
        return self

    def predict_sample(self, sample, node):
        """Walk from ``node`` down to a leaf for one sample and return its label."""
        # Internal nodes are tuples; everything else is a leaf label.
        while isinstance(node, tuple):
            feature, threshold, left_subtree, right_subtree = node
            node = left_subtree if sample[feature] <= threshold else right_subtree
        return node

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if self.tree is None:
            raise ValueError(
                "This DecisionTreeClassifierCustom instance is not fitted yet; "
                "call fit() before predict()"
            )
        return np.array([self.predict_sample(sample, self.tree) for sample in np.asarray(X)])

# Load dataset and handle missing values.
# dropna() returns a new frame instead of mutating in place, so re-running
# this cell always starts from the file contents.
file_path = "/content/classification.csv"
df = pd.read_csv(file_path).dropna()

# Extract features and target
X = df[['Age', 'EstimatedSalary']].values
y = df['Purchased'].values

# Split dataset BEFORE scaling so the test rows stay unseen during preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: learn mean/std from the training rows only, then apply the
# same transformation to the test rows (avoids leaking test statistics).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train and evaluate the model
dt = DecisionTreeClassifierCustom(max_depth=10, criterion='gini')
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy = np.mean(y_pred == y_test)  # fraction of test rows predicted correctly
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8625


### Compare the accuracy with sklearn's DecisionTreeClassifier.

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the same dataset used for the custom tree and apply the same
# missing-value handling (drop incomplete rows), so both models are trained
# and scored on identical rows.
file_path = "/content/classification.csv"
df = pd.read_csv(file_path).dropna()

# Extract features and target
X = df[['Age', 'EstimatedSalary']].values
y = df['Purchased'].values

# Same split parameters as the custom-tree experiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# criterion='gini' is sklearn's default; it is spelled out to make clear that
# it matches the criterion passed to the custom tree.
dt = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8500
