In [1]:
import numpy as np
from collections import Counter


In [3]:
class CustomDecisionTree:
    """Binary decision-tree classifier that splits on information gain (entropy).

    The fitted tree is stored in ``self.tree`` as nested dicts:

    * leaf node:     ``{"class": label}``
    * internal node: ``{"feature": int, "threshold": value,
      "left": node, "right": node}`` where samples with
      ``x[feature] <= threshold`` go left and the rest go right.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum depth of the tree. ``None`` grows the tree until every leaf
        is pure or no valid split exists.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix. Anything ``np.asarray`` accepts (lists,
            ndarrays, DataFrames) is converted up front.
        y : array-like of shape (n_samples,)
            Class labels. Any label type ``np.unique`` can sort is accepted
            (ints, negative ints, floats, strings).

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ, or
            the training set is empty.
        """
        # Coerce once so the private helpers can rely on ndarray indexing
        # (X[:, j], boolean masks, .shape).
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if y.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty training set")
        self.tree = self._build_tree(X, y)

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label collection ``y``.

        An empty collection has entropy 0.
        """
        if len(y) == 0:
            return 0.0
        # np.unique (unlike np.bincount) works for negative, float and string
        # labels, and only reports labels that actually occur, so every
        # probability is > 0 and no epsilon is needed inside the log.
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return -np.sum(probs * np.log2(probs))

    def _information_gain(self, y, y_left, y_right):
        """Return the entropy reduction from splitting ``y`` into two children.

        Child entropies are weighted by their size relative to ``len(y)``.
        An empty parent yields a gain of 0.
        """
        n_parent = len(y)
        if n_parent == 0:
            return 0.0
        w_left = len(y_left) / n_parent
        w_right = len(y_right) / n_parent
        children_entropy = (
            w_left * self._entropy(y_left) + w_right * self._entropy(y_right)
        )
        return self._entropy(y) - children_entropy

    def _best_split(self, X, y):
        """Search every feature/threshold pair for the highest information gain.

        Candidate thresholds are the observed values of each feature.
        Returns ``(feature_index, threshold)``, or ``(None, None)`` when no
        threshold separates the samples into two non-empty groups.
        Ties keep the first candidate found (strict ``>`` comparison).
        """
        best_gain = -np.inf
        best_feature, best_threshold = None, None

        for feature in range(X.shape[1]):
            column = X[:, feature]  # hoisted: invariant across thresholds
            # The largest unique value always leaves the right side empty,
            # so it is never a usable threshold and is skipped outright.
            for t in np.unique(column)[:-1]:
                y_left = y[column <= t]
                y_right = y[column > t]
                # Still needed: NaN values compare False on both sides and
                # can leave one child empty.
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gain = self._information_gain(y, y_left, y_right)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = t

        return best_feature, best_threshold

    def _majority_leaf(self, y):
        """Return a leaf node predicting the most common label in ``y``."""
        # Counter breaks ties by first occurrence in y.
        return {"class": Counter(y).most_common(1)[0][0]}

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the tree and return its root node (a dict).

        Recursion stops when the node is pure, ``max_depth`` is reached, or
        no valid split exists; in the last two cases the leaf predicts the
        majority class.
        """
        # Pure node: every label equals the first one.
        if (y == y[0]).all():
            return {"class": y[0]}

        if self.max_depth is not None and depth >= self.max_depth:
            return self._majority_leaf(y)

        feature, threshold = self._best_split(X, y)
        if feature is None:
            return self._majority_leaf(y)

        left_mask = X[:, feature] <= threshold
        right_mask = X[:, feature] > threshold

        return {
            "feature": feature,
            "threshold": threshold,
            "left": self._build_tree(X[left_mask], y[left_mask], depth + 1),
            "right": self._build_tree(X[right_mask], y[right_mask], depth + 1),
        }

    def _predict_one(self, x, tree):
        """Route a single sample ``x`` from node ``tree`` down to a leaf label."""
        node = tree
        # Walk iteratively instead of recursing; one dict hop per level.
        while "class" not in node:
            go_left = x[node["feature"]] <= node["threshold"]
            node = node["left"] if go_left else node["right"]
        return node["class"]

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError(
                "This CustomDecisionTree is not fitted yet; call fit() first"
            )
        # Convert so that iteration yields rows (iterating a DataFrame
        # directly would yield column names instead).
        X = np.asarray(X)
        return np.array([self._predict_one(row, self.tree) for row in X])


In [4]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
# Load the iris dataset and hold out 20% of the samples for testing.
# random_state=42 is passed so the split is the same on every run.
# NOTE: `data`, `X`, `y` and the train/test names are rebound by the wine and
# diabetes cells further down; re-run this cell before re-running the iris
# model cells, otherwise they will silently use a different dataset.
data = load_iris()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# Fit the from-scratch tree (depth capped at 3) on the current training split
# and report plain accuracy on the held-out test split.
custom_tree = CustomDecisionTree(max_depth=3)
custom_tree.fit(X_train, y_train)

y_pred_custom = custom_tree.predict(X_test)
accuracy_custom = accuracy_score(y_test, y_pred_custom)

print(f"Custom Decision Tree Accuracy: {accuracy_custom:.4f}")

Custom Decision Tree Accuracy: 0.9667


In [7]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
# Reference model: scikit-learn's tree with the same depth cap (max_depth=3),
# fitted and scored on the same split as the custom tree for comparison.
sk_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
sk_tree.fit(X_train, y_train)

y_pred_sk = sk_tree.predict(X_test)
accuracy_sk = accuracy_score(y_test, y_pred_sk)

print(f"Scikit-learn Decision Tree Accuracy: {accuracy_sk:.4f}")

Scikit-learn Decision Tree Accuracy: 1.0000


In [9]:
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [10]:
# Switch to the wine dataset with the same 80/20 split settings.
# NOTE: this overwrites X, y, X_train, X_test, y_train and y_test from the
# iris cells above; everything below this point operates on wine data.
wine = load_wine()
X = wine.data
y = wine.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
# Baseline: a single decision tree with default hyperparameters, scored with
# weighted F1 on the held-out test split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
f1_dt = f1_score(y_test, y_pred_dt, average="weighted")

print(f"Decision Tree F1 Score: {f1_dt:.4f}")

Decision Tree F1 Score: 0.9440


In [12]:
# Same evaluation as the single tree above, but with a default random forest,
# so the two weighted-F1 numbers are directly comparable.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
f1_rf = f1_score(y_test, y_pred_rf, average="weighted")

print(f"Random Forest F1 Score: {f1_rf:.4f}")

Random Forest F1 Score: 1.0000


In [13]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Exhaustive search over 3 x 3 x 3 = 27 random-forest configurations.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10]
}

# Each configuration is scored with 5-fold cross-validation using weighted F1.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring="f1_weighted"
)

# Uses whatever X_train / y_train are currently bound (the wine split when the
# notebook is run top to bottom).
grid.fit(X_train, y_train)

# NOTE: best_score_ is the cross-validated score on the training data for the
# winning configuration; it is not a test-set score and is not directly
# comparable to the test-set F1 values printed by the earlier cells.
print("Best Parameters:", grid.best_params_)
print("Best F1 Score:", grid.best_score_)

Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best F1 Score: 0.9782952128219708


In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [19]:
from sklearn.datasets import load_diabetes

# Switch to the diabetes regression dataset with the same 80/20 split settings.
# NOTE: this rebinds `data`, X, y and the train/test names once more; the
# classification cells above must not be re-run after this cell without
# re-running their own data-loading cell first.
data = load_diabetes()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [20]:
# Baseline regressor: a single decision tree with default hyperparameters,
# evaluated by mean squared error on the held-out test split.
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)

pred_dt = dt_reg.predict(X_test)
print("Decision Tree MSE:", mean_squared_error(y_test, pred_dt))


Decision Tree MSE: 4976.797752808989


In [21]:
# Search space: 3 x 4 x 3 = 36 possible random-forest configurations.
param_dist = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10]
}

# Randomized search tries only n_iter=10 of those configurations, each scored
# with 5-fold cross-validation. The scorer is *negated* MSE because the search
# maximizes its score. random_state=42 is set on both the search and the
# forest for repeatable results.
random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_dist,
    n_iter=10,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42
)

random_search.fit(X_train, y_train)

# Evaluate the winning estimator on the held-out test split so the number is
# comparable to the single-tree MSE printed by the previous cell.
best_rf = random_search.best_estimator_
pred_rf = best_rf.predict(X_test)

print("Best Parameters:", random_search.best_params_)
print("Random Forest MSE:", mean_squared_error(y_test, pred_rf))


Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 5}
Random Forest MSE: 2860.4202872607243
