In [None]:
import numpy as np
from collections import Counter

---
## Random Forest for Classification task

In [None]:
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or n_labels == 1:
            leaf_value = self._most_common_label(y)
            return {'type': 'leaf', 'value': leaf_value}

        # Find the best split
        best_split = self._best_split(X, y)
        if best_split['impurity'] == 0:
            return {'type': 'leaf', 'value': best_split['class']}

        # Recursive split
        left_indices = np.where(X[:, best_split['feature_index']] <= best_split['threshold'])[0]
        right_indices = np.where(X[:, best_split['feature_index']] > best_split['threshold'])[0]

        left_subtree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return {'type': 'split', 'feature_index': best_split['feature_index'],
                'threshold': best_split['threshold'],
                'left': left_subtree, 'right': right_subtree}

    def _best_split(self, X, y):
        n_samples, n_features = X.shape
        best_split = {'impurity': float('inf')}

        for feature_index in range(n_features):
            feature_values = np.unique(X[:, feature_index])
            for threshold in feature_values:
                left_indices = np.where(X[:, feature_index] <= threshold)[0]
                right_indices = np.where(X[:, feature_index] > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                impurity = self._calculate_impurity(y, left_indices, right_indices)
                if impurity < best_split['impurity']:
                    best_split = {'feature_index': feature_index, 'threshold': threshold,
                                  'impurity': impurity, 'class': self._most_common_label(y)}

        return best_split

    def _calculate_impurity(self, y, left_indices, right_indices):
        p_left = len(left_indices) / len(y)
        p_right = len(right_indices) / len(y)
        return p_left * self._gini_impurity(y[left_indices]) + p_right * self._gini_impurity(y[right_indices])

    def _gini_impurity(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return 1 - np.sum(p ** 2)

    def _most_common_label(self, y):
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        return np.array([self._predict_sample(x, self.tree) for x in X])

    def _predict_sample(self, x, node):
        if node['type'] == 'leaf':
            return node['value']
        if x[node['feature_index']] <= node['threshold']:
            return self._predict_sample(x, node['left'])
        else:
            return self._predict_sample(x, node['right'])

class RandomForest:
    def __init__(self, n_estimators=100, max_depth=None, max_features=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.estimators = [DecisionTree(max_depth=self.max_depth) for _ in range(self.n_estimators)]

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.feature_indices = []

        for estimator in self.estimators:
            if self.max_features is not None:
                indices = np.random.choice(n_features, size=self.max_features, replace=False)
            else:
                indices = np.arange(n_features)
            self.feature_indices.append(indices)
            estimator.fit(X[:, indices], y)

    def predict(self, X):
        return np.array([self._predict_sample(x) for x in X])

    def _predict_sample(self, x):
        predictions = [estimator.predict([x[indices]])[0] for estimator, indices in zip(self.estimators, self.feature_indices)]
        return Counter(predictions).most_common(1)[0][0]


### Test on a sample classification data

In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Breast Cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
clf = RandomForest(n_estimators=100, max_depth=5)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9298245614035088


---
## Random Forest for Regression task

In [8]:
class DecisionTreeRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape

        # Stopping criteria
        if self.max_depth is not None and depth >= self.max_depth:
            return {'type': 'leaf', 'value': np.mean(y)}

        # Find the best split
        best_split = self._best_split(X, y)
        if best_split['mse'] == float('inf'):
            return {'type': 'leaf', 'value': np.mean(y)}

        # Recursive split
        left_indices = np.where(X[:, best_split['feature_index']] <= best_split['threshold'])[0]
        right_indices = np.where(X[:, best_split['feature_index']] > best_split['threshold'])[0]

        left_subtree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return {'type': 'split', 'feature_index': best_split['feature_index'],
                'threshold': best_split['threshold'],
                'left': left_subtree, 'right': right_subtree}

    def _best_split(self, X, y):
        n_samples, n_features = X.shape
        best_split = {'mse': float('inf')}

        for feature_index in range(n_features):
            feature_values = np.unique(X[:, feature_index])
            for threshold in feature_values:
                left_indices = np.where(X[:, feature_index] <= threshold)[0]
                right_indices = np.where(X[:, feature_index] > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                mse = self._calculate_mse(y[left_indices], y[right_indices])
                if mse < best_split['mse']:
                    best_split = {'feature_index': feature_index, 'threshold': threshold, 'mse': mse}

        return best_split

    def _calculate_mse(self, y_left, y_right):
        return np.mean((y_left - np.mean(y_left))**2) + np.mean((y_right - np.mean(y_right))**2)

    def predict(self, X):
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _predict_tree(self, x, tree):
        if tree['type'] == 'leaf':
            return tree['value']
        if x[tree['feature_index']] <= tree['threshold']:
            return self._predict_tree(x, tree['left'])
        else:
            return self._predict_tree(x, tree['right'])


class RandomForestRegressor:
    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            indices = np.random.choice(len(X), len(X), replace=True)
            tree.fit(X[indices], y[indices])
            self.trees.append(tree)

    def predict(self, X):
        return np.mean([tree.predict(X) for tree in self.trees], axis=0)


### Test on a sample regression data

In [13]:
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error

# Load diabetes dataset
X, y = load_diabetes(return_X_y=True)
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=5)
rf_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_regressor.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 0.036820513601105415


**Note:** An MSE of 0.037 from the Random Forest regression model signifies minimal prediction error, implying close alignment between predicted and actual values. We compare this result with the MSE of 2751.53 from the gradient boosting model indicates substantially higher prediction error. Therefore, the Random Forest model outperforms the gradient boosting model in terms of predictive accuracy for this diabetes dataset.