Decision Tree Classifier Algorithm :

In [None]:
# Necessary libraries:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Calculate Entropy :
def entropy(y):
    """Return the Shannon entropy, in bits, of the class labels in ``y``.

    ``y`` is passed to ``np.bincount``, so it must hold non-negative integers.
    Classes with a zero share are skipped so ``log2(0)`` is never evaluated.
    """
    class_counts = np.bincount(y)
    total = len(y)

    terms = []
    for count in class_counts:
        share = count / total
        if share > 0:
            terms.append(share * np.log2(share))

    return -np.sum(terms)

Find the best decision stump:

In [None]:
def best_decision_stump(X, y):
    """Find the single-feature threshold split with the highest information gain.

    Every unique value of every feature is tried as a threshold. Samples with
    ``X[:, feature] <= threshold`` go to the left side, the rest to the right.
    Candidate splits that leave either side empty are skipped.

    Parameters
    ----------
    X : 2-D array, indexed as ``X[:, feature_index]``.
    y : 1-D array of non-negative integer class labels (required by
        ``np.bincount``), one per row of ``X``.

    Returns
    -------
    tuple
        ``(best_feature, best_threshold, best_value_left, best_value_right)``:
        the column index and threshold of the best split, and the majority
        class on each side. All four are ``None`` when no candidate split
        separates the data (for example when every feature is constant).
    """
    best_gain = -1
    best_feature = None
    best_threshold = None
    best_value_left = None
    best_value_right = None

    n_samples = len(y)
    # The parent entropy is the same for every candidate split, so compute it
    # once instead of once per threshold.
    parent_entropy = entropy(y)

    # Visit each input feature in turn.
    for feature_index in range(X.shape[1]):
        column = X[:, feature_index]

        # The unique values of the feature are the candidate thresholds.
        for threshold in np.unique(column):
            left_mask = column <= threshold
            left_y, right_y = y[left_mask], y[~left_mask]

            # A split with an empty side carries no information.
            if len(left_y) == 0 or len(right_y) == 0:
                continue

            # Weight each side's entropy by its share of the samples.
            left_weight = len(left_y) / n_samples
            right_weight = 1 - left_weight
            gain = parent_entropy - (
                left_weight * entropy(left_y) + right_weight * entropy(right_y)
            )

            # Strict comparison: on ties the first split found is kept.
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_index
                best_threshold = threshold
                best_value_left = np.bincount(left_y).argmax()
                best_value_right = np.bincount(right_y).argmax()

    return best_feature, best_threshold, best_value_left, best_value_right

In [None]:
# Prediction with the decision stump:
def predict(X, feature, threshold, value_left, value_right):
    """Apply a decision stump to every row of ``X``.

    Rows whose value in column ``feature`` is ``<= threshold`` receive
    ``value_left``; all other rows receive ``value_right``.
    """
    goes_left = X[:, feature] <= threshold
    return np.where(goes_left, value_left, value_right)

In [None]:
# Load the Iris dataset: X is the feature matrix, y the class labels.
iris = load_iris()
X = iris.data
y = iris.target

In [None]:
# Split the dataset into a training set and a test set (20% held out).
# The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.flatten(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Find the best split parameters for the decision stump on the training set,
# then print the chosen feature index, threshold and the class predicted on
# each side of the split.
best_feature, best_threshold, value_left, value_right = best_decision_stump(X_train, y_train)
print("best_feature :" ,best_feature)
print("best_threshold :" ,best_threshold)
print("value_left :" ,value_left)
print("value_right :" ,value_right)

In [None]:
# Predict the test-set labels with the fitted stump and print them next to
# the true labels for a quick visual comparison.
predictions = predict(X_test, best_feature, best_threshold, value_left, value_right)
print("Y_test      :" ,y_test)
print("predictions :" ,predictions)

In [None]:
# Evaluate the decision stump on the held-out test set.
# The F1 score is averaged over the classes, weighted by class support.
print(f"Decision Stump Accuracy: {accuracy_score(y_test, predictions):.3f}")
print(f"Decision Stump F1-Score: {f1_score(y_test, predictions, average='weighted'):.3f}")

                                                     Using Scikit-learn's Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Depth-2 tree that uses entropy as its split criterion.
# random_state is fixed because scikit-learn permutes the features at every
# split, so when several splits are equally good the chosen one (and therefore
# the fitted tree and its scores) can change from run to run without a seed.
dt_sklearn = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=42)
dt_sklearn.fit(X_train, y_train)

In [None]:
# Predict the test-set labels with the fitted scikit-learn decision tree.
dt_skl_predictions = dt_sklearn.predict(X_test)

In [None]:
# Evaluate the scikit-learn decision tree on the test set, using the same
# metrics as for the hand-written stump.
print(f"Sklearn DT Accuracy: {accuracy_score(y_test, dt_skl_predictions):.3f}")
print(f"Sklearn DT F1-Score: {f1_score(y_test, dt_skl_predictions, average='weighted'):.3f}")

In [None]:
plt.figure(figsize=(10, 6))

plot_tree(
    dt_sklearn,
    feature_names=list(iris.feature_names),
    class_names=list(iris.target_names),
    filled=True,
    rounded=True,
)
# Set the title only after plot_tree(): when no `ax` is passed, plot_tree draws
# on the current axes and clears them first, which wipes a title set earlier.
plt.title("Decision Tree Visualization - Sklearn")
plt.show()

In the plot above, you can see the decision rules that have been made for splitting the dataset.

                                                        ((Random Forest Algorithm))

Quick Review:

1) Each tree is trained on random subsets of the data and features, and their predictions are averaged for better performance.
2) It’s robust, reduces variance, and works well for both classification and regression.

Steps to Create a Random Forest:
A  :Bagging: randomly sample subsets of the data (with replacement) for each tree.
B  :For each tree, randomly select a subset of features at each split.
C  :Train a decision tree on each sampled dataset.
D  :Repeat steps A–C to build multiple decision trees.
E  :Aggregating: For predictions, aggregate the outputs of all the trees (e.g., majority vote for classification or averaging for regression).

In [None]:
import numpy as np
from sklearn.utils import resample
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier  # درخت تصمیم پایه

def bootstrap_samples(X, y, random_state=None):
    """Draw a bootstrap sample of the dataset.

    Rows of ``X`` and ``y`` are sampled together, with replacement, until the
    sample has as many rows as ``X``.

    Parameters
    ----------
    X, y : the features and labels to resample, with matching first dimension.
    random_state : optional seed (or RNG) forwarded to
        ``sklearn.utils.resample``. The default ``None`` keeps the original
        unseeded behaviour; pass an int for a reproducible sample.

    Returns
    -------
    The resampled ``X`` and ``y``, in that order.
    """
    return resample(
        X,
        y,
        n_samples=len(X),
        replace=True,
        random_state=random_state,
    )

def fit_random_forest(X, y, n_trees=7, max_depth=1, max_features="sqrt"):
    """Fit a random forest of decision trees to the dataset ``(X, y)``.

    Each tree is trained on its own bootstrap sample of the data and, at every
    split, considers only a random subset of the features. Without the feature
    subsampling the ensemble would be plain bagging rather than a random
    forest.

    Parameters
    ----------
    X, y : training features and labels.
    n_trees : number of trees to train.
    max_depth : maximum depth of each tree; the default of 1 trains stumps.
    max_features : number of features considered at each split, forwarded to
        ``DecisionTreeClassifier``. ``"sqrt"`` is the usual random-forest
        choice; pass ``None`` to let every tree see all features.

    Returns
    -------
    list
        The fitted ``DecisionTreeClassifier`` instances, in training order.
    """
    trees = []
    for _ in range(n_trees):
        # Every tree gets its own bootstrap sample of the training data.
        X_sample, y_sample = bootstrap_samples(X, y)
        tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
        tree.fit(X_sample, y_sample)
        trees.append(tree)
    return trees

def predict_random_forest(trees, X):
    """Predict a class label for every sample in ``X``.

    Each tree in ``trees`` predicts all samples; the per-tree predictions are
    stacked (one row per tree) and combined by ``majority_vote``.
    """
    per_tree = []
    for tree in trees:
        per_tree.append(tree.predict(X))
    votes = np.array(per_tree)
    return majority_vote(votes)

def majority_vote(predictions):
    """Return the most frequent label along axis 0 of ``predictions``.

    ``predictions`` must hold non-negative integers, as required by
    ``np.bincount``. When several labels are tied, the smallest one wins
    because ``argmax`` returns the first maximum.
    """

    def most_common_label(column):
        # Count the occurrences of each label and pick the most frequent one.
        return np.bincount(column).argmax()

    return np.apply_along_axis(most_common_label, 0, predictions)

In [None]:
# Load the breast cancer dataset: X is the feature matrix, y the class labels.
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

In [None]:
# Split the data into training and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the custom Random Forest model.
trees = fit_random_forest(X_train, y_train)

# Predict the test-set labels with the custom Random Forest model.
rf_custom_predictions = predict_random_forest(trees, X_test)

In [None]:
# Display the results of the custom Random Forest on the test set.
print(f"Custom RF Accuracy: {accuracy_score(y_test, rf_custom_predictions):.3f}")
print(f"Custom RF F1-Score: {f1_score(y_test, rf_custom_predictions, average='weighted'):.3f}")

Using Scikit-learn's Random Forest Classifier :

After implementing a custom Random Forest from scratch, we will now compare the results by utilizing the Scikit-learn library,
which provides a highly optimized implementation of random forests.
This allows us to observe the behavior of random forests without dealing with the complexities of manually building one.
We’ll also visualize the decision trees used in the random forest to gain insight into how it makes decisions.
Make sure to try different combinations of hyperparameters (n_estimators, max_depth, min_samples_split, etc.).

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# scikit-learn forest configured like the custom one: 7 trees of depth 1.
# The fixed random_state makes the fitted forest reproducible.
rf_sklearn = RandomForestClassifier(
    n_estimators=7,
    max_depth=1,
    criterion="entropy",
    random_state=42,
)
rf_sklearn.fit(X_train, y_train.ravel())
rf_skl_predictions = rf_sklearn.predict(X_test)


In [None]:
# Evaluate the scikit-learn random forest on the test set, using the same
# metrics as for the custom forest.
print(f"Sklearn RF Accuracy: {accuracy_score(y_test, rf_skl_predictions):.3f}")
print(f"Sklearn RF F1-Score: {f1_score(y_test, rf_skl_predictions, average='weighted'):.3f}")

In [None]:
# Draw every tree of the scikit-learn forest in its own figure.
for idx, tree in enumerate(rf_sklearn.estimators_):
    plt.figure(figsize=(8, 6))
    plot_tree(
        tree,
        filled=True,
        feature_names=list(breast_cancer.feature_names),
        class_names=list(breast_cancer.target_names),
    )
    # Trees are numbered from 1 in the titles.
    plt.title(f"Random Forest Visualization - Tree {idx + 1}")
    plt.tight_layout()
    plt.show()

In [None]:
# Now pick one test sample to illustrate how the individual trees vote.


In [None]:
# Row index (into X_test) of the single test sample inspected below.
sample_idx = 112

# Show every column so all features of the sample are visible.
pd.set_option('display.max_columns', None)
# Index with sample_idx instead of repeating the literal, so changing the
# index above changes the displayed sample as well.
pd.DataFrame(X_test[sample_idx].reshape(1, -1), columns=breast_cancer.feature_names).head()

In [None]:
# Reshape the selected sample to a single-row 2-D array, as predict() expects.
# sample_idx is used for both the votes and the final prediction so the two
# always describe the same sample.
sample = X_test[sample_idx].reshape(1, -1)

# Class predicted by each individual tree, as one scalar per tree.
votes = [tree.predict(sample)[0] for tree in rf_sklearn.estimators_]
final_prediction = rf_sklearn.predict(sample)[0]

# Trees are numbered from 1 on the x axis.
tree_numbers = list(range(1, len(rf_sklearn.estimators_) + 1))

plt.figure(figsize=(8, 4))
plt.scatter(tree_numbers, votes, s=100, alpha=0.7, label='Votes')
# NOTE: RandomForestClassifier.predict averages the trees' class probabilities,
# so this line can differ from the plain majority of the votes plotted above.
plt.axhline(y=final_prediction, color='r', linestyle='--', label='Final Prediction')
plt.yticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Decision Trees')
plt.ylabel('Votes')
plt.title(f'Random Forest: Votes from Each DT for Sample #{sample_idx + 1}')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()