# Decision Tree and Random Forest

In [855]:
import pandas as pd
from abc import ABC, abstractmethod
from collections import Counter
import numpy as np
from sklearn.tree import DecisionTreeClassifier as skl_DTC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier as skl_RFC

### Future Implementations
- visualize_tree method
- Decision Tree Regressor
- Random Forest Regressor

In [856]:
class Classifier(ABC):
    class Tree():
        def __init__(self, feature_index: int = None, threshold: float = None, left: 'Classifier.Tree' = None, right: 'Classifier.Tree' = None, lvalue: int = None, rvalue: int = None):
            """
            A node in the decision tree.
            :param feature_index: Index of the feature to split on.
            :param threshold: Threshold value for the split.
            :param left: Left child node.
            :param right: Right child node.
            :param value: Class label if it's a leaf node.
            """
            self.feature_index = feature_index
            self.threshold = threshold
            self.left = left
            self.right = right
            self.lvalue = lvalue
            self.rvalue = rvalue
            
    def gini_impurity(self, x):
        """
        Calculate the Gini impurity for a given set of labels.

        :param x: A list of labels. (Series, list, DataFrame-column, etc.)
        :return: The Gini impurity value.
        """
        if x.size == 0:
            return 0.0

        counts = np.bincount(x)
        prob_sq = (counts / len(x)) ** 2
        return 1 - prob_sq.sum()
    
    def weighted_average(self, gi1, gi2, w1, w2):
        """Calculate Weighted Gini Impurity Average of two Gini impurity values
        """
        
        return {0:(gi1 * w1)/(w1+w2), 1:(gi2 * w2) / (w1 + w2)}

## Decision Tree

In [857]:
class DecisionTreeClassifier(Classifier):
    def __init__(self, max_depth:int =None, max_features:int = None):
        self.max_depth = max_depth
        self.root = None
        self.max_features = max_features
        
    def __check_GI(self, X, y):
        """
        This method should implement the logic to find the best feature and threshold to split on based on Gini impurity.
        """
        selected_features = np.random.choice(np.arange(X.shape[1]), size=self.max_features, replace=False) if self.max_features else range(X.shape[1])
        
        col_lbl_cts = [np.unique(X[:, i]) for i in selected_features]
        thresholds = [(a[:-1]+a[1:])/2 if a.size>1 else [] for a in col_lbl_cts]
        GIs = {}
        for i in range(len(selected_features)):
            for j in thresholds[i]:
                mask = X[:, selected_features[i]] <= j
                X_l, X_r, y_l, y_r = X[mask], X[~mask], y[mask], y[~mask]
                gi_l = self.gini_impurity(y_l)
                gi_r = self.gini_impurity(y_r)
                
                w1, w2 = len(X_l), len(X_r)
                gi_avg = self.weighted_average(gi_l, gi_r, w1, w2)
                s = gi_avg[0] + gi_avg[1]
                if s not in GIs:
                    GIs[s] =[(selected_features[i], j, gi_l, gi_r, s)]
                else:
                    GIs[s].append((selected_features[i], j, gi_l, gi_r, s))
                #This gi_avg sum is the Gini impurity for the split, which we want to minimize. 
        l = GIs[min(GIs.keys())] if GIs else None
        return l[np.random.choice(len(l), size=1)[0]] if l else (None, None, None, None, None)
    
    def __build_tree(self, X, y, root, gi=None, depth=1):
        """
        Recursively build the decision tree.
        :param root: The current node in the tree.
        :param depth: Current depth of the tree.
        """
        fi, thr, gi_l, gi_r = None, None, None, None
        if self.max_depth==None or (self.max_depth >= depth):
            fi, thr, gi_l, gi_r, GI = self.__check_GI(X, y)
            root.feature_index = fi
            root.threshold = thr
        
        if GI is None:
            return Counter(y).most_common(1)[0][0]
        
        lc_l, lc_r = Counter(y[X[:, fi] <= thr]), Counter(y[X[:, fi] > thr])
        
        if (self.max_depth and self.max_depth == depth)or(gi and (gi-GI)<0.01):
            root.lvalue = lc_l.most_common(1)[0][0]
            root.rvalue = lc_r.most_common(1)[0][0]
        else:
            if gi_l != 0:
                root.left = self.Tree()
                lrt = self.__build_tree(X[X[:, fi] <= thr], y[X[:, fi] <= thr], root.left, GI, depth + 1)
                
                if lrt is not None:
                    root.left = None
                    root.lvalue = lrt
                    
            else:
                root.lvalue = lc_l.most_common(1)[0][0]
            if gi_r != 0:
                root.right = self.Tree()
                rrt = self.__build_tree(X[X[:, fi] > thr], y[X[:, fi] > thr], root.right, GI, depth + 1)
                
                if rrt is not None:
                    root.right = None
                    root.rvalue = rrt
                    
            else:
                root.rvalue = lc_r.most_common(1)[0][0]
            
    def fit(self, X, y):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        if not isinstance(y, pd.Series):
            raise TypeError("y must be a pandas Series")
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")
        self.root = self.Tree()
        rt = self.__build_tree(X.to_numpy(), y.to_numpy(), root=self.root)
        if rt:
            self.root.lvalue = rt
            self.root.rvalue = rt
        
    def predict(self, X):
        y_pred = []
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        for _, row in X.iterrows():
            node = self.root
            if node.threshold is None:
                y_pred.append(node.lvalue)
                continue
            while node.left or node.right:
                if row.iloc[node.feature_index] <= node.threshold:
                    if node.left:
                        node = node.left
                    else:
                        y_pred.append(node.lvalue)
                        break
                else:
                    if node.right:
                        node = node.right
                    else:
                        y_pred.append(node.rvalue)
                        break
            else:
                y_pred.append(node.lvalue if row.iloc[node.feature_index] <= node.threshold else node.rvalue)
                
        return pd.Series(y_pred)

    def score(self, X, y):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        if not isinstance(y, pd.Series):
            raise TypeError("y must be a pandas Series")
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")
        
        y.reset_index(drop=True, inplace=True)
        y_pred = self.predict(X)
        return float((y_pred == y).mean())
    
    def visualize_tree(self):
        """
        Visualize the decision tree.
        This method can be implemented using libraries like graphviz or matplotlib.
        """
        raise NotImplementedError("Visualization method is not implemented yet.")
        

## Random Forest

In [858]:

class RandomForestClassifier(Classifier):
    def __init__(self, n_estimators=100, max_depth=None, max_features=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features #if None, it will use sqrt(n_features)
        self.Dtrees = []
    
    def fit(self, X, y):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        if not isinstance(y, pd.Series):
            raise TypeError("y must be a pandas Series")
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")
        
        for _ in range(self.n_estimators):
            bootstrap_indices = np.random.choice(X.index, size=len(X), replace=True)
            X_bootstrap = X.loc[bootstrap_indices]
            y_bootstrap = y.loc[bootstrap_indices]
            tree = DecisionTreeClassifier(max_depth=self.max_depth, max_features=self.max_features if self.max_features else int(np.sqrt(X.shape[1])))
            tree.fit(X_bootstrap, y_bootstrap)
            self.Dtrees.append(tree)
        
    def predict(self, X):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        
        predictions = np.array([tree.predict(X) for tree in self.Dtrees])
        return pd.Series([Counter(predictions[:, i]).most_common(1)[0][0] for i in range(predictions.shape[1])], index=X.index)

## Evaluation

In [859]:
df = pd.read_csv("data/sample1.csv")
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].map(lambda x: 1 if x == "Yes" else 0)
sample_train_X = df[["Loves Popcorn", "Loves Soda", "Age"]]
sample_train_Y = df["Loves Cool As Ice"]
df

Unnamed: 0,Loves Popcorn,Loves Soda,Age,Loves Cool As Ice
0,0,1,35,1
1,1,1,7,0
2,1,1,38,1
3,1,0,12,0
4,1,0,50,0
5,0,1,18,1
6,0,0,83,0


In [860]:
df.sort_values(by="Age")

Unnamed: 0,Loves Popcorn,Loves Soda,Age,Loves Cool As Ice
1,1,1,7,0
3,1,0,12,0
5,0,1,18,1
0,0,1,35,1
2,1,1,38,1
4,1,0,50,0
6,0,0,83,0


In [861]:
model1 = DecisionTreeClassifier(max_depth=3)
model1.fit(sample_train_X, sample_train_Y)
model1.predict(sample_train_X)

0    1
1    0
2    1
3    0
4    0
5    1
6    0
dtype: int64

In [862]:
model2 = skl_DTC(max_depth=3)
model2.fit(sample_train_X, sample_train_Y)
model2.predict(sample_train_X)

array([1, 0, 1, 0, 0, 1, 0])

### Titanic Datase

In [863]:
titanic_df = pd.read_csv("data/titanic_sample.csv")
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### Data Cleaning - titanic

In [864]:
mod_titanic_df = titanic_df.drop(columns=["Name", "Ticket", "Cabin", "Fare"])
mod_titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,male,22.0,1,0,S
1,2,1,1,female,38.0,1,0,C
2,3,1,3,female,26.0,0,0,S
3,4,1,1,female,35.0,1,0,S
4,5,0,3,male,35.0,0,0,S
...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,S
887,888,1,1,female,19.0,0,0,S
888,889,0,3,female,,1,2,S
889,890,1,1,male,26.0,0,0,C


In [865]:
mod_titanic_df.columns[mod_titanic_df.isnull().any()].to_list()

['Age', 'Embarked']

In [866]:
mod_titanic_df["Age"] = mod_titanic_df["Age"].fillna(mod_titanic_df["Age"].mean())
mod_titanic_df["Embarked"] = mod_titanic_df["Embarked"].fillna(mod_titanic_df["Embarked"].mode()[0])
mod_titanic_df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Embarked       False
dtype: bool

In [867]:
#Encoding categorical variables
embarked_map = {x:i for i, x in enumerate(Counter(mod_titanic_df["Embarked"]).keys())}
sex_map = {x:i for i, x in enumerate(Counter(mod_titanic_df["Sex"]).keys())}
mod_titanic_df["Embarked"] = mod_titanic_df["Embarked"].map(embarked_map)
mod_titanic_df["Sex"] = mod_titanic_df["Sex"].map(sex_map)
mod_titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,0,22.000000,1,0,0
1,2,1,1,1,38.000000,1,0,1
2,3,1,3,1,26.000000,0,0,0
3,4,1,1,1,35.000000,1,0,0
4,5,0,3,0,35.000000,0,0,0
...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.000000,0,0,0
887,888,1,1,1,19.000000,0,0,0
888,889,0,3,1,29.699118,1,2,0
889,890,1,1,0,26.000000,0,0,1


####  Training and Testing

In [868]:
titan_X = mod_titanic_df.drop(columns=["Survived"])
titan_Y = mod_titanic_df["Survived"]
titan_train_x, titan_test_x, titan_train_y, titan_test_y = train_test_split(titan_X, titan_Y, test_size=0.2)

In [869]:
model_titanic = DecisionTreeClassifier(max_depth = 5)
model_titanic.fit(titan_train_x, titan_train_y)
titanic_y_pred = model_titanic.predict(titan_test_x)
titanic_y_pred

0      0
1      1
2      0
3      1
4      0
      ..
174    0
175    0
176    0
177    0
178    0
Length: 179, dtype: int64

In [870]:
skl_model_titanic = skl_DTC(max_depth=5)
skl_model_titanic.fit(titan_train_x, titan_train_y)
skl_titanic_y_pred = pd.Series(skl_model_titanic.predict(titan_test_x))
skl_titanic_y_pred

0      0
1      1
2      0
3      1
4      0
      ..
174    0
175    0
176    0
177    0
178    1
Length: 179, dtype: int64

In [871]:
titan_rfc = RandomForestClassifier(n_estimators=50, max_depth=5)
titan_rfc.fit(titan_train_x, titan_train_y)
titan_rfc_y_pred = titan_rfc.predict(titan_test_x)
titan_rfc_y_pred

836    0
307    1
465    0
767    1
722    0
      ..
493    0
277    0
203    0
721    0
404    1
Length: 179, dtype: int64

In [872]:
skl_titan_rfc = skl_RFC(n_estimators=50, max_depth=5)
skl_titan_rfc.fit(titan_train_x, titan_train_y)
skl_titan_rfc_y_pred = titan_rfc.predict(titan_test_x)
skl_titan_rfc_y_pred

836    0
307    1
465    0
767    1
722    0
      ..
493    0
277    0
203    0
721    0
404    1
Length: 179, dtype: int64

In [873]:
titan_accuracy = accuracy_score(titan_test_y, titanic_y_pred)
skl_titan_accuracy = accuracy_score(titan_test_y, skl_titanic_y_pred)
titan_rfc_accuracy = accuracy_score(titan_test_y, titan_rfc_y_pred)
skl_titan_rfc_accuracy = accuracy_score(titan_test_y, skl_titan_rfc_y_pred)
print(f"Custom Model Accuracy: {titan_accuracy}")
print(f"Sklearn Model Accuracy: {skl_titan_accuracy}")
print(f"Random Forest Model Accuracy: {titan_rfc_accuracy}")
print(f"Sklearn Random Forest Model Accuracy: {skl_titan_rfc_accuracy}")

Custom Model Accuracy: 0.8156424581005587
Sklearn Model Accuracy: 0.8044692737430168
Random Forest Model Accuracy: 0.8156424581005587
Sklearn Random Forest Model Accuracy: 0.8156424581005587


### Forest_Cover_Type - real world dataset

In [874]:
forest_train = pd.read_csv("data/forest_cover_type_prediction/train.csv")
forest_test = pd.read_csv("data/forest_cover_type_prediction/test.csv")
forest_train.info()
forest_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 56 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   Id                                  15120 non-null  int64
 1   Elevation                           15120 non-null  int64
 2   Aspect                              15120 non-null  int64
 3   Slope                               15120 non-null  int64
 4   Horizontal_Distance_To_Hydrology    15120 non-null  int64
 5   Vertical_Distance_To_Hydrology      15120 non-null  int64
 6   Horizontal_Distance_To_Roadways     15120 non-null  int64
 7   Hillshade_9am                       15120 non-null  int64
 8   Hillshade_Noon                      15120 non-null  int64
 9   Hillshade_3pm                       15120 non-null  int64
 10  Horizontal_Distance_To_Fire_Points  15120 non-null  int64
 11  Wilderness_Area1                    15120 non-null  int64
 12  Wild

In [875]:
forest_train.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [876]:
forest_test.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,15121,2680,354,14,0,0,2684,196,214,156,...,0,0,0,0,0,0,0,0,0,0
1,15122,2683,0,13,0,0,2654,201,216,152,...,0,0,0,0,0,0,0,0,0,0
2,15123,2713,16,15,0,0,2980,206,208,137,...,0,0,0,0,0,0,0,0,0,0
3,15124,2709,24,17,0,0,2950,208,201,125,...,0,0,0,0,0,0,0,0,0,0
4,15125,2706,29,19,0,0,2920,210,195,115,...,0,0,0,0,0,0,0,0,0,0


In [877]:
forest_x = forest_train.drop(columns = "Cover_Type")
forest_y = forest_train["Cover_Type"]
forest_trn_x, forest_tst_x, forest_trn_y, forest_tst_y = train_test_split(forest_x, forest_y, test_size=0.2)
forest_x.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,0
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,0
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,0
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,0
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,0


In [878]:
forest_y.head()

0    5
1    5
2    2
3    2
4    5
Name: Cover_Type, dtype: int64

#### Training and Testing

In [879]:
model_forest = DecisionTreeClassifier()
model_forest.fit(forest_trn_x, forest_trn_y)
forest_tst_y_pred = model_forest.predict(forest_tst_x)
forest_tst_y_pred

0       4
1       3
2       5
3       5
4       7
       ..
3019    6
3020    6
3021    5
3022    3
3023    7
Length: 3024, dtype: int64

In [880]:
skl_model_forest = skl_DTC()
skl_model_forest.fit(forest_trn_x, forest_trn_y)
skl_forest_tst_y_pred = pd.Series(skl_model_forest.predict(forest_tst_x))
skl_forest_tst_y_pred

0       4
1       6
2       5
3       5
4       7
       ..
3019    5
3020    3
3021    5
3022    6
3023    7
Length: 3024, dtype: int64

In [881]:
skl_RFC_forest = skl_RFC(n_estimators=1000)
skl_RFC_forest.fit(forest_trn_x, forest_trn_y)
skl_RFC_forest_tst_y_pred = pd.Series(skl_RFC_forest.predict(forest_tst_x))
skl_RFC_forest_tst_y_pred

0       4
1       6
2       5
3       5
4       7
       ..
3019    3
3020    3
3021    5
3022    6
3023    7
Length: 3024, dtype: int64

In [883]:
RFC_forest = RandomForestClassifier(n_estimators=100)
RFC_forest.fit(forest_trn_x, forest_trn_y)
RFC_forest_tst_y_pred = RFC_forest.predict(forest_tst_x)
RFC_forest_tst_y_pred

12669    4
5450     6
8493     5
11637    1
9668     7
        ..
7007     3
7162     3
1252     5
4475     4
9452     7
Length: 3024, dtype: int64

In [884]:
forest_accuracy = accuracy_score(forest_tst_y, forest_tst_y_pred)
skl_forest_accuracy = accuracy_score(forest_tst_y, skl_forest_tst_y_pred)
rfc_forest_accuracy = accuracy_score(forest_tst_y, RFC_forest_tst_y_pred)
skl_rfc_forest_accuracy = accuracy_score(forest_tst_y,skl_RFC_forest_tst_y_pred)
print(f"Custom Model Accuracy: {forest_accuracy}")
print(f"Sklearn Model Accuracy: {skl_forest_accuracy}")
print(f"Random Forest Model Accuracy: {rfc_forest_accuracy}")
print(f"Sklearn Random Forest Model Accuracy: {skl_rfc_forest_accuracy}")

Custom Model Accuracy: 0.6646825396825397
Sklearn Model Accuracy: 0.7886904761904762
Random Forest Model Accuracy: 0.6117724867724867
Sklearn Random Forest Model Accuracy: 0.8693783068783069
