# Decision Tree

In [802]:
import pandas as pd
from abc import ABC, abstractmethod
from collections import Counter
import numpy as np
from sklearn.tree import DecisionTreeClassifier as skl_DTC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### Future Implementations
- Linear Regression or some other way for missing data
- visualize_tree method
- Random Forest Classifier
- Decision Tree Regressor
- Random Forest Regressor

In [803]:
class Classifier(ABC):
    class Tree():
        def __init__(self, feature_index: int = None, threshold: float = None, left: 'Classifier.Tree' = None, right: 'Classifier.Tree' = None, lvalue: int = None, rvalue: int = None):
            """
            A node in the decision tree.
            :param feature_index: Index of the feature to split on.
            :param threshold: Threshold value for the split.
            :param left: Left child node.
            :param right: Right child node.
            :param value: Class label if it's a leaf node.
            """
            self.feature_index = feature_index
            self.threshold = threshold
            self.left = left
            self.right = right
            self.lvalue = lvalue
            self.rvalue = rvalue
            
    def gini_impurity(self, x):
        """
        Calculate the Gini impurity for a given set of labels.

        :param x: A list of labels. (Series, list, DataFrame-column, etc.)
        :return: The Gini impurity value.
        """
        if x.size == 0:
            return 0.0

        counts = np.bincount(x)
        prob_sq = (counts / len(x)) ** 2
        return 1 - prob_sq.sum()
    
    def weighted_average(self, gi1, gi2, w1, w2):
        """Calculate Weighted Gini Impurity Average of two Gini impurity values
        """
        
        return {0:(gi1 * w1)/(w1+w2), 1:(gi2 * w2) / (w1 + w2)}

In [804]:
class DecisionTreeClassifier(Classifier):
    def __init__(self, max_depth:int =None):
        self.max_depth = max_depth
        self.root = None
        
    def __check_GI(self, X, y):
        """
        This method should implement the logic to find the best feature and threshold to split on based on Gini impurity.
        """
        col_lbl_cts =[np.unique(X[:, i]) for i in range(X.shape[1])]
        thresholds = [(a[:-1]+a[1:])/2 if a.size>1 else [] for a in col_lbl_cts]
        GIs = {}
        for i in range(len(thresholds)):
            for j in thresholds[i]:
                mask = X[:, i] <= j
                X_l, X_r, y_l, y_r = X[mask], X[~mask], y[mask], y[~mask]
                gi_l = self.gini_impurity(y_l)
                gi_r = self.gini_impurity(y_r)
                
                w1, w2 = len(X_l), len(X_r)
                gi_avg = self.weighted_average(gi_l, gi_r, w1, w2)
                s = gi_avg[0] + gi_avg[1]
                if s not in GIs:
                    GIs[s] =[(i, j, gi_l, gi_r, s)]
                else:
                    GIs[s].append((i, j, gi_l, gi_r, s))
                #This gi_avg sum is the Gini impurity for the split, which we want to minimize. 
        l = GIs[min(GIs.keys())]
        return l[np.random.choice(len(l), size=1)[0]] if GIs else (None, None, None, None, None)
    
    def __build_tree(self, X, y, root, gi=None, depth=1):
        """
        Recursively build the decision tree.
        :param root: The current node in the tree.
        :param depth: Current depth of the tree.
        """
        fi, thr, gi_l, gi_r, lc_l, lc_r = None, None, None, None, None, None
        if self.max_depth >= depth:
            fi, thr, gi_l, gi_r, GI = self.__check_GI(X, y)
            root.feature_index = fi
            root.threshold = thr
        
        lc_l, lc_r = Counter(y[X[:, fi] <= thr]), Counter(y[X[:, fi] > thr])
        
        if (self.max_depth == depth)or(gi and (gi-GI)<0.005):
            root.lvalue = lc_l.most_common(1)[0][0]
            root.rvalue = lc_r.most_common(1)[0][0]
        else:
            if gi_l != 0:
                root.left = self.Tree()
                self.__build_tree(X[X[:, fi] <= thr], y[X[:, fi] <= thr], root.left, GI, depth + 1)
            else:
                root.lvalue = lc_l.most_common(1)[0][0]
            if gi_r != 0:
                root.right = self.Tree()
                self.__build_tree(X[X[:, fi] > thr], y[X[:, fi] > thr], root.right, GI, depth + 1)
            else:
                root.rvalue = lc_r.most_common(1)[0][0]
            
    def fit(self, X, y):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        if not isinstance(y, pd.Series):
            raise TypeError("y must be a pandas Series")
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")
        self.root = self.Tree()
        self.__build_tree(X.to_numpy(), y.to_numpy(), root=self.root)
        
    def predict(self, X):
        y_pred = []
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        for _, row in X.iterrows():
            node = self.root
            while node.left or node.right:
                if row.iloc[node.feature_index] <= node.threshold:
                    if node.left:
                        node = node.left
                    else:
                        y_pred.append(node.lvalue)
                        break
                else:
                    if node.right:
                        node = node.right
                    else:
                        y_pred.append(node.rvalue)
                        break
            else:
                y_pred.append(node.lvalue if row.iloc[node.feature_index] <= node.threshold else node.rvalue)
                
        return pd.Series(y_pred)

    def score(self, X, y):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        if not isinstance(y, pd.Series):
            raise TypeError("y must be a pandas Series")
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")
        
        y.reset_index(drop=True, inplace=True)
        y_pred = self.predict(X)
        return float((y_pred == y).mean())
    
    def visualize_tree(self):
        """
        Visualize the decision tree.
        This method can be implemented using libraries like graphviz or matplotlib.
        """
        raise NotImplementedError("Visualization method is not implemented yet.")

In [805]:
df = pd.read_csv("data/sample1.csv")
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].map(lambda x: 1 if x == "Yes" else 0)
train_X = df[["Loves Popcorn", "Loves Soda", "Age"]]
train_Y = df["Loves Cool As Ice"]
df

Unnamed: 0,Loves Popcorn,Loves Soda,Age,Loves Cool As Ice
0,0,1,35,1
1,1,1,7,0
2,1,1,38,1
3,1,0,12,0
4,1,0,50,0
5,0,1,18,1
6,0,0,83,0


In [806]:
df.sort_values(by="Age")

Unnamed: 0,Loves Popcorn,Loves Soda,Age,Loves Cool As Ice
1,1,1,7,0
3,1,0,12,0
5,0,1,18,1
0,0,1,35,1
2,1,1,38,1
4,1,0,50,0
6,0,0,83,0


In [807]:
model1 = DecisionTreeClassifier(max_depth=3)
model1.fit(train_X, train_Y)
model1.predict(train_X)

0    1
1    0
2    1
3    0
4    0
5    1
6    0
dtype: int64

In [808]:
model2 = skl_DTC(max_depth=3)
model2.fit(train_X, train_Y)
model2.predict(train_X)

array([1, 0, 1, 0, 0, 1, 0])

In [809]:
titanic_df = pd.read_csv("data/titanic_sample.csv")
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### Data Cleaning

In [810]:
mod_titanic_df = titanic_df.drop(columns=["Name", "Ticket", "Cabin", "Fare"])
mod_titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,male,22.0,1,0,S
1,2,1,1,female,38.0,1,0,C
2,3,1,3,female,26.0,0,0,S
3,4,1,1,female,35.0,1,0,S
4,5,0,3,male,35.0,0,0,S
...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,S
887,888,1,1,female,19.0,0,0,S
888,889,0,3,female,,1,2,S
889,890,1,1,male,26.0,0,0,C


In [811]:
mod_titanic_df.columns[mod_titanic_df.isnull().any()].to_list()

['Age', 'Embarked']

In [812]:
mod_titanic_df["Age"] = mod_titanic_df["Age"].fillna(mod_titanic_df["Age"].mean())
mod_titanic_df["Embarked"] = mod_titanic_df["Embarked"].fillna(mod_titanic_df["Embarked"].mode()[0])
mod_titanic_df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Embarked       False
dtype: bool

In [813]:
#Encoding categorical variables
embarked_map = {x:i for i, x in enumerate(Counter(mod_titanic_df["Embarked"]).keys())}
sex_map = {x:i for i, x in enumerate(Counter(mod_titanic_df["Sex"]).keys())}
mod_titanic_df["Embarked"] = mod_titanic_df["Embarked"].map(embarked_map)
mod_titanic_df["Sex"] = mod_titanic_df["Sex"].map(sex_map)
mod_titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,0,22.000000,1,0,0
1,2,1,1,1,38.000000,1,0,1
2,3,1,3,1,26.000000,0,0,0
3,4,1,1,1,35.000000,1,0,0
4,5,0,3,0,35.000000,0,0,0
...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.000000,0,0,0
887,888,1,1,1,19.000000,0,0,0
888,889,0,3,1,29.699118,1,2,0
889,890,1,1,0,26.000000,0,0,1


In [814]:
titan_train = mod_titanic_df.drop(columns=["Survived"])
titan_test = mod_titanic_df["Survived"]
titan_train_x, titan_test_x, titan_train_y, titan_test_y = train_test_split(titan_train, titan_test, test_size=0.2)

In [815]:
model_titanic = DecisionTreeClassifier(max_depth = 5)
model_titanic.fit(titan_train_x, titan_train_y)
y_pred = model_titanic.predict(titan_test_x)
y_pred

0      1
1      0
2      0
3      0
4      1
      ..
174    0
175    1
176    1
177    1
178    1
Length: 179, dtype: int64

In [816]:
skl_model_titanic = skl_DTC(max_depth=5)
skl_model_titanic.fit(titan_train_x, titan_train_y)
skl_y_pred = pd.Series(skl_model_titanic.predict(titan_test_x))
skl_y_pred

0      1
1      0
2      0
3      0
4      1
      ..
174    0
175    1
176    1
177    1
178    1
Length: 179, dtype: int64

In [817]:
if (y_pred == skl_y_pred).all():
    print("Yes")
else:
    print("No, the predictions do not match.")

No, the predictions do not match.


In [818]:
accuracy = accuracy_score(titan_test_y, y_pred)
skl_accuracy = accuracy_score(titan_test_y, skl_y_pred)
print(f"Custom Model Accuracy: {accuracy}")
print(f"Sklearn Model Accuracy: {skl_accuracy}")

Custom Model Accuracy: 0.7988826815642458
Sklearn Model Accuracy: 0.770949720670391


Edge cases: no data, all same label, single row, etc.