In [1]:
# Import the libraries

import pandas as pd
import numpy as np

# Decision Trees

`Decision Tree Algorithm` is a non-parametric supervised learning method used for classification and regression tasks. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.  
  
There are many different algorithms used to implement decision trees. In this notebook, we will implement the most basic one, which is called [ID3](https://en.wikipedia.org/wiki/ID3_algorithm). It is a greedy algorithm that builds a decision tree by selecting the best attribute that yields the highest information gain for each node.

Let us start with a few definitions:

- **Entropy**: Entropy is a measure of the randomness in the information being processed. The higher the entropy, the harder it is to draw any conclusions from that information. In other words, higher entropy means higher uncertainty. Entropy is defined as:

$$H(X) = \mathbb{E} [- \log p(X)] =  -\sum_{x \in X} p(x) \log p(x)$$

where $p(x_i)$ is the probability of the $i^{th}$ outcome. For example, if we have a coin, the probability of getting heads is $p(x_i) = 0.5$ and the probability of getting tails is $p(x_i) = 0.5$. Therefore, the entropy of the coin is:

$$H(X) = -0.5 \log 0.5 - 0.5 \log 0.5 = 1$$


- **Information Gain**: Information gain is the measure of the difference in entropy from before to after the set $S$ is split on an attribute $A$. In other words, how much uncertainty in $S$ was reduced after splitting set $S$ on attribute $A$. Information gain is defined as:

$$IG(S, A) = H(S) - \sum_{t \in T} \frac{|t|}{|S|} H(t)$$

where $S$ is the set of all samples at the current node, $A$ is an attribute being tested, $T$ is the set of all possible subsets of $S$ resulting from splitting on attribute $A$, and $H(t)$ is the entropy of subset $t$.


The main essence of the ID3 algorithm is to select the attribute that has the highest information gain to split the data at each node. The algorithm will stop when all attributes have been used or when all instances of the node belong to the same class. (i.e. the entropy is zero).

## Data Preprocessing

Let's start with loading the training datasets into a pandas dataframe

In [11]:
df = pd.read_csv('car_train.csv')
df.head()

Unnamed: 0,Buying Price,Maintenance Price,Number of Doors,Capacity,Size of Luggage Boot,Estimated Safety,Decision
0,low,low,3,2,small,high,unacc
1,low,high,4,2,big,high,unacc
2,vhigh,low,3,4,small,high,acc
3,vhigh,med,5more,more,small,med,unacc
4,low,vhigh,4,more,med,high,acc


In [7]:
#one hot encoding 
df = pd.get_dummies(df, columns=['Buying Price','Maintenance Price','Number of Doors','Capacity','Size of Luggage Boot','Estimated Safety','Decision']).astype(float)
df.head()

Unnamed: 0,Buying Price_high,Buying Price_low,Buying Price_med,Buying Price_vhigh,Maintenance Price_high,Maintenance Price_low,Maintenance Price_med,Maintenance Price_vhigh,Number of Doors_2,Number of Doors_3,...,Size of Luggage Boot_big,Size of Luggage Boot_med,Size of Luggage Boot_small,Estimated Safety_high,Estimated Safety_low,Estimated Safety_med,Decision_acc,Decision_good,Decision_unacc,Decision_vgood
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [13]:
class Tree:
    def __init__(self, max_depth=None):
        self.root = None
        self.max_depth = max_depth
    
    def fit(self, dataset: pd.DataFrame, features: list, target: str, node: Node=None, depth: int=1):
        """Trains the decision tree model on the given dataset

        Args:
            dataset (pd.DataFrame): Dataset on which we want to train the model
            features (list): List of features to consider for training
            target (str): The target attribute
            node (Node): The current node. Defaults to None.
            depth (int): Current depth of the tree.
        """

        if node is not None and node.is_pure(): # Do not split if the node is pure
            return
        
        if self.max_depth is not None and depth > self.max_depth: # Do not split if the max depth is reached
            return
        
        if node is None:
            node = Node(dataset, features, target)
            self.root = node
        
        # Find the best feature to split on
        best_feature = node.find_best_feature()
        node.best_feature = best_feature # Set the best feature of the node
        best_feature_values = dataset[best_feature].unique() # Find the unique values of the best feature

        for best_feature_value in best_feature_values:

            # Create a subset of the dataset which contains only the current best feature value and remove the best feature from the dataset
            #! NOTE: Remember to create a copy of the dataset while removing the feature. Otherwise, the original dataset will be modified
            best_feature_subset = None # TODO: Create a subset of the dataset which contains only the current best feature value and remove the best feature from the dataset

            # Create a subset of the features which does not contain the best feature
            best_feature_subset_features = None # TODO: Create a subset of the features which does not contain the best feature

            # Create a new node for the best split
            best_feature_subset_root = Node(best_feature_subset, best_feature_subset_features, target)
            node.best_feature_values[best_feature_value] = best_feature_subset_root

            # Recursively fit the model on the best feature subset
            self.fit(best_feature_subset, best_feature_subset_features, target, node=best_feature_subset_root, depth=depth+1)
        
    def predict(self, features: pd.Series) -> tuple:
        """Predict the class for the given features using the trained model

        Args:
            features (pd.Series): features

        Returns:
            str: Prediction of the class
            list: List of decisions made by the model. These are the features on which the model split the dataset
        """


        node = self.root
        decisions = []

        
        return prediction, decisions

Let's get our list of features for our decision tree

In [14]:
features = df.columns.tolist()
features.remove('Decision')
target = 'Decision'

Create an instance of the tree

In [34]:
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier instance
clf = DecisionTreeClassifier()
df.head()

Unnamed: 0,Buying Price,Maintenance Price,Number of Doors,Capacity,Size of Luggage Boot,Estimated Safety,Decision
0,low,low,3,2,small,high,unacc
1,low,high,4,2,big,high,unacc
2,vhigh,low,3,4,small,high,acc
3,vhigh,med,5more,more,small,med,unacc
4,low,vhigh,4,more,med,high,acc


Fit the tree to the training data

In [39]:
clf.fit(df, features, target)

ValueError: could not convert string to float: 'Buying Price'

## Predictions on the Test Set

In [36]:
df_test = pd.read_csv('car_test.csv')
df_test.head()

Unnamed: 0,Buying Price,Maintenance Price,Number of Doors,Capacity,Size of Luggage Boot,Estimated Safety,Decision
0,low,vhigh,2,more,med,high,acc
1,vhigh,high,2,4,big,high,unacc
2,high,med,2,2,small,med,unacc
3,vhigh,med,3,2,big,med,unacc
4,low,med,5more,2,big,low,unacc


In [None]:
preds = []
decisions = []
for _, row in df_test.iterrows():
    # Write code to predict the class of the current set of features and append the prediction to the preds list
    # Also append the decisions made by the model to the decisions list
    
    pred, decision = clf.predict(row)
    preds.append(pred)
    decisions.append(decision)


In [None]:
# Let's check the accuracy of the model

true = df_test['Decision'].tolist()

correct = 0
for i in range(len(true)):
    if true[i] == preds[i]:
        correct += 1

accuracy = correct / len(true)

print(f'Accuracy: {accuracy}')