# Decision Tree Algorithm Implementation

In [2]:
import numpy as np
import pandas as pd

### Function to calculate 'Entropy'

In [3]:
# Draw 10 random binary labels (0 or 1; `high` is exclusive) to use as sample input for entropy().
# NOTE(review): no random seed is set, so the values change on every run.
y = np.random.randint(low = 0, high=2, size=10)
y

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0])

In [4]:
def entropy(variable):
    """Compute the Shannon entropy (in bits) of a collection of discrete values.

    Parameters
    ----------
    variable : array-like
        Observed values / class labels. The input is flattened, so every
        element counts as one observation.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. ``0.0`` for an empty or single-valued input.
    """
    values = np.asarray(variable).ravel()

    # No observations means no uncertainty; also avoids dividing by zero below.
    if values.size == 0:
        return 0.0

    _, counts = np.unique(values, return_counts=True)
    probabilities = counts / values.size

    # Subtracting from 0.0 (instead of multiplying by -1) keeps a zero result
    # as +0.0 rather than the confusing -0.0.
    return float(0.0 - np.sum(probabilities * np.log2(probabilities)))

In [5]:
# Entropy of the random binary sample `y` generated above.
entropy(y)

0.8812908992306927

In [6]:
# Sanity check: a sample containing a single distinct value.
var = np.array([1,1,1,1,1,1])
entropy(var)   # should be zero because there is no randomness/uncertainty

-0.0

In [7]:
# Sanity check: two distinct values that occur equally often.
var = np.array([1,1,1,0,0,0])
entropy(var)   # should be one: maximum uncertainty, since both unique values are equally likely (50% each)

1.0

### Function to calculate 'Information Gain'

In [8]:
# Load the golf dataset (path is relative to the notebook's working directory)
# and preview the first rows.
data = pd.read_csv("Datasets/golf.csv")
data.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [36]:
def feature_points_dataset(data, target_col, column_to_split):
    """Partition a dataframe into one sub-dataframe per distinct value of a column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to partition.
    target_col : pandas.Series
        Target column; it is appended to ``data`` when a column with its name
        is not already present.
    column_to_split : hashable
        Name of the column whose distinct values define the partitions.

    Returns
    -------
    dict or None
        Maps each distinct value of ``column_to_split`` (in ``value_counts``
        order) to the rows of ``data`` holding that value. When the column
        does not exist, a message is printed and ``None`` is returned.
    """
    label_name = target_col.name

    # Ensure the target column travels with every partition.
    if label_name not in data.columns:
        data = pd.concat([data, target_col], axis=1)

    # Guard clause: nothing to split on.
    if column_to_split not in data.columns:
        print(f"Column '{column_to_split}' not found in the dataframe!")
        return None

    split_values = data[column_to_split].value_counts().index.tolist()
    return {
        value: data[data[column_to_split] == value]
        for value in split_values
    }

In [39]:
# Split the golf data into one sub-dataframe per distinct 'Outlook' value.
feature_points_dataset(data, target_col=data['Play'], column_to_split = 'Outlook')

{'sunny':    Outlook Temperature Humidity  Windy Play
 0    sunny         hot     high  False   no
 1    sunny         hot     high   True   no
 7    sunny        mild     high  False   no
 8    sunny        cool   normal  False  yes
 10   sunny        mild   normal   True  yes,
 'rainy':    Outlook Temperature Humidity  Windy Play
 3    rainy        mild     high  False  yes
 4    rainy        cool   normal  False  yes
 5    rainy        cool   normal   True   no
 9    rainy        mild   normal  False  yes
 13   rainy        mild     high   True   no,
 'overcast':      Outlook Temperature Humidity  Windy Play
 2   overcast         hot     high  False  yes
 6   overcast        cool   normal   True  yes
 11  overcast        mild     high   True  yes
 12  overcast         hot   normal  False  yes}

In [40]:
# A column that does not exist: the function prints a message and returns None.
feature_points_dataset(data, target_col=data['Play'], column_to_split = 'RandomCol')

Column 'RandomCol' not found in the dataframe!


In [41]:
def information_gain(data, target, col=None):
    """Compute the information gain of feature columns with respect to a target.

    Information gain is the entropy of the target minus the size-weighted
    average entropy of the target inside each partition produced by splitting
    on the feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    target : hashable
        Name of the target column in ``data``.
    col : hashable, optional
        When given, only this feature is evaluated and its gain is returned.

    Returns
    -------
    dict, float or None
        * ``col is None``: dict mapping every non-target column to its gain.
        * ``col`` names a feature: that feature's gain.
        * ``col`` is not a feature of ``data``: a message is printed and
          ``None`` is returned.
        Gains are rounded to 3 decimal places.
    """
    features = data.columns[data.columns != target].tolist()

    # Compare against None (not truthiness) so falsy labels such as 0 or ''
    # are handled like any other column name, and validate before doing work.
    if col is not None and col not in features:
        print(f"The column you passed '{col}' is not present in the dataset!")
        return None

    entropy_target = entropy(data[target])
    # Only evaluate what the caller asked for.
    selected_features = features if col is None else [col]

    ig = {}
    for feature in selected_features:
        partitions = feature_points_dataset(data, target_col=data[target], column_to_split=feature)
        total_rows = sum(part.shape[0] for part in partitions.values())

        # Weighted average of the target entropy within each partition.
        weighted_entropy = 0
        for part in partitions.values():
            weighted_entropy = weighted_entropy + (part.shape[0] / total_rows) * entropy(part[target])

        ig[feature] = round(entropy_target - weighted_entropy, 3)

    return ig if col is None else ig[col]

In [42]:
# Information gain of every feature with respect to the 'Play' target.
information_gain(data, 'Play')

{'Outlook': 0.247, 'Temperature': 0.029, 'Humidity': 0.152, 'Windy': 0.048}

In [43]:
# Information gain of a single feature.
information_gain(data, 'Play', 'Outlook')

0.247

In [44]:
# A misspelled column name: the function prints a message and returns None.
information_gain(data, 'Play', 'Wiindy')

The column you passed 'Wiindy' is not present in the dataset!


### Decision Tree Code Implementation

In [13]:
class DecisionTree():
    """Decision-tree node for categorical features, split by information gain.

    Each instance is one node. After ``train``:

    * ``target`` is the majority label of the samples that reached the node,
    * ``feature_key`` is the column the node splits on (``None`` for a leaf),
    * ``children`` maps each observed value of ``feature_key`` to a child node.
    """

    def __init__(self, depth = 0, max_depth = 9):
        """Create an untrained node.

        Parameters
        ----------
        depth : int
            Depth of this node in the tree (0 for the root).
        max_depth : int
            Nodes at this depth are never split further.
        """
        self.children = {}
        self.feature_key = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None

    def train(self, X, y):
        """Fit this node, and recursively its children, on the given samples.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature columns; may or may not already contain the target column.
        y : pandas.Series
            Target labels. Must be a named, non-empty Series.

        Raises
        ------
        ValueError
            If ``y`` is unnamed or empty.
        """
        target_name = y.name
        if target_name is None:
            raise ValueError("y must be a named Series so the target column can be identified")
        if len(y) == 0:
            raise ValueError("cannot train a DecisionTree node on an empty target")

        # information_gain() looks the target up inside the frame, so make sure it is there.
        if target_name not in X.columns:
            X = pd.concat([X, y], axis = 1)

        # Majority label: value_counts() sorts by frequency, most common first.
        self.target = y.value_counts().index[0]

        # Reset split state so that re-training does not keep a stale subtree.
        self.children = {}
        self.feature_key = None

        # Stop when the depth limit is hit, the node is pure, or no feature is left.
        feature_names = [name for name in X.columns if name != target_name]
        if self.depth >= self.max_depth or y.nunique() <= 1 or not feature_names:
            return

        # finding the best key
        ig_dict = information_gain(X, target_name)
        best_key = max(ig_dict, key=ig_dict.get)

        # A split without any gain (gains are rounded to 3 decimals) is pointless.
        if ig_dict[best_key] <= 0:
            return
        self.feature_key = best_key

        # splitting the data by the best key
        partitions = feature_points_dataset(data = X, target_col = y, column_to_split = best_key)

        # One child per observed value; the used feature is dropped because every
        # row of a partition shares the same value for it.
        for value, subset in partitions.items():
            child = DecisionTree(depth = self.depth + 1, max_depth = self.max_depth)
            child.train(subset.drop(columns = [best_key]), subset[target_name])
            self.children[value] = child
    
    

In [19]:
# Pick the feature with the highest information gain (the dict key whose value is largest).
d = information_gain(data, 'Play')
max(d, key=d.get)

'Outlook'

In [45]:
# Class distribution of the target column.
data.Play.value_counts()

yes    9
no     5
Name: Play, dtype: int64