# Decision Trees Algorithm Implementation

In [1]:
import numpy as np
import pandas as pd

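### Function to calculate entropy

Entropy measures the uncertainty of a variable: $H(Y) = -\sum_{i} p_i \log_2 p_i$, where $p_i$ is the proportion of samples that take the $i$-th unique value.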

In [2]:
# Draw 10 random integer labels from {0, 1} (`high` is exclusive) as a toy input for entropy.
# NOTE(review): no seed is set in the cells shown, so these values change on every run.
y = np.random.randint(low = 0, high=2, size=10)
y

array([1, 0, 1, 0, 0, 1, 0, 0, 0, 1])

In [3]:
def entropy(variable):
    """Return the Shannon entropy (in bits) of the values in `variable`.

    Parameters
    ----------
    variable : array-like
        Observed values (list, NumPy array, pandas Series, ...). Each distinct
        value is treated as one class; the input is flattened before counting.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct value. The result is ``0.0`` for an empty input or an input
        with a single distinct value, and never the negative zero ``-0.0``.
    """
    values = np.asarray(variable)
    _, counts = np.unique(values, return_counts=True)

    # Use the number of counted elements as the denominator so the
    # probabilities always sum to one, whatever the shape of the input.
    total = counts.sum()
    if total == 0:
        # Nothing observed means no uncertainty (and avoids dividing by zero).
        return 0.0

    probs = counts / total
    # np.unique only reports values that occur, so every p is > 0 and log2 is finite.
    # Adding 0.0 turns the IEEE-754 negative zero produced for a pure input into +0.0.
    return -np.sum(probs * np.log2(probs)) + 0.0

In [4]:
# Entropy of the random 0/1 labels in `y`.
entropy(y)

0.9709505944546686

In [5]:
# Sanity check: an array with a single distinct value carries no uncertainty.
var = np.array([1,1,1,1,1,1])
entropy(var)   # should be zero because there is no randomness/uncertainty

-0.0

In [6]:
# Sanity check: two values that each occur 50% of the time give the maximum entropy for a binary variable.
var = np.array([1,1,1,0,0,0])
entropy(var)   # should be one: both unique values are equally likely, so uncertainty is at its maximum

1.0

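### Calculating information gain

The information gain of a feature $A$ with respect to the target $Y$ is $IG(Y, A) = H(Y) - \sum_{v} \frac{|Y_v|}{|Y|} H(Y_v)$, where $Y_v$ holds the target values of the rows in which $A = v$.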

In [7]:
# Load the golf dataset from a path relative to the working directory and preview the first rows.
data = pd.read_csv("Datasets/golf.csv")
data.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [37]:
def information_gain(data, target, col=None):
    """Compute the information gain of feature columns with respect to `target`.

    For a feature, the gain is the entropy of the target column minus the
    weighted average of the target entropy inside each group of rows sharing
    one feature value. Weights are the group sizes reported by
    ``value_counts()``, so rows whose feature value is missing are not counted.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the target column.
    target : column label
        Label of the target column; every other column is treated as a feature.
    col : column label, optional
        When given and present among the features, only that feature's gain is
        computed and returned. When given but not a feature column, a notice is
        printed and the gains of all features are returned instead.

    Returns
    -------
    dict or float
        ``{feature: gain}`` for all features, or the single gain of `col`.
        Gains are rounded to three decimals.
    """
    entropy_target = entropy(data[target])
    features = data.columns[data.columns != target].tolist()

    # Compare against None rather than relying on truthiness, so that falsy but
    # valid labels (for example the integer column 0) are honoured.
    single = col is not None
    if single and col not in features:
        print(f"The column you passed '{col}' is not present in the dataset! \nTherefore, returning the dict of information gain for all columns.\n")
        single = False

    # Only do the work that will actually be returned.
    selected = [col] if single else features

    ig = {}
    for feature in selected:
        level_counts = data[feature].value_counts()
        n_counted = level_counts.sum()

        # Weighted average of the target entropy within each feature value.
        conditional_entropy = 0
        for level, count in level_counts.items():
            target_in_level = data.loc[data[feature] == level, target]
            conditional_entropy += (count / n_counted) * entropy(target_in_level)

        ig[feature] = round(entropy_target - conditional_entropy, 3)

    return ig[col] if single else ig

In [39]:
# Without a column argument, the gain of every non-target column is returned as a dict.
information_gain(data, 'Play')

{'Outlook': 0.247, 'Temperature': 0.029, 'Humidity': 0.152, 'Windy': 0.048}

In [40]:
# Passing a feature name as the third argument returns only that feature's gain.
information_gain(data, 'Play', 'Outlook')

0.247

In [38]:
# Misspelled column name: the function prints a notice and falls back to the gains of all features.
information_gain(data, 'Play', 'Wiindy')

The column you passed 'Wiindy' is not present in the dataset! 
Therefore, returning the dict of information gain for all columns.



{'Outlook': 0.247, 'Temperature': 0.029, 'Humidity': 0.152, 'Windy': 0.048}