---
#   **Decision Tree**
---

# Importing the Libraries

In [15]:
import pandas as pd
import math

# Count the occurrences of each value in a column

In [16]:
def calculate_counts(data, column):
    """Count how many times each distinct value occurs in one column.

    Args:
        data: pandas DataFrame that contains ``column``.
        column: Label of the column to tally.

    Returns:
        dict mapping each distinct value found in ``data[column]`` to its
        number of occurrences, as reported by ``Series.value_counts()``.
    """
    frequencies = data[column].value_counts()
    return frequencies.to_dict()

# Function to calculate Entropy

In [17]:
def calculate_entropy(counts):
    """Compute the base-2 Shannon entropy of a class-count distribution.

    Args:
        counts: dict mapping each class label to its number of occurrences.
            Labels with a count of 0 are allowed and are ignored.

    Returns:
        The entropy in bits, rounded to 3 decimal places. An empty mapping,
        or one whose counts are all zero, yields 0.
    """
    total_count = sum(counts.values())
    # No observations at all: report zero entropy instead of dividing by zero.
    if total_count == 0:
        return 0

    entropy = 0
    for count in counts.values():
        # A class that never occurs contributes nothing (p * log2(p) -> 0 as
        # p -> 0). Skipping it avoids math.log2(0), which raises ValueError.
        # Zero counts appear e.g. when value_counts() reports unobserved
        # categories of a categorical column.
        if count == 0:
            continue
        probability = count / total_count
        entropy -= probability * math.log2(probability)
    return round(entropy, 3)

# Function to calculate Information Gain

In [18]:
def calculate_information_gain(data, column, target):
    """Measure how much splitting on ``column`` reduces the entropy of ``target``.

    The gain is the entropy of ``target`` over all rows minus the size-weighted
    sum of the target entropy inside each group of rows that share one value
    of ``column``.

    Args:
        data: pandas DataFrame containing both ``column`` and ``target``.
        column: Label of the candidate split column.
        target: Label of the class column.

    Returns:
        The information gain rounded to 3 decimal places. The entropies it
        combines come from ``calculate_entropy`` and are therefore already
        rounded to 3 decimals before being subtracted.
    """
    total_rows = len(data)
    parent_entropy = calculate_entropy(calculate_counts(data, target))

    # Accumulate each partition's entropy, weighted by its share of the rows.
    split_entropy = 0
    for category in data[column].unique():
        partition = data[data[column] == category]
        partition_counts = calculate_counts(partition, target)
        weight = len(partition) / total_rows
        split_entropy += weight * calculate_entropy(partition_counts)

    return round(parent_entropy - split_entropy, 3)

# Calculate Entropy and Information Gain of all columns in the dataset

In [19]:
def calculate_all(data, column_names, target):
    """Print an entropy / information-gain report per column and the best split.

    For every column in ``column_names`` this prints the count of each value,
    the target-class counts and entropy within each value's subset, the
    size-weighted entropy of the split, and the column's information gain.
    It finally prints the column with the highest information gain as the
    root node (the first such column wins on ties).

    Args:
        data: pandas DataFrame containing the listed columns and ``target``.
        column_names: Iterable of column labels to evaluate as candidate
            splits (a list, a pandas Index, or any other iterable).
        target: Label of the class column.

    Returns:
        None. All results are written to standard output.

    Raises:
        ValueError: If ``column_names`` is empty, because no root node can be
            selected.
    """
    # Materialise once: this supports any iterable and lets us test for
    # emptiness without hitting the ambiguous truth value of a pandas Index.
    column_names = list(column_names)
    if not column_names:
        raise ValueError(
            "column_names is empty: no candidate columns to evaluate"
        )

    total_rows = len(data)
    values = {}
    for column in column_names:
        print("=========================================")
        print(f"Split: {column}")
        print(f"{column} Count:")
        counts = calculate_counts(data, column)
        column_entropy = 0
        for value, count in counts.items():
            print(f"   {value}: {count}")

            # Class distribution of the target among rows with this value.
            subset = data[data[column] == value]
            subset_counts = calculate_counts(subset, target)
            for key, val in subset_counts.items():
                print(f"      {key}:{val}")
            subset_entropy = calculate_entropy(subset_counts)
            print(f"        Entropy of {column} = {value}: {subset_entropy}")

            # Weight each subset's entropy by its share of all rows.
            column_entropy += (count / total_rows) * subset_entropy

        print(f"Entropy of {column} = {round(column_entropy, 3)}")
        info_gain = calculate_information_gain(data, column, target)
        print(f"Information Gain of {column} = {info_gain}")
        values[column] = info_gain
        print("=========================================")

    # The column with the largest information gain becomes the root node.
    root_node = max(values, key=values.get)
    print(f"Root Node of the Decision Tree is: {root_node}")
    print("=========================================")

# Reading the Dataset

In [20]:
# Load the Play Tennis dataset from the notebook's working directory.
data = pd.read_csv('./play_tennis.csv')

# Every column between the first and the last is a candidate split attribute
# (the first column is skipped by position); the last column is the class label.
column_names = data.columns[1:-1]
target = data.columns[-1]

# NOTE(review): this list is not referenced by any code visible in this part
# of the notebook; the first column is excluded by the slice above, not by
# this list. Confirm whether later cells rely on it.
excluded_columns = ['Day']

# Calculation of Entropy, Information Gain and finding the Root Node

In [21]:
# Print the per-column report and the selected root node for the loaded dataset.
calculate_all(data=data, column_names=column_names, target=target)

Split: Outlook
Outlook Count:
   Sunny: 5
      No:3
      Yes:2
        Entropy of Outlook = Sunny: 0.971
   Rain: 5
      Yes:3
      No:2
        Entropy of Outlook = Rain: 0.971
   Overcast: 4
      Yes:4
        Entropy of Outlook = Overcast: 0.0
Entropy of Outlook = 0.694
Information Gain of Outlook = 0.246
Split: Temperature
Temperature Count:
   Mild: 6
      Yes:4
      No:2
        Entropy of Temperature = Mild: 0.918
   Hot: 4
      No:2
      Yes:2
        Entropy of Temperature = Hot: 1.0
   Cool: 4
      Yes:3
      No:1
        Entropy of Temperature = Cool: 0.811
Entropy of Temperature = 0.911
Information Gain of Temperature = 0.029
Split: Humidity
Humidity Count:
   High: 7
      No:4
      Yes:3
        Entropy of Humidity = High: 0.985
   Normal: 7
      Yes:6
      No:1
        Entropy of Humidity = Normal: 0.592
Entropy of Humidity = 0.788
Information Gain of Humidity = 0.151
Split: Wind
Wind Count:
   Weak: 8
      Yes:6
      No:2
        Entropy of Wind = Weak: 

---