## Constructing Decision Trees

### Top-Down Approach (Categorical Data)

In [29]:
import pandas as pd
import numpy as np
from collections import Counter

# Sample dataset: 14 observations, every column stored as categorical strings.
# 'Play Tennis' is the target column handed to build_tree() further down.
data = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    # Note: 'Windy' holds the strings 'False'/'True', not Python booleans.
    'Windy': ['False', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'True'],
    'Play Tennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}

# Create DataFrame: one column per dict key, one row per observation
df = pd.DataFrame(data)

# Function to calculate entropy
def entropy(target):
    """Return the base-2 Shannon entropy of the labels in ``target``.

    ``target`` is any sized iterable of hashable labels (a list or a pandas
    Series, for example). Each label's relative frequency is used as its
    probability. An empty ``target`` yields 0.
    """
    n_samples = len(target)
    accumulated = 0
    for count in Counter(target).values():
        if count > 0:
            proportion = count / n_samples
            accumulated += proportion * np.log2(proportion)
    return -accumulated

# Function to calculate Information Gain
def information_gain(data, feature, target):
    """Return the entropy reduction obtained by splitting ``data`` on ``feature``.

    The result is the entropy of the ``target`` column minus the weighted
    entropy of the branches, where each distinct value of ``feature`` forms one
    branch weighted by its share of the rows.
    """
    baseline = entropy(data[target])
    n_rows = len(data)
    feature_column = data[feature]

    # One branch per distinct feature value, visited in order of first appearance.
    branches = (data[feature_column == level] for level in feature_column.unique())
    remaining = sum(
        (len(branch) / n_rows) * entropy(branch[target])
        for branch in branches
    )
    return baseline - remaining

# Class for Tree Node
class TreeNode:
    """Node of the decision tree.

    A node is a leaf when ``prediction`` is set. Otherwise ``feature`` names
    the column it splits on and ``children`` maps each feature value to the
    sub-tree for that branch. ``ig`` stores the information gain of the split.
    """

    def __init__(self, feature=None, value=None, children=None, prediction=None, ig=None):
        # Give every node its own dict when the caller supplies no children.
        if children is None:
            children = {}
        self.feature = feature
        self.value = value
        self.children = children
        self.prediction = prediction
        self.ig = ig  # Information gain recorded for this node's split

    def is_leaf(self):
        """Return True when this node carries a prediction."""
        return not (self.prediction is None)

# Function to build the decision tree
def build_tree(data, target):
    """Recursively build a decision tree by greedily maximizing Information Gain.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Every column other than ``target`` is treated as a categorical
            feature.
        target: Name of the column containing the class labels.

    Returns:
        TreeNode: a leaf carrying ``prediction`` when the labels are pure or
        when no feature yields a positive gain; otherwise an internal node
        with one child per distinct value of the chosen feature.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    labels = data[target]
    if len(labels) == 0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    # If all target values are the same, return a leaf node
    if len(labels.unique()) == 1:
        return TreeNode(prediction=labels.values[0])

    best_gain = 0
    best_feature = None

    # Pick candidate features by name rather than by position, so the target
    # column does not have to be the last column of the DataFrame.
    for feature in (column for column in data.columns if column != target):
        gain = information_gain(data, feature, target)
        if gain > best_gain:  # strict comparison: the first feature wins ties
            best_gain = gain
            best_feature = feature

    # No feature separates the remaining labels (e.g. identical feature rows
    # with different labels). Previously this fell through to data[None] and
    # raised KeyError; fall back to a majority-class leaf instead.
    if best_feature is None:
        majority_label = Counter(labels).most_common(1)[0][0]
        return TreeNode(prediction=majority_label)

    # Create a node for the best feature and grow one sub-tree per value
    tree_node = TreeNode(feature=best_feature, ig=best_gain)
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        tree_node.children[value] = build_tree(subset, target)

    return tree_node

# Function to display the tree with feature names and Information Gain
def display_tree(node, level=0):
    """Print a text rendering of the tree rooted at ``node``.

    Leaves print their prediction. Internal nodes print the split feature with
    its stored information gain, then each branch value followed by the
    sub-tree for that branch. ``level`` sets the indentation (4 spaces each).
    """
    indent = " " * (4 * level)

    if node.is_leaf():
        print(f"{indent}Prediction: {node.prediction}")
        return

    print(f"{indent}{node.feature} (IG: {node.ig:.4f})")  # Display feature and IG
    for value, child in node.children.items():
        print(f"{indent}    {value}:")
        display_tree(child, level + 1)

# Build the decision tree from the full DataFrame, with 'Play Tennis' as the target column
decision_tree = build_tree(df, 'Play Tennis')

# Display the decision tree: a header line followed by the recursive text rendering
print("Decision Tree:")
display_tree(decision_tree)


Decision Tree:
Outlook (IG: 0.2467)
    Sunny:
    Humidity (IG: 0.9710)
        High:
        Prediction: No
        Normal:
        Prediction: Yes
    Overcast:
    Prediction: Yes
    Rain:
    Windy (IG: 0.9710)
        False:
        Prediction: Yes
        True:
        Prediction: No


### Top-Down Approach (Numerical Data)

In [4]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text

# Sample dataset: one numerical feature (weight) and a binary class label
data = {
    'Weight (kg)': [22, 25, 47, 15, 40, 10, 5, 35],
    'Class': ['Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes']
}

# Create DataFrame
df = pd.DataFrame(data)

# Features and target
X = df[['Weight (kg)']]  # Input feature(s), kept 2-D as scikit-learn expects
y = df['Class']          # Target variable

# Create and train the decision tree classifier.
# criterion may be 'gini' or 'entropy'; random_state is fixed so that a
# Restart & Run All always reproduces the same tree.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X, y)

# Display the decision tree. Feature names are taken from X itself so the
# printed rules cannot drift from the columns the model was trained on.
tree_rules = export_text(clf, feature_names=list(X.columns))
print("Decision Tree Rules:")
print(tree_rules)


Decision Tree Rules:
|--- Weight (kg) <= 18.50
|   |--- class: No
|--- Weight (kg) >  18.50
|   |--- class: Yes



### CART Algorithm for Classification

In [5]:
import pandas as pd
import numpy as np
from collections import Counter

# Sample dataset: 8 animals described by three categorical attributes
# (no ID column). 'Class' is the target column handed to build_tree() below.
data = {
    'Color': ['Brown', 'Black', 'White', 'Black', 'Green', 'Brown', 'Green', 'White'],
    'Size': ['Small', 'Small', 'Large', 'Large', 'Small', 'Medium', 'Large', 'Medium'],
    'Type': ['Mammal', 'Mammal', 'Mammal', 'Reptile', 'Reptile', 'Mammal', 'Fish', 'Mammal'],
    'Class': ['Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes']
}

# Create DataFrame: one column per dict key, one row per animal
df = pd.DataFrame(data)

# Function to calculate Gini Index
def gini_index(target):
    """Return the Gini impurity of the labels in ``target``.

    Computed as one minus the sum of squared class proportions. An empty
    ``target`` yields 0.
    """
    n_samples = len(target)
    if not n_samples:
        return 0
    squared_shares = [(count / n_samples) ** 2 for count in Counter(target).values()]
    return 1 - sum(squared_shares)

# Function to calculate Gini Gain
def gini_gain(data, feature, target):
    """Return the Gini impurity reduction from splitting ``data`` on ``feature``.

    The result is the impurity of the ``target`` column minus the weighted
    impurity of the branches, where each distinct value of ``feature`` forms
    one branch weighted by its share of the rows.
    """
    baseline = gini_index(data[target])
    n_rows = len(data)
    feature_column = data[feature]

    # One branch per distinct feature value, visited in order of first appearance.
    branches = (data[feature_column == level] for level in feature_column.unique())
    remaining = sum(
        (len(branch) / n_rows) * gini_index(branch[target])
        for branch in branches
    )
    return baseline - remaining

# Class for Tree Node
class TreeNode:
    """Node of the classification tree.

    A node is a leaf when ``prediction`` is set. Otherwise ``feature`` names
    the column it splits on and ``children`` maps each feature value to the
    sub-tree for that branch.
    """

    def __init__(self, feature=None, value=None, children=None, prediction=None):
        self.feature, self.value = feature, value
        # Give every node its own dict when the caller supplies no children.
        self.children = {} if children is None else children
        self.prediction = prediction

    def is_leaf(self):
        """Return True when this node carries a prediction."""
        if self.prediction is None:
            return False
        return True

# Function to build the decision tree using CART algorithm
def build_tree(data, target):
    """Recursively build a classification tree by greedily maximizing Gini Gain.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Every column other than ``target`` is treated as a categorical
            feature.
        target: Name of the column containing the class labels.

    Returns:
        TreeNode: a leaf carrying ``prediction`` when the labels are pure or
        when no feature yields a positive gain; otherwise an internal node
        with one child per distinct value of the chosen feature.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    labels = data[target]
    if len(labels) == 0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    # If all target values are the same, return a leaf node
    if len(labels.unique()) == 1:
        return TreeNode(prediction=labels.values[0])

    best_gain = 0
    best_feature = None

    # Pick candidate features by name rather than by position, so the target
    # column does not have to be the last column of the DataFrame.
    for feature in (column for column in data.columns if column != target):
        gain = gini_gain(data, feature, target)
        if gain > best_gain:  # strict comparison: the first feature wins ties
            best_gain = gain
            best_feature = feature

    # No feature reduces impurity (e.g. identical feature rows with different
    # labels). Previously this fell through to data[None] and raised KeyError;
    # fall back to a majority-class leaf instead.
    if best_feature is None:
        majority_label = Counter(labels).most_common(1)[0][0]
        return TreeNode(prediction=majority_label)

    # Create a node for the best feature and grow one sub-tree per value
    tree_node = TreeNode(feature=best_feature)
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        tree_node.children[value] = build_tree(subset, target)

    return tree_node

# Function to display the tree with feature names and Gini values
def display_tree(node, level=0, data=None, target='Class'):
    """Print a text rendering of the tree with the Gini Gain of each split.

    Args:
        node: Root of the (sub-)tree to print.
        level: Depth of ``node``; each level indents by 4 spaces.
        data: Rows that reach ``node``. Defaults to the notebook-level ``df``,
            which matches the original two-argument call for the root.
        target: Name of the label column used to compute the gain.

    The gain shown for a node is computed on the rows that actually reach that
    node. The recursion narrows ``data`` to the matching branch at every step;
    computing every gain on the full dataset would misreport nested splits.
    """
    if data is None:
        data = df  # backward-compatible default: the global training frame

    indent = " " * (level * 4)  # Indentation based on level
    if node.is_leaf():
        print(f"{indent}Prediction: {node.prediction}")
        return

    gini_value = gini_gain(data, node.feature, target)  # Gain of this split on its own subset
    print(f"{indent}{node.feature} (Gini Gain: {gini_value:.4f})")
    for value, child in node.children.items():
        print(f"{indent}    {value}:")
        branch_rows = data[data[node.feature] == value]
        display_tree(child, level + 1, branch_rows, target)

# Build the decision tree from the full DataFrame, with 'Class' as the target column
decision_tree = build_tree(df, 'Class')

# Display the decision tree: a header line followed by the recursive text rendering
print("Decision Tree:")
display_tree(decision_tree)


Decision Tree:
Type (Gini Gain: 0.4688)
    Mammal:
    Prediction: Yes
    Reptile:
    Prediction: No
    Fish:
    Prediction: No


### CART Algorithm for Regression

In [6]:
import pandas as pd
import numpy as np

# Sample dataset: 8 rows with two numerical feature columns and a price column.
# 'Price (in $)' is the target column handed to build_regression_tree() below.
data = {
    'Size (sq ft)': [1500, 2000, 2500, 1800, 2200, 1300, 1700, 1600],
    'Rooms': [3, 4, 4, 3, 4, 2, 3, 3],
    'Price (in $)': [300000, 400000, 450000, 350000, 410000, 250000, 320000, 310000]
}

# Create DataFrame: one column per dict key, one row per observation
df = pd.DataFrame(data)

# Function to calculate Mean Squared Error (MSE)
def mse(target):
    """Return the mean squared deviation of ``target`` from its own mean.

    ``target`` must support element-wise arithmetic (a NumPy array or pandas
    Series, for example). An empty ``target`` yields 0.
    """
    if not len(target):
        return 0
    deviations = target - np.mean(target)
    return np.mean(deviations ** 2)

# Function to calculate MSE for a split
def mse_split(data, feature, target):
    """Return the MSE reduction obtained by splitting ``data`` on ``feature``.

    The result is the MSE of the ``target`` column minus the weighted MSE of
    the branches, where each distinct value of ``feature`` forms one branch
    weighted by its share of the rows.
    """
    baseline = mse(data[target])
    n_rows = len(data)
    feature_column = data[feature]

    # One branch per distinct feature value, visited in order of first appearance.
    branches = (data[feature_column == level] for level in feature_column.unique())
    remaining = sum(
        (len(branch) / n_rows) * mse(branch[target])
        for branch in branches
    )
    return baseline - remaining

# Class for Tree Node
class TreeNode:
    """Node of the regression tree.

    A node is a leaf when ``prediction`` is set. Otherwise ``feature`` names
    the column it splits on and ``children`` maps each feature value to the
    sub-tree for that branch.
    """

    def __init__(self, feature=None, value=None, children=None, prediction=None):
        # Give every node its own dict when the caller supplies no children.
        branches = children if children is not None else {}
        self.feature = feature
        self.value = value
        self.children = branches
        self.prediction = prediction

    def is_leaf(self):
        """Return True when this node carries a prediction (0 counts as one)."""
        has_prediction = self.prediction is not None
        return has_prediction

# Function to build the regression tree
def build_regression_tree(data, target):
    """Recursively build a regression tree by greedily maximizing MSE reduction.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target: Name of the numerical column to predict.

    Returns:
        TreeNode: a leaf carrying ``prediction`` when all target values are
        equal or when no feature reduces the MSE; otherwise an internal node
        with one child per distinct value of the chosen feature.

    Raises:
        ValueError: If ``data`` has no rows.

    NOTE: every feature is split into one branch per distinct value (equality
    test), so a numerical column whose values are all different produces one
    leaf per row. Binary threshold splits, as used by standard CART, are not
    implemented here.
    """
    targets = data[target]
    if len(targets) == 0:
        raise ValueError("cannot build a regression tree from an empty dataset")

    # If all target values are the same, return a leaf node
    if len(targets.unique()) == 1:
        return TreeNode(prediction=targets.values[0])

    best_mse_gain = 0
    best_feature = None

    # Pick candidate features by name rather than by position, so the target
    # column does not have to be the last column of the DataFrame.
    for feature in (column for column in data.columns if column != target):
        gain = mse_split(data, feature, target)
        if gain > best_mse_gain:  # strict comparison: the first feature wins ties
            best_mse_gain = gain
            best_feature = feature

    # No feature reduces the error (e.g. identical feature rows with different
    # targets). Previously this fell through to data[None] and raised KeyError;
    # predict the mean of the remaining targets instead.
    if best_feature is None:
        return TreeNode(prediction=targets.mean())

    # Create a node for the best feature and grow one sub-tree per value
    tree_node = TreeNode(feature=best_feature)
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        tree_node.children[value] = build_regression_tree(subset, target)

    return tree_node

# Function to display the regression tree
def display_regression_tree(node, level=0):
    """Print a text rendering of the regression tree rooted at ``node``.

    Leaves print their prediction. Internal nodes print the split feature,
    then each branch value followed by the sub-tree for that branch.
    ``level`` sets the indentation (4 spaces each).
    """
    indent = " " * (4 * level)

    if node.is_leaf():
        print(f"{indent}Prediction: {node.prediction}")
        return

    print(f"{indent}{node.feature}")
    for value, child in node.children.items():
        print(f"{indent}    {value}:")
        display_regression_tree(child, level + 1)

# Build the regression tree from the full DataFrame, with 'Price (in $)' as the target column
regression_tree = build_regression_tree(df, 'Price (in $)')

# Display the regression tree: a header line followed by the recursive text rendering
print("Regression Tree:")
display_regression_tree(regression_tree)


Regression Tree:
Size (sq ft)
    1500:
    Prediction: 300000
    2000:
    Prediction: 400000
    2500:
    Prediction: 450000
    1800:
    Prediction: 350000
    2200:
    Prediction: 410000
    1300:
    Prediction: 250000
    1700:
    Prediction: 320000
    1600:
    Prediction: 310000
