In [33]:
import math
from collections import Counter
from graphviz import Digraph
import pandas as pd

In [34]:
# Node class

class DecisionTreeNode:
    def __init__(self, feature=None, threshold=None, children=None, label=None, value=None):
        self.feature = feature #feature index for splitting
        self.threshold = threshold #threshold value for splitting
        self.children = children or {} #dict of child nodes
        self.label = label #class label for leaf nodes
        self.value = value #class dist at the node

    def is_leaf(self):
        return self.label is not None 

In [None]:
# Decision tree class
class DecisionTree:
    def __init__(self, attributes, types):
        self.attributes = attributes #list of attribute names
        self.types = types #list of attribute types ('categorical' or 'continuous')
        self.root = None #root node of the tree

    def entropy(self, data):
        if not data:
            return 0
        labels = [row[-1] for row in data]
        total = len(labels)
        count = Counter(labels)
        return -sum((c/total) * math.log2(c/total) for c in count.values() if c > 0)
    
    def split_numeric(self, data, feature_idx, threshold):
        left = [row for row in data if float(row[feature_idx]) <= threshold]
        right = [row for row in data if float(row[feature_idx]) > threshold]
        return left, right
    
    def info_gain(self, data, feature_idx, is_numeric=False):
        if not data:
            return -1, None
            
        total_entropy = self.entropy(data)

        if is_numeric:
            # Get unique values and sort them
            values = sorted(set(float(row[feature_idx]) for row in data))
            if len(values) <= 1:
                return -1, None  # Can't split if only one value
                
            best_gain, best_threshold = -1, None
            for i in range(len(values) - 1):
                threshold = (values[i] + values[i+1]) / 2
                left, right = self.split_numeric(data, feature_idx, threshold)
                if not left or not right:
                    continue
                weighted_entropy = (len(left)/len(data) * self.entropy(left)) + (len(right)/len(data) * self.entropy(right))
                gain = total_entropy - weighted_entropy
                if gain > best_gain:
                    best_gain, best_threshold = gain, threshold
            return best_gain, best_threshold
        
        else:
            # Categorical attribute
            values = set(row[feature_idx] for row in data)
            if len(values) <= 1:
                return -1, None  # Can't split if only one value
                
            weighted_entropy = 0
            for v in values:
                subset = [row for row in data if row[feature_idx] == v]
                if subset:  # Only calculate if subset is not empty
                    weighted_entropy += len(subset)/len(data) * self.entropy(subset)
            gain = total_entropy - weighted_entropy
            return gain, None
        
    def best_split(self, data, attributes):
        if not data or not attributes:
            return None, None, None
            
        best_gain, best_idx, best_threshold, best_feature = -1, None, None, None

        for attr in attributes:
            feature_idx = self.attributes.index(attr)  # Get index from original attributes
            is_numeric = self.types[attr] == 'numeric'
            gain, threshold = self.info_gain(data, feature_idx, is_numeric)
            
            if gain is not None and gain > best_gain:
                best_gain, best_idx, best_threshold, best_feature = gain, feature_idx, threshold, attr

        return best_idx, best_threshold, best_feature
    
    def build_tree(self, data, attributes):
        if not data:
            return DecisionTreeNode(label=None)
            
        labels = [row[-1] for row in data]

        # Base case 1: pure label
        if len(set(labels)) == 1:
            return DecisionTreeNode(label=labels[0])
        
        # Base case 2: no attributes left
        if not attributes:
            majority = Counter(labels).most_common(1)[0][0]
            return DecisionTreeNode(label=majority)
        
        best_idx, best_threshold, best_attr = self.best_split(data, attributes)
        
        # Base case 3: no good split found
        if best_attr is None:
            majority = Counter(labels).most_common(1)[0][0]
            return DecisionTreeNode(label=majority)

        node = DecisionTreeNode(feature=best_attr, threshold=best_threshold)

        if self.types[best_attr] == 'numeric':
            left, right = self.split_numeric(data, best_idx, best_threshold)
            if not left or not right:
                majority = Counter(labels).most_common(1)[0][0]
                return DecisionTreeNode(label=majority)
                
            # For numeric, we can reuse the attribute
            node.children = {
                f" <= {best_threshold:.2f}": self.build_tree(left, attributes),
                f" > {best_threshold:.2f}": self.build_tree(right, attributes)
            }
        else:
            # Categorical attribute
            unique_values = set(row[best_idx] for row in data)
            node.children = {}
            
            # Remove the used attribute for categorical splits
            remaining_attrs = [attr for attr in attributes if attr != best_attr]
            
            for value in unique_values:
                subset = [row for row in data if row[best_idx] == value]
                if subset:
                    node.children[value] = self.build_tree(subset, remaining_attrs)
                else:
                    # If no data for this value, use majority class
                    majority = Counter(labels).most_common(1)[0][0]
                    node.children[value] = DecisionTreeNode(label=majority)

        return node
    
    def fit(self, data):
        if not data:
            raise ValueError("No data provided for training")
        self.root = self.build_tree(data, self.attributes[:])  # Use copy of attributes

    def predict(self, sample):
        return self._predict_node(self.root, sample)
    
    def _predict_node(self, node, sample):
        if node.is_leaf():
            return node.label
        
        if node.feature is None:
            return None
            
        feature_idx = self.attributes.index(node.feature)
        
        if self.types[node.feature] == 'numeric':
            value = float(sample[feature_idx])
            if value <= node.threshold:
                child_key = f" <= {node.threshold:.2f}"
            else:
                child_key = f" > {node.threshold:.2f}"
                
            if child_key in node.children:
                return self._predict_node(node.children[child_key], sample)
        else:
            value = sample[feature_idx]
            if value in node.children:
                return self._predict_node(node.children[value], sample)
        
        # If we can't find a matching child, return None
        return None
            
    def visualize(self, filename="decision_tree"):
        dot = Digraph()
    
        def add_nodes_edges(node, parent=None, edge_label=""):
            if node is None:
                return
            
            # Create node label
            if node.is_leaf():
                label = f"Label: {node.label}"
                color = "lightblue"
            else:
                if node.threshold is not None:
                    label = f"{node.feature}\\n<= {node.threshold:.2f}?"
                else:
                    label = f"{node.feature}"
                color = "lightgrey"

            dot.node(str(id(node)), label=label, shape='box', style='filled', fillcolor=color)

            # Add edge from parent
            if parent is not None:
                edge_label_str = str(edge_label) if edge_label is not None else ""
                dot.edge(str(id(parent)), str(id(node)), label=edge_label_str)

            # Recursively add children
            if not node.is_leaf() and node.children:
                for child_edge_label, child in node.children.items():
                    add_nodes_edges(child, node, child_edge_label)

        add_nodes_edges(self.root)
        dot.render(filename, format="png", cleanup=True)
        print(f"Tree visual saved as '{filename}.png'")

    # method to print the tree structure for debugging
    def print_tree(self, node=None, depth=0):
        if node is None:
            node = self.root
        if node is None:
            print("Tree is empty")
            return
        
        indent = "  " * depth
        if node.is_leaf():
            print(f"{indent}Leaf: {node.label}")
        else:
            print(f"{indent}{node.feature} (threshold: {node.threshold})")
            for edge_label, child in node.children.items():
                print(f"{indent}  -> {edge_label}:")
                self.print_tree(child, depth + 2)

In [None]:
if __name__ == "__main__":
    
    data = pd.read_csv("Loan_Approval_Prediction.csv")
    
   # inspectin data
    print("Data columns:", data.columns.tolist())
    print("Data shape:", data.shape)
    print("First few rows:")
    print(data.head())
    
    # Use the correct column names based 
    attributes = ['AGE', 'JOB_STATUS', 'OWNS_HOUSE', 'CREDIT_RATING']
    types = {
        'AGE': 'categorical',
        'JOB_STATUS': 'categorical', 
        'OWNS_HOUSE': 'categorical',
        'CREDIT_RATING': 'categorical'
    }

    # Convert to list and ensure we have the right columns
    data_list = data[attributes + [data.columns[-1]]].values.tolist()
    
    print(f"First data row: {data_list[0]}")
    print(f"Number of samples: {len(data_list)}")
    
    # Fit the decision tree
    tree = DecisionTree(attributes, types)
    tree.fit(data_list)
    
    print("Root Feature:", tree.root.feature)
    print("Root Threshold:", tree.root.threshold)
    print("Root Label:", tree.root.label)
    
    # tree structure for debugging
    print("\n=== Tree Structure ===")
    tree.print_tree()
    
    # visualize
    tree.visualize("loan_approval_tree")

    # Predict a sample
    sample = ['MIDDLE', "TRUE", "FALSE", "GOOD"]
    print(f"Prediction for sample {sample}: {tree.predict(sample)}")

Data columns: ['ID', 'AGE', 'JOB_STATUS', 'OWNS_HOUSE', 'CREDIT_RATING', 'CLASS']
Data shape: (15, 6)
First few rows:
   ID    AGE  JOB_STATUS  OWNS_HOUSE CREDIT_RATING CLASS
0   1  Young       False       False          Fair    No
1   2  Young       False       False          Good    No
2   3  Young        True       False          Good   Yes
3   4  Young        True        True          Fair   Yes
4   5  Young       False       False          Fair    No
First data row: ['Young', False, False, 'Fair', 'No']
Number of samples: 15
Root Feature: OWNS_HOUSE
Root Threshold: None
Root Label: None

=== Tree Structure ===
OWNS_HOUSE (threshold: None)
  -> False:
    JOB_STATUS (threshold: None)
      -> False:
        Leaf: No
      -> True:
        Leaf: Yes
  -> True:
    Leaf: Yes
Tree visual saved as 'loan_approval_tree.png'
Prediction for sample ['MIDDLE', 'TRUE', 'FALSE', 'GOOD']: None


In [None]:
import numpy as np
import pandas as pd
from collections import Counter

def entropy(data):
    labels = [row[-1] for row in data]
    total = len(labels)
    count = Counter(labels)
    return 

def info_gain(data, feature_idx):
    total_entropy = entropy(dataa)
    values = set(row[feature] for row in data)
    weighted_entropy = 0
    for v in values:
        subset = [row for row in data if row[feature_idx] == v]
        if subset:
            weighted_entropy += len(subset)/len(data) * entropy(subset)
            
    gain = total - weighted_entropy
    return gain

gain = {}
for col in cols:
    gain[col] = info_gain(data, col)

root = max(gain, key=gain.get)


Information Gain for ID: 0.9710
Information Gain for AGE: 0.0830
Information Gain for JOB_STATUS: 0.3237
Information Gain for OWNS_HOUSE: 0.4200
Information Gain for CREDIT_RATING: 0.3630

Root Node:  ID
