In [None]:
#Original 
import numpy as np
import math
import csv
def read_data(filename):
    """Read a comma-separated file into a header row and a list of data rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(metadata, traindata)`` tuple: ``metadata`` is the list of column
        names taken from the first row, ``traindata`` is the list of the
        remaining rows, each row being a list of strings.

    Raises:
        StopIteration: If the file has no rows at all (no header to read).
    """
    # newline='' hands line-ending handling to the csv module, so newlines
    # embedded inside quoted fields survive unchanged.
    with open(filename, 'r', newline='') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        metadata = list(next(datareader))
        traindata = list(datareader)

    return (metadata, traindata)
class Node:
    """One vertex of the decision tree.

    ``attribute`` is the value passed to the constructor, ``children`` starts
    as an empty list that the tree builder appends ``(value, child)`` pairs
    to, and ``answer`` starts as an empty string until a label is stored.
    """

    def __init__(self, attribute):
        self.answer = ""
        self.children = []
        self.attribute = attribute

    def __str__(self):
        # The printable form of a node is simply its attribute.
        return self.attribute
def subtables(data, col, delete):
    """Partition the rows of ``data`` by the distinct values in column ``col``.

    Args:
        data: 2-D NumPy array of rows.
        col: Index of the column whose values define the partition.
        delete: When true, column ``col`` is removed from every sub-table.

    Returns:
        ``(items, tables)`` where ``items`` is the array of distinct values
        found in the column (as returned by ``np.unique``) and ``tables``
        maps each of those values to the array of rows carrying it, in their
        original order and with the dtype of ``data``.
    """
    column = data[:, col]
    items = np.unique(column)
    tables = {}

    for item in items:
        # Boolean-mask selection keeps the input dtype; values are no longer
        # coerced into fixed-width 32-byte byte strings.
        rows = data[column == item]
        if delete:
            rows = np.delete(rows, col, 1)
        tables[item] = rows

    return items, tables
def entropy(S):
    """Return the base-2 Shannon entropy of the values in ``S``.

    Args:
        S: Array-like of class labels.

    Returns:
        The entropy as a plain ``float``; ``0.0`` when ``S`` is empty or
        contains a single distinct value.
    """
    S = np.asarray(S)
    _, counts = np.unique(S, return_counts=True)

    # Zero or one distinct value carries no uncertainty.
    if counts.size <= 1:
        return 0.0

    probabilities = counts / S.size
    # Collapse to a Python float so every branch returns the same type.
    return float(-np.sum(probabilities * np.log2(probabilities)))
def gain_ratio(data, col):
    """Return the gain ratio of splitting ``data`` on column ``col``.

    The gain ratio is the information gain of the split (entropy of the last
    column minus the size-weighted entropy of each sub-table's last column)
    divided by the split's intrinsic value.

    Args:
        data: 2-D NumPy array whose last column holds the class label.
        col: Index of the column to evaluate.

    Returns:
        The gain ratio as a ``float``. ``0.0`` is returned when the intrinsic
        value is zero (the column has a single distinct value, or ``data``
        has no rows), because such a split cannot separate anything.
    """
    items, tables = subtables(data, col, delete=False)
    total_size = data.shape[0]

    gain = entropy(data[:, -1])
    intrinsic = 0.0
    for item in items:
        subset = tables[item]
        ratio = subset.shape[0] / total_size
        gain = gain - ratio * entropy(subset[:, -1])
        intrinsic += ratio * math.log(ratio, 2)
    intrinsic_value = -intrinsic

    # Guard the division: a one-valued column gives log2(1) == 0.
    if intrinsic_value == 0:
        return 0.0

    # np.sum collapses a 1-element array (if entropy returned one) to a scalar.
    return float(np.sum(gain)) / intrinsic_value
def create_node(data, metadata):
    """Recursively build the decision (sub)tree for ``data``.

    Args:
        data: 2-D NumPy array; the last column holds the class label and
            every other column is a candidate attribute.
        metadata: Column names aligned with the columns of ``data``.

    Returns:
        The root ``Node`` of the tree. Leaves carry a class label in
        ``answer``; internal nodes carry the split column's name in
        ``attribute`` and ``(value, child)`` pairs in ``children``.
    """
    labels, label_counts = np.unique(data[:, -1], return_counts=True)

    # Pure subset: every row has the same label, so this is a leaf.
    if labels.shape[0] == 1:
        node = Node("")
        node.answer = labels[0]
        return node

    attribute_count = data.shape[1] - 1

    # Labels disagree but there is nothing left to split on: fall back to
    # the most frequent label (ties go to the first label in sorted order).
    if attribute_count == 0:
        node = Node("")
        node.answer = labels[np.argmax(label_counts)]
        return node

    gains = np.zeros((attribute_count, 1))
    for col in range(attribute_count):
        gains[col] = gain_ratio(data, col)

    split = np.argmax(gains)

    node = Node(metadata[split])
    # The chosen column is dropped from the sub-tables, so drop its name too.
    metadata = np.delete(metadata, split, 0)

    items, tables = subtables(data, split, delete=True)

    for item in items:
        child = create_node(tables[item], metadata)
        node.children.append((item, child))

    return node
def empty(size):
    """Return an indentation string of three spaces per level in ``size``."""
    return "".join("   " for _ in range(size))

def print_tree(node, level):
    """Print the tree rooted at ``node``, indented by ``level`` steps.

    A node with a non-empty ``answer`` is printed as a leaf. Otherwise its
    attribute is printed, followed by each branch value one step deeper and
    that branch's subtree two steps deeper.
    """
    pad = empty(level)
    if node.answer != "":
        print(pad, node.answer)
    else:
        print(pad, node.attribute)
        for branch_value, subtree in node.children:
            print(empty(level + 1), branch_value)
            print_tree(subtree, level + 2)
# Load the header row and the data rows from the training CSV.
metadata, traindata = read_data("tennis.csv")
# Turn the list of rows into a NumPy array, which create_node indexes by column.
data = np.array(traindata)
# Build the tree, then print it starting at indentation level 0.
node = create_node(data, metadata)
print_tree(node, 0)


 Outlook
    Overcast
       b'Yes'
    Rain
       Wind
          b'Strong'
             b'No'
          b'Weak'
             b'Yes'
    Sunny
       Humidity
          b'High'
             b'No'
          b'Normal'
             b'Yes'


In [None]:
#same but better structured and with comments
import numpy as np
import math
import csv

def read_data(filename):
    """Read a CSV file and return its header row and data rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(headers, traindata)`` tuple: the first row as a list of column
        names, and the remaining rows as a list of lists of strings.

    Raises:
        StopIteration: If the file has no rows at all (no header to read).
    """
    # newline='' hands line-ending handling to the csv module, so newlines
    # embedded inside quoted fields survive unchanged.
    with open(filename, 'r', newline='') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        headers = next(datareader)
        traindata = list(datareader)
    return headers, traindata

class Node:
    """Decision tree node.

    ``attribute`` is the constructor argument (empty string by default),
    ``children`` starts as an empty list that the tree builder fills with
    ``(value, child)`` pairs, and ``answer`` starts as an empty string.
    """

    def __init__(self, attribute=""):
        self.answer = ""
        self.children = []
        self.attribute = attribute

def split_data(data, col, delete=False):
    """Group the rows of ``data`` by the distinct values in column ``col``.

    Returns the distinct values together with a dict mapping each value to
    the rows that carry it; with ``delete`` set, column ``col`` is removed
    from every group.
    """
    column = data[:, col]
    unique_values = np.unique(column)

    def rows_for(value):
        # Keep only the rows whose split column equals this value.
        rows = data[column == value]
        return np.delete(rows, col, 1) if delete else rows

    return unique_values, {value: rows_for(value) for value in unique_values}

def entropy(labels):
    """Return the base-2 entropy of ``labels`` (0 when only one label occurs)."""
    distinct = np.unique(labels)
    if len(distinct) == 1:
        return 0

    n = len(labels)
    # Relative frequency of each distinct label, in np.unique order.
    probabilities = [np.sum(labels == candidate) / n for candidate in distinct]

    result = 0
    for p in probabilities:
        result += -p * math.log(p, 2)
    return result

def gain_ratio(data, col):
    """Calculate the gain ratio for splitting ``data`` on column ``col``.

    Args:
        data: 2-D NumPy array whose last column holds the class label.
        col: Index of the column to evaluate.

    Returns:
        Information gain divided by the split's intrinsic value. ``0.0`` is
        returned when the intrinsic value is zero (the column has a single
        distinct value, or ``data`` has no rows), since such a split cannot
        separate anything and the division would otherwise be 0/0.
    """
    values, subsets = split_data(data, col, delete=False)

    total_size = len(data)
    total_entropy = entropy(data[:, -1])

    # Calculate weighted entropy and intrinsic value
    weighted_entropy = 0
    intrinsic_value = 0

    for value in values:
        subset = subsets[value]
        ratio = len(subset) / total_size

        weighted_entropy += ratio * entropy(subset[:, -1])
        intrinsic_value += -ratio * math.log(ratio, 2)

    # A one-valued column has ratio == 1 and log2(1) == 0, so guard the division.
    if intrinsic_value == 0:
        return 0.0

    information_gain = total_entropy - weighted_entropy
    return information_gain / intrinsic_value

def build_tree(data, attributes):
    """Build the decision tree recursively.

    Args:
        data: 2-D NumPy array whose last column holds the class label.
        attributes: Column names aligned with the columns of ``data``; the
            last entry names the target column.

    Returns:
        The root ``Node``. Leaves carry a class label in ``answer``; internal
        nodes carry the split column's name in ``attribute`` and
        ``(value, child)`` pairs in ``children``.
    """
    labels = data[:, -1]
    unique_labels, label_counts = np.unique(labels, return_counts=True)

    # If all labels are the same, create leaf node
    if len(unique_labels) == 1:
        leaf = Node()
        leaf.answer = labels[0]
        return leaf

    feature_count = len(attributes) - 1  # Exclude target column

    # Labels disagree but no feature is left to split on: use the most
    # frequent label (ties go to the first label in sorted order) instead of
    # splitting on the target column itself.
    if feature_count < 1:
        leaf = Node()
        leaf.answer = unique_labels[np.argmax(label_counts)]
        return leaf

    # Find best attribute to split on
    best_gain = -1
    best_col = 0

    for col in range(feature_count):
        gain = gain_ratio(data, col)
        if gain > best_gain:
            best_gain = gain
            best_col = col

    # Create internal node
    node = Node(attributes[best_col])

    # Remove used attribute so names stay aligned with the reduced subsets
    remaining_attributes = np.delete(attributes, best_col)

    # Split data and create child nodes
    values, subsets = split_data(data, best_col, delete=True)

    for value in values:
        child = build_tree(subsets[value], remaining_attributes)
        node.children.append((value, child))

    return node

def print_tree(node, level=0):
    """Print the tree rooted at ``node``, indenting three spaces per level.

    A node with a truthy ``answer`` is printed as a leaf. Otherwise its
    attribute is printed, followed by each branch value and that branch's
    subtree two levels deeper.
    """
    pad = "   " * level

    # Leaf: print the answer and stop descending.
    if node.answer:
        print(f"{pad} {node.answer}")
        return

    print(f"{pad} {node.attribute}")
    for branch_value, subtree in node.children:
        print(f"{pad}    {branch_value}")
        print_tree(subtree, level + 2)

# Main execution: read the CSV, build the tree and print it.
# Header row and data rows come back as plain Python lists.
headers, traindata = read_data("tennis.csv")
# Convert the rows to a NumPy array so they can be sliced by column.
data = np.array(traindata)
# The headers are passed as an array; build_tree removes entries with np.delete.
tree = build_tree(data, np.array(headers))
print_tree(tree)

In [None]:
# Using sklearn to create a decision tree similar to ID3 (entropy criterion); shorter version
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder

# Load and prepare data
data = pd.read_csv("tennis.csv")
# .copy() gives X its own storage. Assigning encoded columns into a bare
# iloc slice of `data` is what triggers pandas' SettingWithCopyWarning.
X = data.iloc[:, :-1].copy()  # Features: every column except the last
y = data.iloc[:, -1]   # Target: the last column

# Encode categorical variables as integer codes, one encoder per column.
# The fitted encoders are kept so the codes can be mapped back to labels.
label_encoders = {}
for column in X.columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

# Encode target
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

# Create and train decision tree (using entropy like ID3)
tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
tree.fit(X, y)

# Print tree structure
tree_rules = export_text(tree, feature_names=list(X.columns))
print(tree_rules)


|--- Outlook <= 0.50
|   |--- class: 1
|--- Outlook >  0.50
|   |--- Humidity <= 0.50
|   |   |--- Outlook <= 1.50
|   |   |   |--- Wind <= 0.50
|   |   |   |   |--- class: 0
|   |   |   |--- Wind >  0.50
|   |   |   |   |--- class: 1
|   |   |--- Outlook >  1.50
|   |   |   |--- class: 0
|   |--- Humidity >  0.50
|   |   |--- Wind <= 0.50
|   |   |   |--- Temperature <= 1.00
|   |   |   |   |--- class: 0
|   |   |   |--- Temperature >  1.00
|   |   |   |   |--- class: 1
|   |   |--- Wind >  0.50
|   |   |   |--- class: 1


Feature importance: {'Outlook': 0.3626285821279446, 'Temperature': 0.15192946522259376, 'Humidity': 0.21123657918595443, 'Wind': 0.2742053734635072}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row