In [36]:
import pandas as pd
import numpy as np
import math

df = pd.read_csv("lab11.csv")

# ID3

In [37]:
def entropy(data):
    labels = data['decision']
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    probabilities = label_counts / len(labels)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

def information_gain(data, attribute):
    total_entropy = entropy(data)
    attribute_values = data[attribute].unique()
    weighted_entropy = 0

    for value in attribute_values:
        subset = data[data[attribute] == value]
        weighted_entropy += (len(subset) / len(data)) * entropy(subset)

    gain = total_entropy - weighted_entropy
    return gain

def id3(data, features, target_attribute):
    if len(data['decision'].unique()) == 1:
        return data['decision'].iloc[0]
    
    if len(features) == 0:
        return data['decision'].value_counts().idxmax()
    
    information_gains = {f: information_gain(data, f) for f in features}
    best_feature = max(information_gains, key=information_gains.get)
    
    tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]
    
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        tree[best_feature][value] = id3(subset, remaining_features, target_attribute)
    
    return tree
def print_tree(tree, depth=0):
    if type(tree) == dict:
        for key, value in tree.items():
            print("  " * depth + f"{key}:")
            print_tree(value, depth + 1)
    else:
        print("  " * (depth + 1) + f"Class: {tree}")
def classify_id3(tree, sample):
    if isinstance(tree, str):
        return tree
    else:
        attribute = list(tree.keys())[0]
        attribute_value = sample[attribute]
        if attribute_value in tree[attribute]:
            return classify_id3(tree[attribute][attribute_value], sample)
        else:
            return "Unknown"

new_sample = {
    'outlook': 'sunny',
    'temp': 'medium',
    'humidity': 'medium',
    'wind': 'strong'
}
features = ['outlook', 'temp', 'humidity', 'wind']
target_attribute = 'decision'

decision_tree = id3(df, features, target_attribute)
print_tree(decision_tree)
result = classify_id3(decision_tree, new_sample)
print(f"The predicted class for the new sample is: {result}")

outlook:
  sunny:
    humidity:
      high:
          Class: no
      medium:
          Class: yes
  overcast:
      Class: yes
  rain:
    wind:
      weak:
          Class: yes
      strong:
          Class: no
The predicted class for the new sample is: yes


# C4.5

In [38]:
def entropy(data):
    labels = data['decision']
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    probabilities = label_counts / len(labels)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

def gain_ratio(data, attribute):
    total_entropy = entropy(data)
    attribute_values = data[attribute].unique()
    weighted_entropy = 0
    intrinsic_info = 0

    for value in attribute_values:
        subset = data[data[attribute] == value]
        weight = len(subset) / len(data)
        weighted_entropy += weight * entropy(subset)
        intrinsic_info += -weight * math.log2(weight)

    gain = total_entropy - weighted_entropy
    gain_ratio = gain / intrinsic_info if intrinsic_info != 0 else 0
    return gain_ratio

def c45(data, features, target_attribute):
    if len(data['decision'].unique()) == 1:
        return data['decision'].iloc[0]

    if len(features) == 0:
        return data['decision'].value_counts().idxmax()

    gain_ratios = {f: gain_ratio(data, f) for f in features}
    best_feature = max(gain_ratios, key=gain_ratios.get)

    tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]

    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        tree[best_feature][value] = c45(subset, remaining_features, target_attribute)

    return tree
def print_tree(tree, depth=0):
    if type(tree) == dict:
        for key, value in tree.items():
            print("  " * depth + f"{key}:")
            print_tree(value, depth + 1)
    else:
        print("  " * (depth + 1) + f"Class: {tree}")
def classify_c45(tree, sample):
    if isinstance(tree, dict):
        attribute = list(tree.keys())[0]
        attribute_value = sample[attribute]
        if attribute_value in tree[attribute]:
            return classify_c45(tree[attribute][attribute_value], sample)
        else:
            return "Unknown"
    else:
        return tree
features = ['outlook', 'temp', 'humidity', 'wind']
target_attribute = 'decision'
decision_tree = c45(df, features, target_attribute)
print_tree(decision_tree)
new_sample = {
    'outlook': 'sunny',
    'temp': 'medium',
    'humidity': 'medium',
    'wind': 'strong'
}
result = classify_c45(decision_tree, new_sample)
print(f"The predicted class for the new sample is: {result}")

outlook:
  sunny:
    humidity:
      high:
          Class: no
      medium:
          Class: yes
  overcast:
      Class: yes
  rain:
    wind:
      weak:
          Class: yes
      strong:
          Class: no
The predicted class for the new sample is: yes


# CART

In [39]:
import pandas as pd
import numpy as np

# Define the dataset
data = {
    'day': list(range(1, 15)),
    'outlook': ['sunny', 'sunny', 'overcast', 'rain', 'rain', 'rain', 'overcast', 'sunny', 'sunny', 'rain', 'sunny', 'overcast', 'overcast', 'rain'],
    'temp': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
    'humidity': ['high', 'high', 'medium', 'high', 'high', 'medium', 'low', 'high', 'medium', 'high', 'medium', 'high', 'medium', 'high'],
    'wind': ['weak', 'strong', 'weak', 'weak', 'weak', 'strong', 'strong', 'weak', 'weak', 'weak', 'strong', 'strong', 'weak', 'strong'],
    'decision': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

df = pd.DataFrame(data)

# Define a class for a Decision Tree Node
class DecisionTreeNode:
    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

# Define the CART algorithm for building a decision tree
def cart(X, y):
    n_samples, n_features = X.shape
    num_parent = [np.sum(y == 0), np.sum(y == 1)]  # Convert 'no' to 0 and 'yes' to 1
    gini = 1.0 - sum((n / n_samples) ** 2 for n in num_parent)

    best_gini = 0
    best_idx, best_thr = None, None

    for idx in range(n_features):
        thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
        num_left = [0, 0]
        num_right = num_parent.copy()

        for i in range(1, n_samples):
            c = classes[i - 1]
            num_left[c] += 1
            num_right[c] -= 1
            gini_left = 1.0 - sum(
                (num_left[x] / i) ** 2 for x in range(2)
            )
            gini_right = 1.0 - sum(
                (num_right[x] / (n_samples - i)) ** 2 for x in range(2)
            )
            gini = (i * gini_left + (n_samples - i) * gini_right) / n_samples
            if thresholds[i] == thresholds[i - 1]:
                continue
            if gini < best_gini:
                best_gini = gini
                best_idx = idx
                best_thr = (thresholds[i] + thresholds[i - 1]) / 2
    if best_gini == 0:
        return DecisionTreeNode(
            gini=best_gini,
            num_samples=n_samples,
            num_samples_per_class=num_parent,
            predicted_class=np.argmax(num_parent),
        )
    X_left, y_left, X_right, y_right = X[X[:, best_idx] <= best_thr], y[X[:, best_idx] <= best_thr], X[X[:, best_idx] > best_thr], y[X[:, best_idx] > best_thr]
    left = cart(X_left, y_left)
    right = cart(X_right, y_right)
    return DecisionTreeNode(
        gini=best_gini,
        num_samples=n_samples,
        num_samples_per_class=num_parent,
        predicted_class=np.argmax(num_parent),
        feature_index=best_idx,
        threshold=best_thr,
        left=left,
        right=right,
    )

# Prepare the data
X = df[['outlook', 'temp', 'humidity', 'wind']]
y = df['decision']
X = pd.get_dummies(X)  # One-hot encode categorical features

# Build the decision tree
X = X.to_numpy()
y = (y == 'yes').astype(int)  # Convert 'yes' to 1 and 'no' to 0
tree = cart(X, y)

# Define a function to make predictions using the tree
def predict_tree(node, x):
    if node.feature_index is None:
        return node.predicted_class
    if x[node.feature_index] <= node.threshold:
        if node.left is not None:
            return predict_tree(node.left, x)
    else:
        if node.right is not None:
            return predict_tree(node.right, x)

# Test the decision tree with a sample input
sample_input = np.array([0, 0, 0, 1])  # Sample input: ['outlook_overcast', 'temp_high', 'humidity_high', 'wind_strong']
prediction = predict_tree(tree, sample_input)

if prediction == 1:
    print("The decision is 'yes'")
else:
    print("The decision is 'no'")


The decision is 'no'
