In [1]:
import math
import pandas as pd

# Toy training set: 14 records with four categorical attributes and the
# class label ('BuysComputer') in the last column. Written one record per
# line so individual examples are easy to read.
_COLUMNS = ['Age', 'Income', 'Student', 'Credit', 'BuysComputer']
_ROWS = [
    ('Young',       'High',   'No',  'Fair',      'No'),
    ('Young',       'High',   'No',  'Excellent', 'No'),
    ('Middle-aged', 'High',   'No',  'Fair',      'Yes'),
    ('Old',         'Medium', 'No',  'Fair',      'Yes'),
    ('Old',         'Low',    'Yes', 'Fair',      'Yes'),
    ('Old',         'Low',    'Yes', 'Excellent', 'No'),
    ('Middle-aged', 'Low',    'Yes', 'Excellent', 'Yes'),
    ('Young',       'Medium', 'No',  'Fair',      'No'),
    ('Young',       'Low',    'Yes', 'Fair',      'Yes'),
    ('Old',         'Medium', 'Yes', 'Fair',      'Yes'),
    ('Young',       'Medium', 'Yes', 'Excellent', 'Yes'),
    ('Middle-aged', 'Medium', 'No',  'Excellent', 'Yes'),
    ('Middle-aged', 'High',   'Yes', 'Fair',      'Yes'),
    ('Old',         'Medium', 'No',  'Excellent', 'No'),
]

# Column-oriented view of the same records: {column name: list of values}.
data = {
    column: [row[position] for row in _ROWS]
    for position, column in enumerate(_COLUMNS)
}
df = pd.DataFrame(data)

def entropy(target_col):
    """Return the Shannon entropy, in bits, of the labels in ``target_col``.

    Args:
        target_col: pandas Series of class labels.

    Returns:
        float: ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is
        each label's share of the counted labels. ``0.0`` when there is
        nothing to count (empty column or only missing values).
    """
    counts = target_col.value_counts()
    # value_counts() leaves out missing labels, so normalise by the number of
    # labels it actually counted. Dividing by len(target_col) would make the
    # probabilities sum to less than 1 whenever the column contains NaN/None.
    total = int(counts.sum())
    ent = 0.0
    for count in counts:
        p = count / total
        ent -= p * math.log2(p)
    return ent

def split_entropy(subsets, total_len):
    """Return the split information (in bits) of a partition.

    Args:
        subsets: Iterable of sized partitions (anything supporting ``len``).
        total_len: Number of rows in the unpartitioned data; each partition's
            weight is ``len(partition) / total_len``.

    Returns:
        ``-sum(w * log2(w))`` over partitions with a positive weight. The
        integer ``0`` is returned when no partition contributes.
    """
    info = 0
    for part in subsets:
        weight = len(part) / total_len
        # Empty partitions carry no weight and would make log2 fail.
        if not weight > 0:
            continue
        info = info - weight * math.log2(weight)
    return info

def gain_ratio(data, split_attribute, target_name):
    """Return the gain ratio obtained by splitting ``data`` on one attribute.

    Args:
        data: DataFrame containing both the attribute and the target column.
        split_attribute: Name of the column to partition on (one partition
            per distinct value).
        target_name: Name of the class-label column.

    Returns:
        Information gain divided by split information, or ``0`` when the
        split information is zero (for example, the attribute has a single
        value).
    """
    base_entropy = entropy(data[target_name])
    n_rows = len(data)
    column = data[split_attribute]

    # One partition per distinct attribute value, in order of first appearance.
    partitions = []
    conditional_entropy = 0
    for level in column.unique():
        part = data[column == level]
        partitions.append(part)
        share = len(part) / n_rows
        conditional_entropy += share * entropy(part[target_name])

    info_gain = base_entropy - conditional_entropy
    split_info = split_entropy(partitions, n_rows)

    # Guard the division: a zero split information carries no ranking signal.
    return info_gain / split_info if split_info != 0 else 0

def majority_class(data, target_name):
    """Return the most frequent value of ``data[target_name]``.

    When several labels tie, the first entry of ``Series.mode()`` is used.
    """
    modes = data[target_name].mode()
    return modes[0]

def c45(data, originaldata, features, target_attribute_name, parent_class=None):
    """Recursively build a multiway decision tree, choosing splits by gain ratio.

    Args:
        data: DataFrame with the rows that reached this node.
        originaldata: DataFrame whose majority class labels the leaf when
            ``data`` is empty. Recursive calls pass the parent node's rows.
        features: Column names still available for splitting.
        target_attribute_name: Name of the class-label column.
        parent_class: Majority class of the parent node. Accepted for
            backward compatibility; it does not decide any leaf label.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}``.
    """
    labels = data[target_attribute_name]

    # Pure node: every row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # No rows reached this node: fall back to the reference data's majority.
    if len(data) == 0:
        return majority_class(originaldata, target_attribute_name)

    node_class = majority_class(data, target_attribute_name)

    # No attribute left to split on: label the leaf with the majority class of
    # the rows at this node. Using the parent's majority here would ignore the
    # rows that actually reached the leaf and would yield None at the root.
    if len(features) == 0:
        return node_class

    # max() keeps the first feature among ties, matching list order.
    best_feature = max(
        features,
        key=lambda feature: gain_ratio(data, feature, target_attribute_name),
    )
    # The chosen attribute is consumed for the whole subtree below this node.
    remaining_features = [f for f in features if f != best_feature]

    branches = {}
    for value in data[best_feature].unique():
        sub_data = data[data[best_feature] == value]
        branches[value] = c45(
            sub_data, data, remaining_features, target_attribute_name, node_class
        )

    return {best_feature: branches}

# Every column except the last is a candidate split attribute; the last
# column holds the class label.
target = 'BuysComputer'
features = df.columns[:-1].tolist()
decision_tree = c45(
    data=df,
    originaldata=df,
    features=features,
    target_attribute_name=target,
)

# Print the learned tree as a nested dict under a heading line.
print("C4.5 Decision Tree:", decision_tree, sep="\n")


C4.5 Decision Tree:
{'Age': {'Young': {'Student': {'No': 'No', 'Yes': 'Yes'}}, 'Middle-aged': 'Yes', 'Old': {'Credit': {'Fair': 'Yes', 'Excellent': 'No'}}}}


In [2]:
import pandas as pd

# Toy training set: 14 records with four categorical attributes and the
# class label ('BuysComputer') as the final field of each record.
_HEADER = ('Age', 'Income', 'Student', 'Credit', 'BuysComputer')
_RECORDS = (
    ('Young',       'High',   'No',  'Fair',      'No'),
    ('Young',       'High',   'No',  'Excellent', 'No'),
    ('Middle-aged', 'High',   'No',  'Fair',      'Yes'),
    ('Old',         'Medium', 'No',  'Fair',      'Yes'),
    ('Old',         'Low',    'Yes', 'Fair',      'Yes'),
    ('Old',         'Low',    'Yes', 'Excellent', 'No'),
    ('Middle-aged', 'Low',    'Yes', 'Excellent', 'Yes'),
    ('Young',       'Medium', 'No',  'Fair',      'No'),
    ('Young',       'Low',    'Yes', 'Fair',      'Yes'),
    ('Old',         'Medium', 'Yes', 'Fair',      'Yes'),
    ('Young',       'Medium', 'Yes', 'Excellent', 'Yes'),
    ('Middle-aged', 'Medium', 'No',  'Excellent', 'Yes'),
    ('Middle-aged', 'High',   'Yes', 'Fair',      'Yes'),
    ('Old',         'Medium', 'No',  'Excellent', 'No'),
)

# Transpose the records into {column name: list of values}.
data = dict(zip(_HEADER, (list(values) for values in zip(*_RECORDS))))

df = pd.DataFrame(data)

def gini_impurity(target_col):
    """Return the Gini impurity of the labels in ``target_col``.

    Args:
        target_col: pandas Series of class labels.

    Returns:
        float: ``1 - sum(p ** 2)`` over the distinct labels, where ``p`` is
        each label's share of the counted labels. ``0.0`` when there is
        nothing to count (empty column or only missing values).
    """
    counts = target_col.value_counts()
    # value_counts() leaves out missing labels, so normalise by the number of
    # labels it actually counted rather than by len(target_col).
    total = int(counts.sum())
    if total == 0:
        # Nothing to be impure about; without this guard the result would be
        # the maximal value 1 for an empty column.
        return 0.0
    impurity = 1.0
    for count in counts:
        p = count / total
        impurity -= p ** 2
    return impurity

def weighted_gini(data, split_attribute, target_name):
    """Return the size-weighted Gini impurity of a split on one attribute.

    Args:
        data: DataFrame containing both the attribute and the target column.
        split_attribute: Name of the column to partition on (one partition
            per distinct value).
        target_name: Name of the class-label column.

    Returns:
        Sum over partitions of ``(partition size / total rows)`` times the
        partition's Gini impurity.
    """
    n_rows = len(data)
    column = data[split_attribute]
    score = 0
    for level in column.unique():
        branch = data[column == level]
        share = len(branch) / n_rows
        score += share * gini_impurity(branch[target_name])
    return score

def best_split(data, features, target_name):
    """Return the feature whose split has the lowest weighted Gini impurity.

    Args:
        data: DataFrame holding the candidate features and the target column.
        features: Iterable of candidate column names.
        target_name: Name of the class-label column.

    Returns:
        The first feature with the strictly lowest score below 1, or ``None``
        when no feature scores below 1 (including when ``features`` is empty).
    """
    winner, lowest = None, 1
    for candidate in features:
        score = weighted_gini(data, candidate, target_name)
        # Strict comparison keeps the earliest feature among ties.
        if score < lowest:
            winner, lowest = candidate, score
    return winner

def majority_class(data, target_name):
    """Return the most common label found in column ``target_name`` of ``data``.

    If more than one label is equally common, the first value reported by
    ``Series.mode()`` wins.
    """
    target_values = data[target_name]
    return target_values.mode()[0]

def cart(data, originaldata, features, target_name, parent_class=None):
    """Recursively build a multiway decision tree, choosing splits by Gini impurity.

    Args:
        data: DataFrame with the rows that reached this node.
        originaldata: DataFrame whose majority class labels the leaf when
            ``data`` is empty. Recursive calls pass the parent node's rows.
        features: Column names still available for splitting.
        target_name: Name of the class-label column.
        parent_class: Majority class of the parent node. Accepted for
            backward compatibility; it does not decide any leaf label.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}``.
    """
    labels = data[target_name]

    # Pure node: every row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # No rows reached this node: fall back to the reference data's majority.
    if len(data) == 0:
        return majority_class(originaldata, target_name)

    node_class = majority_class(data, target_name)

    # No attribute left to split on: label the leaf with the majority class of
    # the rows at this node. Using the parent's majority here would ignore the
    # rows that actually reached the leaf and would yield None at the root.
    if len(features) == 0:
        return node_class

    best_feature = best_split(data, features, target_name)
    # The chosen attribute is consumed for the whole subtree below this node.
    remaining_features = [f for f in features if f != best_feature]

    branches = {}
    for value in data[best_feature].unique():
        sub_data = data[data[best_feature] == value]
        branches[value] = cart(
            sub_data, data, remaining_features, target_name, node_class
        )

    return {best_feature: branches}

# Every column except the last is a candidate split attribute; the last
# column holds the class label.
target = 'BuysComputer'
features = df.columns[:-1].tolist()
decision_tree = cart(
    data=df,
    originaldata=df,
    features=features,
    target_name=target,
)

# Print the learned tree as a nested dict under a heading line.
print("CART Decision Tree:", decision_tree, sep="\n")


CART Decision Tree:
{'Age': {'Young': {'Student': {'No': 'No', 'Yes': 'Yes'}}, 'Middle-aged': 'Yes', 'Old': {'Credit': {'Fair': 'Yes', 'Excellent': 'No'}}}}
