In [34]:
import pandas as pd
import numpy as np

# Load the dataset into a DataFrame.
# NOTE(review): the absolute "/content/..." path looks like a Google Colab
# upload location — confirm, and adjust it when running elsewhere.
# The driver code further down treats the "Outcome" column as the class label
# and every other column as a candidate split feature.
file_path = "/content/diabetes.csv"
df = pd.read_csv(file_path)

def entropy(df, col):
    """Return the Shannon entropy, in bits, of the labels stored in ``df[col]``.

    The class probabilities are the relative frequencies of every distinct
    label in the column, so the measure works for any label encoding
    (0/1, booleans, strings) and for more than two classes. For a two-class
    column this is the usual ``-p*log2(p) - q*log2(q)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the label column.
    col : hashable
        Name of the label column.

    Returns
    -------
    int or numpy.float64
        ``0`` when the column is empty or contains a single distinct label,
        otherwise the entropy in bits.
    """
    # dropna=False keeps rows with a missing label in the distribution
    # instead of silently shrinking the sample.
    proportions = df[col].value_counts(normalize=True, dropna=False)

    # Categorical columns report unobserved categories with frequency 0;
    # drop them so 0 * log2(0) never produces NaN.
    proportions = proportions[proportions > 0]

    if len(proportions) < 2:
        return 0
    return -(proportions * np.log2(proportions)).sum()

def information_gain(df, feature, target):
    """Return the information gain obtained by splitting ``df`` on ``feature``.

    The gain is the entropy of ``target`` over the whole frame minus the
    size-weighted average entropy of ``target`` inside each subset that shares
    one distinct value of ``feature``.

    Every distinct value of ``feature`` forms its own subset; no binning or
    threshold search is done, so a column with many distinct values (for
    example a continuous measurement) splits the data into many tiny subsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to evaluate.
    feature : hashable
        Column to split on.
    target : hashable
        Label column passed through to ``entropy``.

    Returns
    -------
    number
        ``entropy(df, target)`` minus the weighted subset entropy.
    """
    total_entropy = entropy(df, target)

    # The denominator is the same for every subset, so compute it once.
    n_rows = len(df)

    # A single groupby pass partitions the rows instead of re-filtering the
    # whole frame once per distinct value. Rows whose feature value is missing
    # belong to no group and therefore add nothing to the weighted sum, while
    # still counting towards n_rows. observed=True restricts categorical
    # columns to the values that actually occur.
    weighted_entropy = sum(
        (len(subset) / n_rows) * entropy(subset, target)
        for _, subset in df.groupby(feature, observed=True)
    )
    return total_entropy - weighted_entropy

def id3(df, features, target, verbose=True):
    """Recursively build an ID3 decision tree represented as nested dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows for the current node.
    features : list
        Column names still available for splitting at this node.
    target : hashable
        Name of the label column.
    verbose : bool, default True
        When true, print the information gain of every candidate feature at
        each internal node. Pass ``False`` to build the tree silently.

    Returns
    -------
    dict or scalar
        A leaf is a single label taken from ``df[target]``. An internal node
        is ``{feature: {value: subtree, ...}}`` with one entry per distinct
        value of the chosen feature.
    """
    # If all target values are the same, return that class
    if len(df[target].unique()) == 1:
        return df[target].iloc[0]

    # If no features left, return the most common target value
    if not features:
        return df[target].mode()[0]

    # Compute information gain for all features
    info_gains = {feature: information_gain(df, feature, target) for feature in features}

    # Report the gains only on request; the tree itself does not depend on it
    if verbose:
        print("Information Gain for each feature:")
        for feature, gain in info_gains.items():
            print(f"{feature}: {gain}")

    # Select the best feature (ties resolve to the first one in `features`)
    best_feature = max(info_gains, key=info_gains.get)
    tree = {best_feature: {}}

    # The features left for the children are the same for every branch
    remaining_features = [f for f in features if f != best_feature]

    # Recur for each unique value in the selected feature
    for value in df[best_feature].unique():
        sub_df = df[df[best_feature] == value].drop(columns=[best_feature])
        if sub_df.empty:
            # A missing value never equals itself, so its branch selects no
            # rows; fall back to the majority class of the current node.
            tree[best_feature][value] = df[target].mode()[0]
            continue
        tree[best_feature][value] = id3(sub_df, remaining_features, target, verbose=verbose)

    return tree

def print_tree(tree, indent=""):
    """Print a nested-dict decision tree, one line per node, branch or leaf.

    A non-dict ``tree`` is a leaf and is printed as ``→ <label>``. A dict maps
    a feature name to a mapping of branch values to subtrees; the feature is
    printed first, then every branch value followed by its subtree one level
    deeper.

    Parameters
    ----------
    tree : dict or scalar
        Tree (or subtree / leaf) to print.
    indent : str, default ""
        Prefix placed in front of every line printed at this level.
    """
    if isinstance(tree, dict):
        deeper_indent = indent + " │   "
        for feature_name, branches in tree.items():
            print(f"{indent}{feature_name!s}")
            for branch_value in branches:
                print(f"{indent} ├── {branch_value!s}")
                print_tree(branches[branch_value], deeper_indent)
    else:
        print(f"{indent}→ {tree!s}")

# Build the tree: every column except the "Outcome" label is a candidate feature
features = list(filter(lambda column: column != "Outcome", df.columns))
decision_tree = id3(df, features, target="Outcome")

# Show the resulting tree
print("Decision Tree:")
print_tree(decision_tree)

Information Gain for each feature:
Pregnancies: 0.0618253416801795
Glucose: 0.30420112715337566
BloodPressure: 0.05930957964167771
SkinThickness: 0.08166434463609218
Insulin: 0.2770945287640215
BMI: 0.3438106271268244
DiabetesPedigreeFunction: 0.6509177483835626
Age: 0.14094080665096176
Information Gain for each feature:
Pregnancies: 1.0
Glucose: 1.0
BloodPressure: 0.0
SkinThickness: 1.0
Insulin: 1.0
BMI: 1.0
Age: 1.0
Information Gain for each feature:
Pregnancies: 0.0
Glucose: 1.0
BloodPressure: 1.0
SkinThickness: 1.0
Insulin: 1.0
BMI: 1.0
Age: 1.0
Information Gain for each feature:
Pregnancies: 0.2516291673878229
Glucose: 0.9182958340544896
BloodPressure: 0.9182958340544896
SkinThickness: 0.9182958340544896
Insulin: 0.9182958340544896
BMI: 0.9182958340544896
Age: 0.9182958340544896
Information Gain for each feature:
Pregnancies: 0.8112781244591328
Glucose: 0.8112781244591328
BloodPressure: 0.8112781244591328
SkinThickness: 0.8112781244591328
Insulin: 0.8112781244591328
BMI: 0.8112781