In [113]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split


In [114]:
data = pd.read_csv('diabetes_data_upload.csv')

print("Columns:", data.columns.tolist())

TARGET = 'class'


Columns: ['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity', 'class']


In [115]:
# Binning numerical attributes (e.g., Age)
def bin_age(age):
    if age < 30:
        return '<30'
    elif age <= 50:
        return '30-50'
    else:
        return '>50'

# Apply binning
if 'Age' in data.columns:
    data['Age'] = data['Age'].apply(bin_age)

# If other numerical features exist, bin similarly if needed
data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,30-50,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,>50,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,30-50,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,30-50,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,>50,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


Split the df into a test and training set

In [116]:
#split here
X = data.drop(columns=[TARGET])
y = data[TARGET]

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: print shape to verify
print("Train set size:", X_train.shape)
print("Test set size:", X_test.shape)

Train set size: (416, 16)
Test set size: (104, 16)


Implement the entropy function

In [117]:
def entropy(column):
    counts = Counter(column)

    total = len(column)
    
    entropy_value = 0
    for count in counts.values():

        prob = count / total
        if prob > 0:
            entropy_value -= prob * np.log2(prob)
    
    return entropy_value


Implement the info gain function

In [118]:


def information_gain(data, split_attr, target_attr=TARGET):

       # print("Calculating information gain for attribute:", split_attr)
        
        total_entropy=entropy(data[target_attr])
        entropy_weigh=0

        for val in data[split_attr].unique():

                subset=data[data[split_attr]==val]  # subset has many other columns
                # subset=dataset[dataset[split_attr]==val][target_attr]  # subset has only target column
                
                weight=len(subset)/len(data)        # S_v/S
                entropy_weigh+=weight*entropy(subset[target_attr])
                
        info_gain=total_entropy-entropy_weigh
        return info_gain


Implement the ID3 algo

In [None]:

def id3(data, original_data, features, target_attr=TARGET, parent_node_class=None):

        if len(data) == 0:                
                return Counter(original_data[target_attr]).most_common(1)[0][0]
        
        elif len(data[target_attr].unique()) == 1:   # if all values in target column are same i.e all yes then we achieved the target
                return data[target_attr].iloc[0]
        
        elif len(features) == 0:
                return Counter(data[target_attr]).most_common(1)[0][0]
        else:
                parent_node_class = Counter(data[target_attr]).most_common(1)[0][0]
                
                info_gains = {feature: information_gain(data, feature) for feature in features}  # recursively calculate information gain for each feature
                #if outlook selectd first now in outlook if sunny selected then calculate information gain for humidit, wind and other features but only against sunny in outlook coloumn
                best_feature = max(info_gains, key=info_gains.get)
                
                tree = {best_feature: {}}
                
                for value in data[best_feature].unique():
                        subset = data[data[best_feature] == value]
                        new_features = [f for f in features if f != best_feature]  # excluding that coloumn of which we already computed information gain
                        subtree = id3(subset, data, new_features, target_attr, parent_node_class)
                        tree[best_feature][value] = subtree
                
        return tree


Prediction Function

In [120]:
def predict(Quer_Y, tree, default=None):

    if not isinstance(tree, dict):
        return tree
    
    attribute = list(tree.keys())[0]
    if attribute not in Quer_Y:
        return default
    
    value = Quer_Y[attribute]
    if value not in tree[attribute]:
        return default
    
    result = predict(Quer_Y, tree[attribute][value], default)
    return result

Implement the testing functiom

In [121]:
#testing function
def test(data, tree):
    predictions = []
    for i, row in data.iterrows():
        Quer_Y = row.drop(TARGET).to_dict()
        predictions.append(predict(Quer_Y, tree))
    
    actual = data[TARGET].tolist()
    correct = sum(1 for a, p in zip(actual, predictions) if a == p)
    
    accuracy = correct / len(actual)
    print(f"Accuracy: {accuracy:.4f} ({correct}/{len(actual)} correct)")
    
    classes = sorted(data[TARGET].unique())
    cm = [[0 for _ in classes] for _ in classes]
    
    for actual_class, pred_class in zip(actual, predictions):

        i = classes.index(actual_class)
        j = classes.index(pred_class) if pred_class in classes else 0
        cm[i][j] += 1
    
    print("\nConfusion Matrix:")
    print(" " * 10, end="")
    for c in classes:
        print(f"{c:>10}", end="")
    print()
    
    for i, row in enumerate(cm):
        print(f"{classes[i]:>10}", end="")
        for cell in row:
            print(f"{cell:>10}", end="")
        print()
    
    return accuracy
    


The rest of the stuff has been implemented for you

In [122]:
def print_tree(tree, indent=""):
    
    if not isinstance(tree, dict):
        print(indent + "->", tree)
        return
    for attr, branches in tree.items():
        for value, subtree in branches.items():
            print(indent + f"[{attr} = {value}]")
            print_tree(subtree, indent + "  ")


In [123]:

features = data.columns.tolist()
features.remove(TARGET)

tree = id3(data, data, features)

print_tree(tree)


test(data, tree)

example_Quer_Y = {col: data[col].iloc[0] for col in features}
print("Example prediction:", predict(example_Quer_Y, tree))


[Polyuria = No]
  [Gender = Male]
    [Polydipsia = Yes]
      [Irritability = No]
        [muscle stiffness = Yes]
          [visual blurring = No]
            -> Positive
          [visual blurring = Yes]
            -> Negative
        [muscle stiffness = No]
          [partial paresis = No]
            -> Positive
          [partial paresis = Yes]
            [Age = >50]
              -> Positive
            [Age = 30-50]
              -> Negative
      [Irritability = Yes]
        -> Positive
    [Polydipsia = No]
      [Irritability = No]
        [weakness = Yes]
          [Itching = No]
            [Alopecia = Yes]
              [sudden weight loss = No]
                -> Positive
              [sudden weight loss = Yes]
                -> Negative
            [Alopecia = No]
              -> Negative
          [Itching = Yes]
            [Alopecia = No]
              [Age = 30-50]
                -> Positive
              [Age = >50]
                -> Negative
            [Al