In [6]:
import numpy as np
import pandas as pd
from math import log2

In [3]:
# Read the classification dataset from disk and preview its first rows.
CLASSIFICATION_CSV = 'classification_dataset.csv'
df = pd.read_csv(CLASSIFICATION_CSV)
df.head()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Target
0,C,Y,Medium,Yes,Class1
1,A,X,Low,No,Class1
2,A,X,Medium,Yes,Class2
3,C,X,High,No,Class1
4,B,X,Medium,No,Class1


In [4]:
# Print a summary of the frame: column names, non-null counts, dtypes and
# memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Feature1  15 non-null     object
 1   Feature2  15 non-null     object
 2   Feature3  15 non-null     object
 3   Feature4  15 non-null     object
 4   Target    15 non-null     object
dtypes: object(5)
memory usage: 728.0+ bytes


In [7]:
# Tally how often each class label occurs in the Target column and record the
# number of rows in the frame; the entropy cell below reads both names.
target_column = df['Target']
target_values = target_column.value_counts()
total = df.shape[0]

In [8]:
# Base-2 Shannon entropy of the class distribution: -sum(p * log2(p)), where
# p is each class count divided by the total number of rows.
class_probabilities = [count / total for count in target_values]
entropy = -sum(p * log2(p) for p in class_probabilities)
print(f"Initial Entropy: {entropy}")

Initial Entropy: 0.9967916319816366


In [9]:
# Information gain of every column except the last one: the initial entropy
# minus the size-weighted entropy of the Target labels found within each
# distinct value of the feature.
for feature in df.columns[:-1]:
    feature_entropy = 0
    for level in df[feature].unique():
        # Target labels of the rows where the feature takes this value.
        level_targets = df.loc[df[feature] == level, 'Target']
        n_rows = len(level_targets)
        proportions = [count / n_rows for count in level_targets.value_counts()]
        level_entropy = -sum(p * log2(p) for p in proportions)
        # Weight the partition's entropy by its share of all rows.
        feature_entropy += (n_rows / total) * level_entropy

    info_gain = entropy - feature_entropy
    print(f"Information Gain for {feature}: {info_gain}")

Information Gain for Feature1: 0.07724537168300172
Information Gain for Feature2: 0.31788806997878927
Information Gain for Feature3: 0.036366834877189946
Information Gain for Feature4: 0.0036851684989858136


In [10]:
# Pick the root of the decision tree: the feature (any column except the last)
# with the largest information gain relative to the initial entropy.
root_node = None
max_info_gain = -1

for feature in df.columns[:-1]:
    feature_entropy = 0
    for level in df[feature].unique():
        rows = df[df[feature] == level]
        n_rows = len(rows)
        class_counts = rows['Target'].value_counts()
        level_entropy = -sum(
            (count / n_rows) * log2(count / n_rows) for count in class_counts
        )
        # Weight the partition's entropy by its share of all rows.
        feature_entropy += (n_rows / total) * level_entropy

    info_gain = entropy - feature_entropy
    # Strict comparison: on a tie the earlier feature is kept.
    if info_gain > max_info_gain:
        max_info_gain, root_node = info_gain, feature

print(f"Root Node: {root_node}")

Root Node: Feature2


In [12]:
# Size-weighted Gini impurity of the Target labels after splitting on each
# column except the last: for every distinct feature value, 1 - sum(p^2) over
# the class proportions, weighted by that value's share of all rows.
for feature in df.columns[:-1]:
    gini = 0
    for level in df[feature].unique():
        level_targets = df.loc[df[feature] == level, 'Target']
        weight = len(level_targets) / total
        class_shares = level_targets.value_counts(normalize=True)
        impurity = 1 - sum(share ** 2 for share in class_shares)
        gini += weight * impurity

    print(f"Gini Index for {feature}: {gini}")

Gini Index for Feature1: 0.45714285714285724
Gini Index for Feature2: 0.3047619047619048
Gini Index for Feature3: 0.47301587301587306
Gini Index for Feature4: 0.49523809523809526


In [3]:
# Read the golf-players dataset and preview its first rows. This rebinds `df`,
# so from this cell on `df` no longer refers to the classification data.
GOLF_CSV = 'golf_players.csv'
df = pd.read_csv(GOLF_CSV)
df.head()

Unnamed: 0,Day,Outlook,Temp.,Humidity,Wind,Golf Players
0,1,Sunny,Hot,High,Weak,25
1,2,Sunny,Hot,High,Strong,30
2,3,Overcast,Hot,High,Weak,46
3,4,Rain,Mild,High,Weak,45
4,5,Rain,Cool,Normal,Weak,52


In [4]:
# Take the 'Golf Players' column as the target and record how many rows it
# has; the entropy cells below divide by `total`.
target_values = df.loc[:, 'Golf Players']
total = target_values.shape[0]

In [7]:
# Discretise the numeric 'Golf Players' count into two ordered categories so
# it can serve as a class label: counts up to 40 are 'Low', larger counts are
# 'High'.
bins = [0, 40, np.inf]
labels = ['Low', 'High']
# pd.cut builds right-closed intervals by default, i.e. (0, 40] and (40, inf],
# so a count of exactly 0 would fall outside every bin and become NaN.
# include_lowest=True closes the first interval on the left ([0, 40]) so such
# rows are labelled 'Low' instead of silently dropping out of the counts.
df['Golf Players Category'] = pd.cut(
    df['Golf Players'], bins=bins, labels=labels, include_lowest=True
)

In [8]:
# Base-2 Shannon entropy of the Low/High class distribution.
category_counts = df['Golf Players Category'].value_counts()
# value_counts() on a categorical column also reports categories that never
# occur, with a count of 0. log2(0) raises ValueError, so zero counts are
# skipped (they contribute nothing to the entropy anyway).
initial_entropy = -sum(
    (count / total) * log2(count / total)
    for count in category_counts
    if count > 0
)
print(f"Initial Entropy: {initial_entropy}")

Initial Entropy: 0.9852281360342516


In [11]:
# Pick the root of the decision tree for the golf data: print the information
# gain of every feature with respect to 'Golf Players Category' and keep the
# feature with the largest gain.
root_node = None
max_info_gain = -1

# Select features by name rather than by position: a positional slice of
# df.columns would silently start treating the numeric target (or any other
# derived column) as a feature as soon as the column layout changes.
non_feature_columns = {'Day', 'Golf Players', 'Golf Players Category'}
feature_columns = [col for col in df.columns if col not in non_feature_columns]

for feature in feature_columns:
    feature_entropy = 0
    for value in df[feature].unique():
        subset = df[df[feature] == value]
        subset_total = len(subset)
        subset_counts = subset['Golf Players Category'].value_counts()
        # The categorical value_counts() includes unobserved categories with a
        # count of 0; skip them because log2(0) is undefined.
        subset_entropy = -sum((count / subset_total) * log2(count / subset_total)
                      for count in subset_counts if count > 0)
        # Weight the partition's entropy by its share of all rows.
        feature_entropy += (subset_total / total) * subset_entropy

    info_gain = initial_entropy - feature_entropy
    print(f"Information Gain for {feature}: {info_gain}")

    # Strict comparison: on a tie the earlier feature is kept.
    if info_gain > max_info_gain:
        max_info_gain = info_gain
        root_node = feature

print(f"Root Node: {root_node}")

Information Gain for Outlook: 0.38062860412638344
Information Gain for Temp.: 0.02024420715375619
Information Gain for Humidity: 0.06105378373381032
Information Gain for Wind: 0.011265848648557286
Root Node: Outlook
