In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

In [13]:
df = pd.read_csv(r'C:\CodingProjects\Mushroom\UCIrvine_Mushroom_Data\agaricus-lepiota.data', header=None, na_values = '?')

schema = ['Edible', 'Cap Shape', 'Cap Surface', 'Cap Color', 'Bruises?', 'Odor',
          'Gill Attachment', 'Gill Spacing', 'Gill Size', 'Gill Color', 'Stalk Shape', 'Stalk Root',
          'Stalk Surface Above Ring', 'Stalk Surface Below Ring', 'Stalk Color Above Ring', 'Stalk Color Below Ring',
          'Veil Type', 'Veil Color', 'Ring Number', 'Ring Type', 'Spore Print Color', 'Population', 'Habitat']
df.columns = schema

## Calculate the information gain of each attribute

##### Using the Scikit-learn library, the information gain for each attribute can be found.

In [14]:
# Prepare the target variable
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df['Edible'])

# Get the list of original features
original_features = schema[1:-1]

# Calculate mutual information for each feature individually
mi_scores = {}
for feature in original_features:
    # Convert categorical feature to numerical using LabelEncoder
    X_feature = df[feature].copy()
    le = LabelEncoder()
    X_feature_encoded = le.fit_transform(X_feature).reshape(-1, 1)
    
    # Calculate mutual information
    # Setting discrete_features=True is important for categorical variables
    mi = mutual_info_classif(X_feature_encoded, Y, discrete_features=True)[0]
    #divide by log(2) to adjust the log base to 2 instead of e
    mi_scores[feature] = mi / np.log(2)

# Sort features by mutual information score
sorted_mi = sorted(mi_scores.items(), key=lambda x: x[1], reverse=True)

# Print results
for feature, score in sorted_mi:
    print(f'Information Gain for {feature}: {score:.6f}')

Information Gain for Odor: 0.906075
Information Gain for Spore Print Color: 0.480705
Information Gain for Gill Color: 0.416978
Information Gain for Ring Type: 0.318022
Information Gain for Stalk Surface Above Ring: 0.284726
Information Gain for Stalk Surface Below Ring: 0.271894
Information Gain for Stalk Color Above Ring: 0.253845
Information Gain for Stalk Color Below Ring: 0.241416
Information Gain for Gill Size: 0.230154
Information Gain for Population: 0.201958
Information Gain for Bruises?: 0.192379
Information Gain for Stalk Root: 0.134818
Information Gain for Gill Spacing: 0.100883
Information Gain for Cap Shape: 0.048797
Information Gain for Ring Number: 0.038453
Information Gain for Cap Color: 0.036049
Information Gain for Cap Surface: 0.028590
Information Gain for Veil Color: 0.023817
Information Gain for Gill Attachment: 0.014165
Information Gain for Stalk Shape: 0.007517
Information Gain for Veil Type: 0.000000


##### The calculated information gain from each attribute matches that of the outputs of the manual approach in Mushroom.ipynb!

## Calculate the entropy of the parent set

##### Using Scikit-learn again, the entropy of the parent set can be calculated to further verify the output of the Mushroom.ipynb approach

In [None]:

clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X_feature_encoded, Y)

# Access the tree structure
tree_ = clf.tree_

# Let's say you want the entropy of the root node (index 0)
parent_node_index = 0

# Get the entropy
entropy_parent = tree_.impurity[parent_node_index]

print(f"Entropy at parent node {parent_node_index}: {entropy_parent}")

Entropy at parent node 0: 0.9990678968724604


##### The parent set entropy also matches that of Mushroom.ipynb! 