# ID3 Algorithm

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
df_tennis = pd.read_csv('lab3_data.csv')

In [2]:
attribute_names = list(df_tennis.columns)
attribute_names.remove('playtennis')
label="playtennis"
print(attribute_names)

['day', 'outlook', 'temperature', 'humidity', 'wind']


In [3]:
def calc_total_entropy(train_data, label, class_list):
    total_row = train_data.shape[0] 
    total_entr = 0
    
    for c in class_list: 
        total_class_count = train_data[train_data[label] == c].shape[0] 
        total_class_entr = - (total_class_count/total_row)*np.log2(total_class_count/total_row) 
        total_entr += total_class_entr 
    
    return total_entr

In [4]:
def calc_entropy(feature_value_data, label, class_list):
    class_count = feature_value_data.shape[0]
    entropy = 0
    
    for c in class_list:
        label_class_count = feature_value_data[feature_value_data[label] == c].shape[0]  
        entropy_class = 0
        if label_class_count != 0:
            probability_class = label_class_count/class_count 
            entropy_class = - probability_class * np.log2(probability_class)  
        entropy += entropy_class
    return entropy

In [5]:
def calc_info_gain(train_data,feature_name, label, class_list):
    feature_value_list = train_data[feature_name].unique() 
    total_row = train_data.shape[0]
    feature_info = 0.0
    
    for feature_value in feature_value_list:
        feature_value_data = train_data[train_data[feature_name] == feature_value] 
        feature_value_count = feature_value_data.shape[0]
        feature_value_entropy = calc_entropy(feature_value_data, label, class_list)
        feature_value_probability = feature_value_count/total_row
        feature_info += feature_value_probability * feature_value_entropy 
        
    return calc_total_entropy(train_data, label, class_list) - feature_info

In [12]:
def id3(df, target_attribute_name, attribute_names, default_class = None):
    from collections import Counter
    count = Counter(x for x in df[target_attribute_name])
    if len(count)==1:
        return next(iter(count))
    elif df.empty or (not attribute_names):
        return default_class
    else:
        class_list = df[label].unique()
        gain = [
            calc_info_gain(df, attr, target_attribute_name,class_list) for attr in attribute_names
        ]
        print()
        index_of_max = gain.index(max(gain))
        best_attr = attribute_names[index_of_max]
        
        tree = {best_attr:{}}
        
        remaining_attribute_names = [ i for i in attribute_names if i!= best_attr ]
        
        for attr_val, data_subset in df.groupby(best_attr):
                subtree = id3(data_subset, target_attribute_name, remaining_attribute_names, default_class)
                tree[best_attr][attr_val] = subtree
        
        return tree

tree = id3(df_tennis,'playtennis',attribute_names)
print("\n\nThe Resultant Decision Tree is:\n")
print(tree)










The Resultant Decision Tree is:

{'day': {1: 'no', 2: 'no', 3: 'yes', 4: 'yes', 5: 'yes', 6: 'no', 7: 'yes', 8: 'no', 9: 'yes', 10: 'yes', 11: 'yes', 12: 'yes', 13: 'yes', 14: 'no'}}


In [7]:
def predict(tree, instance):
    if not isinstance(tree, dict): 
        return tree 
    else:
        root_node = next(iter(tree)) 
        feature_value = instance[root_node] 
        if feature_value in tree[root_node]: 
            return predict(tree[root_node][feature_value], instance) 
        else:
            return None

In [8]:

def evaluate(tree, test_data_m, label):
      for index, row in test_data_m.iterrows(): 
        result = predict(tree, test_data_m.iloc[index]) 
        print(row)
        print("predicted value=",result)

test_data_m = pd.read_csv("id3_test.csv") 

evaluate(tree, test_data_m, 'playtennis') 

day              t1
outlook        rain
temp           cool
humidity     normal
wind         strong
Name: 0, dtype: object
predicted value= None
day             t2
outlook      sunny
temp          mild
humidity    normal
wind        strong
Name: 1, dtype: object
predicted value= None


In [14]:
import pandas as pd
import numpy as np
from pandas import DataFrame
df_tennis = pd.read_csv('lab3_data.csv')

attribute_names = list(df_tennis.columns)
attribute_names.remove('playtennis')
label="playtennis"
print(attribute_names)

def calc_total_entropy(train_data, label, class_list):
    total_row = train_data.shape[0] #the total size of the dataset
    total_entr = 0
    
    for c in class_list: #for each class in the label
        total_class_count = train_data[train_data[label] == c].shape[0] #number of the class
        total_class_entr = - (total_class_count/total_row)*np.log2(total_class_count/total_row) #entropy of the class
        total_entr += total_class_entr #adding the class entropy to the total entropy of the dataset
    
    return total_entr

def calc_entropy(feature_value_data, label, class_list):
    class_count = feature_value_data.shape[0]
    entropy = 0
    
    for c in class_list:
        label_class_count = feature_value_data[feature_value_data[label] == c].shape[0] #row count of class c 
        entropy_class = 0
        if label_class_count != 0:
            probability_class = label_class_count/class_count #probability of the class
            entropy_class = - probability_class * np.log2(probability_class)  #entropy
        entropy += entropy_class
    return entropy

def calc_info_gain(train_data,feature_name, label, class_list):
    feature_value_list = train_data[feature_name].unique() #unqiue values of the feature
    total_row = train_data.shape[0]
    feature_info = 0.0
    
    for feature_value in feature_value_list:
        feature_value_data = train_data[train_data[feature_name] == feature_value] #filtering rows with that feature_value
        feature_value_count = feature_value_data.shape[0]
        feature_value_entropy = calc_entropy(feature_value_data, label, class_list) #calculcating entropy for the feature value
        feature_value_probability = feature_value_count/total_row
        feature_info += feature_value_probability * feature_value_entropy #calculating information of the feature value
        
    return calc_total_entropy(train_data, label, class_list) - feature_info #calculating information gain by subtracting

def id3(df, target_attribute_name, attribute_names, default_class = None):
    from collections import Counter
    count = Counter(x for x in df[target_attribute_name])
    if len(count)==1:
        return next(iter(count))
    elif df.empty or (not attribute_names):
        return default_class
    else:
        default_class = max(count.keys())
        class_list = df[label].unique()
        gain = [
            calc_info_gain(df, attr, target_attribute_name,class_list) for attr in attribute_names
        ]
        print()
        index_of_max = gain.index(max(gain))
        best_attr = attribute_names[index_of_max]
        
        tree = {best_attr:{}}
        
        remaining_attribute_names = [ i for i in attribute_names if i!= best_attr ]
        
        for attr_val, data_subset in df.groupby(best_attr):
                subtree = id3(data_subset, target_attribute_name, remaining_attribute_names, default_class)
                tree[best_attr][attr_val] = subtree
        
        return tree

tree = id3(df_tennis,'playtennis',attribute_names)
print("\n\nThe Resultant Decision Tree is:\n")
print(tree)

['day', 'outlook', 'temperature', 'humidity', 'wind']



The Resultant Decision Tree is:

{'day': {1: 'no', 2: 'no', 3: 'yes', 4: 'yes', 5: 'yes', 6: 'no', 7: 'yes', 8: 'no', 9: 'yes', 10: 'yes', 11: 'yes', 12: 'yes', 13: 'yes', 14: 'no'}}
